import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
# Load the rainfall observations from the Excel workbook in the working directory.
df = pd.read_excel('Rainfall.xlsx')
# Bare expressions such as the next two only show output in an interactive
# (notebook / REPL) session; they have no effect when run as a script.
df
df.describe()

# Keep the 15 weather measurements plus the two rain flags.
# .copy() makes df1 an independent frame: writing into a plain selection of
# df would otherwise trigger pandas' SettingWithCopyWarning.
df1 = df[[
    'MinTemp', 'MaxTemp', 'Evaporation',
    'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
    'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow',
]].copy()
df1

# One histogram per numeric column, drawn on a single large figure.
df1.hist(figsize=(30, 20))
# Disabled plotting code, kept as a bare string literal so it never executes.
# It drew one plt.hist figure (30 bins) per column of df1.
# NOTE: `column_name` is assigned only inside this literal, so this statement
# does not bind that name at runtime.
'''
column_name = df1.columns
n = 0
for e in column_name:
n = n + 1
plt.figure(n)
plt.hist(df1[e], bins = 30)
plt.legend([e])
plt.show()
'''
# Second disabled snippet, also a bare string literal with no runtime effect.
# It reports 'RainTomorrow' / 'RainToday' as categorical and uses sns.distplot
# for per-column distribution plots.
# NOTE(review): the original indentation is lost, so the exact extent of the
# if/else branches cannot be recovered from this text.
'''
n = 0
for e in column_name:
if e == 'RainTomorrow' or e == 'RainToday':
print (e + '\tis categorical data')
else:
print (e)
n = n + 1
plt.figure(n)
sns.distplot(df1[e])
plt.legend([e])
plt.show()
'''
# Pairwise plots for the columns at positions 0-7, coloured by the
# RainTomorrow label, which sits at position 16 of df1.
first_half_positions = [*range(8), 16]
sns.pairplot(df1.iloc[:, first_half_positions], hue='RainTomorrow')
# There seems to be some separation between the two classes (either 'yes' or 'no') for the variables 'Sunshine' and 'Humidity9am'.
# Pairwise plots for the columns at positions 8-14, again coloured by the
# RainTomorrow label at position 16 of df1.
second_half_positions = [*range(8, 15), 16]
sns.pairplot(df1.iloc[:, second_half_positions], hue='RainTomorrow')
# There seems to be some separation between the two classes (either 'yes' or 'no') for the variables 'Cloud9am' and 'Cloud3pm'.
# Correlation heat map of the numeric columns.
# df1 may still hold the text columns RainToday / RainTomorrow at this point;
# recent pandas versions raise when corr() meets non-numeric data, so restrict
# the frame to numeric dtypes first.
sns.heatmap(df1.select_dtypes(include='number').corr(), annot=False)

# Number of missing values per column, shown as a bar chart and as a table.
x = df1.isnull().sum()
x.plot(kind='bar', title='Missing values in each variables')
x
# Mean imputation: replace every NaN in a numeric column with that column's
# mean (NaNs are skipped when the mean is computed).
#
# This replaces a cell-by-cell loop that iterated over `column_name`, a name
# that is never bound at runtime, and that recomputed the column mean for
# every row. Non-numeric columns have no mean and are left untouched.
column_means = df1.mean(numeric_only=True)
# fillna with a Series fills each column from the entry carrying its label;
# columns absent from column_means are not filled.
df1 = df1.fillna(column_means)

# Remaining missing values per column (expected to be 0 for numeric columns).
df1.isnull().sum()
# No more missing values now - rejoice!
# Binary-encode both rain flags: 'No' becomes 0, every other value becomes 1.
df1 = df1.assign(
    RainToday_No=df1['RainToday'].ne('No').astype('int64'),
    RainTomorrow_encode=df1['RainTomorrow'].ne('No').astype('int64'),
)
df1

# The text versions are no longer needed once the encoded columns exist.
df1 = df1.drop(columns=['RainToday', 'RainTomorrow'])
df1

# Target: the encoded RainTomorrow flag, kept as a single-column frame.
y = df1.loc[:, ['RainTomorrow_encode']]
# Inputs: the first 16 columns by position, i.e. everything before the target.
X = df1.iloc[:, :16]
y.head()
X.head()

# 70/30 split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)
X_train.head()
y_train.head()
from sklearn import svm

# First model: a support-vector classifier with all default settings.
m = svm.SVC()
# y_train is a single-column frame; flatten it to the 1-D label array.
train_labels = y_train.to_numpy().ravel()
m.fit(X_train, train_labels)
# The default SVC model uses the RBF kernel.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Predict the held-out rows once and reuse the result below, instead of
# calling m.predict(X_test) again for every metric.
test_predictions = m.predict(X_test)
test_predictions

# Score of the default model on the test split.
m.score(X_test, y_test)
# Show the documentation of score(). help() replaces the IPython-only
# `m.score?` form, which is a syntax error in a plain Python file.
help(m.score)

# Confusion matrix with the label order fixed to [0, 1].
confusion_matrix(y_test, test_predictions, labels=[0, 1])

# Predictions on the training rows, for comparison with the test predictions.
m.predict(X_train)

# Text summary of the per-class metrics on the test split.
predictions = test_predictions.astype("int32")
print(classification_report(y_test, predictions))
# Second model: the same classifier with a linear kernel.
m1 = svm.SVC(kernel='linear')
linear_train_labels = y_train.to_numpy().ravel()
m1.fit(X_train, linear_train_labels)
m1.score(X_test, y_test)

# Predict the test rows once; the confusion matrix and the report share it.
linear_test_predictions = m1.predict(X_test)
confusion_matrix(y_test, linear_test_predictions, labels=[0, 1])
predictions = linear_test_predictions.astype("int32")
linear_report = classification_report(y_test, predictions)
print(linear_report)