-
Notifications
You must be signed in to change notification settings - Fork 1
/
classification.py
126 lines (96 loc) · 3.45 KB
/
classification.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# matplotlib notebook
import numpy as np
from pandas import read_csv
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.cross_validation import train_test_split, StratifiedKFold
import matplotlib.pyplot as plt
# Read Default.csv; the first CSV column becomes the row index
df = read_csv("../python-data-science-exercise/data/Default.csv", index_col=0)
# Peek at the leading rows (only rendered in an interactive session)
df.head()
# Inspect the (rows, columns) shape (only rendered in an interactive session)
df.shape
# Encode every "Yes"/"No" string in the frame as 1/0
df = df.replace("Yes", 1).replace("No", 0)
# Hold out a test_size=0.1 share of the rows for testing
df_train, df_test = train_test_split(df, test_size=0.1)
# Predictor columns fed to the classifier
feature_cols = ["student", "balance", "income"]
# Separate each subset into its feature columns and the "default" target
X_train, Y_train = df_train[feature_cols], df_train["default"]
X_test, Y_test = df_test[feature_cols], df_test["default"]
# Train classifier
# NOTE: every print below takes one parenthesised argument, so the same
# lines behave identically under Python 2 and Python 3.
print("Training classifier...")
gnb = GaussianNB()
# Keep the value returned by fit() as clf; it is used for all predictions below
clf = gnb.fit(X_train, Y_train)
print("Training done!")
# Test classifier
print("Testing classifier...")
Y_pred = clf.predict(X_test)
print(Y_pred)
# Count the test rows whose predicted label differs from the true label
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (Y_test != Y_pred).sum()))
# Display confusion matrix (true labels first, predictions second)
print(confusion_matrix(Y_test, Y_pred))
# Display a bar chart of how often each true class label occurs in the test set
barwidth = 0.5
# Derive the bar positions from the bincount result itself, so positions and
# heights always have the same length (a label with no samples simply gets a
# zero-height bar instead of causing a length mismatch)
class_counts = np.bincount(Y_test)
index = np.arange(len(class_counts))
plt.figure()
# Centre the bars explicitly: the default alignment differs between
# matplotlib versions, which would otherwise push the ticks off-centre
plt.bar(index, class_counts, barwidth, align='center')
plt.xlabel('Class')
plt.ylabel('Frequency')
# One tick per class, labelled with the class value
plt.xticks(index, [str(k) for k in index])
plt.show()
# Text summary of the predictions against the true labels;
# single-argument print works on Python 2 and 3
print(classification_report(Y_test, Y_pred))
# Per-class probability estimates for every test sample
Y_score = clf.predict_proba(X_test)
# ROC points are computed from the probability column at index 1,
# treating label 1 as the positive class
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    Y_test, Y_score[:, 1], pos_label=1)
# Area under the ROC curve, shown in the legend
roc_auc = auc(false_positive_rate, true_positive_rate)
# Draw the ROC curve in blue together with the dashed red diagonal
plt.figure()
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(loc='lower right')
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Leave some margin around the unit square
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.show()
# Configure 6-fold stratified cross validation on the "default" target column
cv = StratifiedKFold(df.default, n_folds=6)
# Convert features and target to arrays once; they do not change between folds
features = df[feature_cols].values
targets = df.default.values
# Create new plot that collects one ROC curve per fold
plt.figure()
# Each fold yields the row positions of its training and test subsets
for train, test in cv:
    # Slice out this fold's training and test data
    X_train, Y_train = features[train, :], targets[train]
    X_test, Y_test = features[test, :], targets[test]
    # Fit a fresh classifier on this fold's training rows only
    clf = GaussianNB()
    clf.fit(X_train, Y_train)
    # Get sample scores for ROC curve
    Y_score = clf.predict_proba(X_test)
    # Compute ROC metrics from the probability column at index 1
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        Y_test, Y_score[:, 1], pos_label=1)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    # Add this fold's ROC curve, labelled with its AUC
    plt.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f' % roc_auc)
# Shared decorations are added once, after all folds are drawn, so the
# legend lists every fold's AUC
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()