-
Notifications
You must be signed in to change notification settings - Fork 0
/
process.py
executable file
·368 lines (303 loc) · 11.9 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
# Adam Martini
# ml_suite
# 2-4-14
import csv
import numpy as np
import sys
from time import time
import datetime
import pylab as pl
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import ExtraTreesClassifier as ETC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.neural_network import BernoulliRBM
# sklearn utils
from sklearn.utils.extmath import density
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Binarizer
# from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as get_scores
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix as cm
from mlutils import mlutils as mlu
from mlutils.feature_reduction import reduce_features
from mlutils.boost import boost_data
### Logging Functionality ###
# Module-wide log file: opened in write mode as soon as this module is loaded,
# which truncates whatever a previous run left behind.
log_file = "process_log.txt"
print(' '.join(['Opening log file:', log_file]))
log = open(log_file, "w")
def lw(s):
    """Append the string `s`, followed by a newline, to the module-level log file."""
    line = s + '\n'
    log.write(line)
def output(s):
    """Record `s` in the log file (as text) and echo it to standard output."""
    text = str(s)
    lw(text)
    print(s)
# Stamp the freshly opened log with the time (minute resolution) at which it was created.
now = datetime.datetime.now()
opened_at = now.strftime("%Y-%m-%d %H:%M")
lw('Log Opened @ ' + opened_at + '\n')
def lrw():
    """
    Write a classifier description, its classification report and its confusion matrix to the log.

    NOTE(review): this reads `clfr`, `y_test` and `y_` as module-level names, and none of them is
    assigned anywhere in the visible part of this file -- confirm they are provided before calling.
    """
    gap = '\n\n'
    # classifier repr first, then the report, then the confusion matrix, each followed by spacing
    description = str(clfr) + '\n'
    lw(description)
    report = cr(y_test, y_)
    lw(report)
    lw(gap)
    matrix_text = str(cm(y_test, y_))
    lw(matrix_text)
    lw(gap)
# The handle opened in write mode has served its purpose (truncate the file and receive the
# "Log Opened" header): close it, then reopen in append mode so later writes follow the header.
log.close()
log = open(log_file, "a")
### Classification Model Processor ###
class CLFProcessor():
    """
    Benchmarks a fixed suite of scikit-learn classifiers on one training/testing split.

    Constructor arguments are stored unchanged as attributes of the same name:
        traindata, testdata      -- handed to mlu.load_csv to load the two data sets
        load_method              -- stored only; no method of this class reads it
        feature_type             -- 'binary' selects MultinomialNB, any other value GaussianNB
        boost, boost_amount      -- when boost is truthy the training data goes through boost_data
        rank_list, reduce_amount -- when rank_list is not None both data sets go through reduce_features
    `beta` (the F-measure beta) is fixed to 1.
    """

    def __init__(self, traindata, testdata, load_method, feature_type, boost, boost_amount,
                 rank_list, reduce_amount):
        self.traindata = traindata
        self.testdata = testdata
        self.load_method = load_method
        self.feature_type = feature_type
        self.boost = boost
        self.boost_amount = boost_amount
        self.rank_list = rank_list
        self.reduce_amount = reduce_amount
        self.beta = 1  # f_measure

    def process(self):
        """
        Fits and predicts using training and testing data for multiple classifiers. All results are
        printed to a log file and two graphs are created to show precision/recall/f_measure results
        and AUC.

        Returns:
            (results, y_test). `results` is a list of seven tuples, one per field returned by
            benchmark() (name, precision, recall, f-measure, train time, test time, probabilities),
            each holding one entry per classifier in benchmark order.
        """
        # load data
        X_train, y_train, X_test, y_test = self.clf_setup()

        # Naive Bayes flavour depends on the kind of features being classified
        if self.feature_type == 'binary':
            naive_bayes = MultinomialNB(alpha=.01)
        else:
            naive_bayes = GaussianNB()

        # (log label, estimator) pairs, benchmarked in this order.
        # The RBM/LogisticRegression pipeline and the class-weighted Perceptron are not part of
        # this suite (they were commented out).
        suite = [
            ("kNN", KNeighborsClassifier(n_neighbors=20)),
            # NearestCentroid without threshold
            ("NearestCentroid (aka Rocchio classifier)", NearestCentroid()),
        ]
        for penalty in ["l2"]:  # , "l1"]:
            suite.append(("LogisticRegression with %s penalty" % penalty.upper(),
                          LogisticRegression(penalty=penalty, dual=False, tol=.00000001)))
        suite.extend([
            ("Naive Bayes", naive_bayes),
            # label now names the kernel that is actually used
            ("SVC with rbf kernel", SVC(probability=True, kernel='rbf')),
            ("GradientBoostingClassifier with 100 estimators", GBC(n_estimators=100)),
            ("Random Forest Ensemble with 100 estimators using entropy",
             RFC(criterion='entropy', n_estimators=100)),
        ])

        # generate classification results
        results = []  # one benchmark() tuple per classifier
        for label, clf in suite:
            output(80 * '=')
            output(label)
            results.append(self.benchmark(clf, X_train, y_train, X_test, y_test))

        # rearrange results into sets; materialised as a list because plot_results indexes it
        results = list(zip(*results))

        # create P/R and AUC graphs
        self.plot_results(results, y_test)

        output("<<< FINISHED PROCESSING >>>")
        return results, y_test

    def clf_setup(self, shuffle=False):
        """
        Load, scale and optionally boost/reduce the training and testing data.

        Args:
            shuffle: forwarded to mlu.load_csv for both data sets.

        Returns:
            X_train, y_train, X_test, y_test
        """
        # load training and testing data from csv
        X_train, y_train = mlu.load_csv(self.traindata, shuffle)
        X_test, y_test = mlu.load_csv(self.testdata, shuffle)

        # perform feature scaling
        # NOTE(review): train and test are scaled by separate calls -- confirm that
        # mlu.scale_features is meant to be applied to each set independently.
        X_train = mlu.scale_features(X_train, 0.0, 1.0)
        X_test = mlu.scale_features(X_test, 0.0, 1.0)

        # boost positive instances in training data
        if self.boost:
            X_train, y_train = boost_data(X_train, y_train, self.boost_amount)

        # perform feature reduction based on provided rank list
        # NOTE(review): assumes reduce_features(X, rank_list, amount) returns the reduced matrix --
        # confirm against mlutils.feature_reduction.
        if self.rank_list is not None:
            X_train = reduce_features(X_train, self.rank_list, self.reduce_amount)
            X_test = reduce_features(X_test, self.rank_list, self.reduce_amount)

        # return data sets and features for classification
        return X_train, y_train, X_test, y_test

    def benchmark(self, clf, X_train, y_train, X_test, y_test):
        """
        Fit `clf` on the training data, predict the testing data and log timing and metrics.

        Returns:
            (clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time, test_time, proba)
            where the three scores are unweighted means over the per-class scores and `proba` is
            the predict_proba output, or None when the classifier could not provide one.
        """
        output(80 * '_')

        # fit
        output("Training:")
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        output("train time: %0.3fs" % train_time)

        # predict
        t0 = time()
        pred = clf.predict(X_test)
        try:
            proba = clf.predict_proba(X_test)
        except Exception:
            # best effort: not every classifier in the suite exposes probabilities
            proba = None
        test_time = time() - t0
        output("test time: %0.3fs" % test_time)

        # get metrics (beta passed by keyword so the call does not depend on argument position)
        p_scores, r_scores, f_measures, _ = get_scores(y_test, pred, beta=self.beta)
        p_score_avg = p_scores.mean()
        r_score_avg = r_scores.mean()
        f_measure_avg = f_measures.mean()
        output("precision: %0.3f \trecall: %0.3f" % (p_score_avg, r_score_avg))

        # output results
        output("Classification results:")
        output(cr(y_test, pred))
        output(cm(y_test, pred))

        clf_descr = str(clf).split('(')[0]  # get the name of the classifier from its repr()
        return clf_descr, p_score_avg, r_score_avg, f_measure_avg, train_time, test_time, proba

    def _graph_filename(self, prefix):
        """Build '<prefix>-R<reduced>-F<reduce_amount>[-B<boost_amount>].png' for a saved figure."""
        name = prefix + '-R' + str(self.rank_list is not None) + '-F' + str(self.reduce_amount)
        if self.boost:
            name += '-B' + str(self.boost_amount)
        return name + '.png'

    def plot_results(self, results, y_test, metrics_graph=True, auc_graph=True):
        """
        Save a precision/recall/F-measure bar chart and a precision-recall curve plot as PNG files.

        Args:
            results: the seven per-field sequences produced by process().
            y_test: true labels of the testing data, used for the precision-recall curves.
            metrics_graph: when true, save the 'PRT_graph-...' bar chart.
            auc_graph: when true, save the 'AUC_graph-...' curve plot.
        """
        # colors
        light_blue_bg = '#E6FAFF'
        light_green_bg = '#d6fdd6'
        green = '#197519'
        red = '#E60000'
        blue = '#3366FF'

        def autolabel(rects):
            # attach the numeric value just past the end of every horizontal bar
            for rect in rects:
                width = rect.get_width()
                pl.text(width + 0.025, rect.get_y() + .017, '%.2f' % width,
                        ha='center', va='bottom')

        # unpack results; the train/test times are not plotted
        clf_names, p_scores, r_scores, f_measure, _train_times, _test_times, probas = results

        # get the indices for the classifiers
        indices = np.arange(len(clf_names))

        fig = pl.figure(figsize=(31.0, 15.0))
        fig.set_facecolor(light_blue_bg)
        font = {'family': 'sans-serif',
                'style': 'normal',
                'weight': 'semibold',
                'size': 16}
        pl.rc('font', **font)

        # create precision and recall bar chart
        if metrics_graph:
            title = pl.title("Overall Classifier Performance")
            title.set_y(1.03)
            rects = pl.barh(indices + .3, f_measure, .15, label=r'$\bf F_{1}$' + ' measure', color=green)
            autolabel(rects)
            rects = pl.barh(indices + .15, r_scores, .15, label="recall", color=red)
            autolabel(rects)
            rects = pl.barh(indices, p_scores, .15, label="precision", color=blue)
            autolabel(rects)
            pl.yticks(())
            pl.legend(loc='best', prop={'size': 15})
            pl.subplots_adjust(left=.25)

            # add classifier names
            for i, c in zip(indices, clf_names):
                pl.text(-.3, i, c)

            print("Saving Metrics Plot...")
            pl.savefig(self._graph_filename('PRT_graph'),
                       facecolor=fig.get_facecolor(), edgecolor='none')

        # create Proba AUC Graph
        if auc_graph:
            # `is not None` on purpose: comparing a numpy array with != is elementwise
            prob_results = [(name, proba) for name, proba in zip(clf_names, probas)
                            if proba is not None]

            pl.clf()  # clear figure
            fig = pl.figure(figsize=(31.0, 15.0))
            title = pl.title("Precision and Recall Curves")
            title.set_y(1.03)
            pl.xlabel('Recall')
            pl.ylabel('Precision')
            pl.ylim([0.0, 1.05])
            pl.xlim([0.0, 1.0])
            fig.set_facecolor(light_green_bg)
            font = {'family': 'sans-serif',
                    'style': 'normal',
                    'weight': 'semibold',
                    'size': 22}
            pl.rc('font', **font)

            for name, proba in prob_results:
                if name != 'GaussianNB':  # avoid poor threshold graphing for GNB
                    # column 1 of predict_proba is used as the score for the curve
                    precision, recall, _ = precision_recall_curve(y_test, proba[:, 1])
                    area = auc(recall, precision)
                    print('%s %s' % (name, area))
                    pl.plot(recall, precision, label=(name + " : AUC = " + str(area)), lw=1.3)
            pl.legend(loc='best', prop={'size': 18})

            print("Saving AUC Plot...")
            pl.savefig(self._graph_filename('AUC_graph'),
                       facecolor=fig.get_facecolor(), edgecolor='none')
def main(traindata, testdata, load_method="csv", feature_type='continuous', boost=False, boost_amount=0,
         rank_list=None, reduce_amount=None):
    """
    Run the full classifier benchmark for one training/testing pair and log the outcome.

    All arguments are forwarded unchanged to CLFProcessor; boost, boost_amount, rank_list,
    reduce_amount and feature_type are also echoed to the log as the run's parameters.

    The module-level `log` handle is the one output()/lw() write to, so it is that handle which is
    (re)opened here when needed and closed when processing ends, including when it ends with an
    exception. Assigning to a function-local `log` would leave the real handle untouched.
    """
    global log

    # open log (only needed when a previous run already closed it)
    if log.closed:
        log = open(log_file, "a")

    try:
        output("<<< BEGIN PROCESSING >>>\n")
        shown = (boost, boost_amount, rank_list, reduce_amount, feature_type)
        start_str = "Parameters: " + ' '.join(str(value) for value in shown) + '\n'
        output(start_str)

        # process
        processor = CLFProcessor(traindata, testdata, load_method, feature_type, boost, boost_amount,
                                 rank_list, reduce_amount)
        processor.process()
    finally:
        # close log
        log.close()
if __name__ == '__main__':
    # Everything after the script name is forwarded positionally to main().
    # NOTE(review): command-line values are strings, so an argument such as `boost` arrives as
    # text (the string 'False' is truthy) -- confirm callers account for this.
    main(*sys.argv[1:])