-
Notifications
You must be signed in to change notification settings - Fork 2
/
model_training.py
121 lines (100 loc) · 4.79 KB
/
model_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Main model training source code.
!!!CAUTION!!! Because of jackknife (leave-one-out CV), this code is extremely slow — please run it on a server!
A server with 32 cores may take ~10 days to finish (SVM is the bottleneck).
"""
__author__ = "Xiangeng Wang"
from chemocommons import * # many useful functions, "reinvented" wheels, wrote by me!
import scipy.io as scio # load ".mat" file in
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier # Okay?
from sklearn.metrics import jaccard_similarity_score # for some calculation
from sklearn.model_selection import GridSearchCV  # used by every search below
from sklearn.model_selection import LeaveOneOut # jackknife, "socalled"
from sklearn.neural_network import MLPClassifier  # used below; was only available via the star import
from sklearn.svm import SVC  # used below; was only available via the star import
from skmultilearn.cluster import NetworkXLabelGraphClusterer # clusterer
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder # as it writes
from skmultilearn.ensemble import LabelSpacePartitioningClassifier # so?
from skmultilearn.problem_transform import ClassifierChain, LabelPowerset # sorry, we only used LP
# NOTE(review): XGBClassifier and dump() are also used below — presumably supplied by
# `from chemocommons import *` (xgboost / joblib); confirm, or import them explicitly.
data_dict = scio.loadmat("ATC_42_3883.mat")
X = data_dict['atc_fea'] # X is standardized [0, 1] no need for scale
Y = data_dict['atcClass']
Y[Y==-1] = 0 # why in matlab is -1,1; not 0,1?
loocv = LeaveOneOut() # jackknife
scoring_funcs = {"hamming loss": hamming_func,
"aiming": aiming_func,
"coverage": coverage_func,
"accuracy": accuracy_func,
"absolute true": absolute_true_func,
"absolute false":absolute_false_func
} # Keep recorded
parameters = {
'classifier': [LabelPowerset()],
'classifier__classifier': [ExtraTreesClassifier()],
'classifier__classifier__n_estimators': [100, 500],
'clusterer' : [
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
]
}
ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
scoring=scoring_funcs, verbose=0, refit="absolute true")
ext.fit(X.T, Y.T)
print(ext.best_score_)
parameters = {
'classifier': [LabelPowerset()],
'classifier__classifier': [RandomForestClassifier()],
'classifier__classifier__n_estimators': [500,1000],
'clusterer' : [
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
]
}
rf = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
scoring=scoring_funcs, verbose=0, refit="absolute true")
rf.fit(X.T, Y.T)
print(rf.best_score_)
parameters = {
'classifier': [LabelPowerset()],
'classifier__classifier': [SVC(probability=True)],
'classifier__classifier__C': [0.01, 0.1, 1, 10, 100],
'clusterer' : [
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
]
}
svm = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
scoring=scoring_funcs, verbose=0, refit="absolute true")
svm.fit(X.T, Y.T)
print(svm.best_score_)
parameters = {
'classifier': [LabelPowerset(), ClassifierChain()],
'classifier__classifier': [XGBClassifier()],
'classifier__classifier__n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
'clusterer' : [
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
]
}
xgb = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
scoring=scoring_funcs, verbose=0, refit="absolute true")
xgb.fit(X.T, Y.T)
print(xgb.best_score_)
parameters = {
'classifier': [LabelPowerset(), ClassifierChain()],
'classifier__classifier': [MLPClassifier()],
'classifier__classifier__hidden_layer_sizes': [50, 100, 200, 500, 1000],
'clusterer' : [
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
]
}
mlp = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv,
scoring=scoring_funcs, verbose=0, refit="absolute true")
mlp.fit(X.T, Y.T)
print(mlp.best_score_)
mytuple = (
ext,
rf,
svm,
xgb,
mlp,
)
to_save = dump(mytuple, filename="ensemble.joblib")