-
Notifications
You must be signed in to change notification settings - Fork 0
/
build-network.py
162 lines (110 loc) · 5.4 KB
/
build-network.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import argparse
import logging
import pickle
from itertools import combinations

import networkx as nx
from networkx import *

from patient_record import *
def _load_pickle(path):
    """Load and return one pickled object from *path*.

    Opened in binary mode: pickle data is binary, and text mode is a hard
    error on Python 3 (the original used mode "r").
    """
    with open(path, "rb") as infile:
        return pickle.load(infile)


def _group_patients_by_provider(records, provider_subset):
    """Map each in-subset prescriber to the set of their patient ids.

    Args:
        records: iterable of records with PrescriberId / PatientId attributes.
        provider_subset: collection of prescriber ids to keep.

    Returns:
        (grouped, unmatched): grouped is {prescriber_id: set(patient_ids)};
        unmatched is the set of prescriber ids not in provider_subset.
    """
    grouped = {}
    unmatched = set()
    for r in records:
        if r.PrescriberId in provider_subset:
            # NOTE: set.add on a per-key set, never set(r.PatientId) —
            # set(str) would explode the id string into single characters.
            grouped.setdefault(r.PrescriberId, set()).add(r.PatientId)
        else:
            unmatched.add(r.PrescriberId)
    return grouped, unmatched


def _build_graph(grouped, at_risk_patients):
    """Build an undirected graph linking patients who share a provider.

    Each patient node carries an `at_risk` boolean attribute; for every
    provider, all pairs of that provider's patients are connected.
    """
    g = nx.Graph()
    for patient_ids in grouped.values():
        for pid in patient_ids:
            g.add_node(pid, at_risk=pid in at_risk_patients)
        g.add_edges_from(combinations(patient_ids, 2))
    return g


def main():
    """Build a patient co-prescription network and pickle its largest component.

    Loads pre-computed pickles (at-risk patients, provider subset, structured
    dataset), groups patient ids by prescriber, links patients who share a
    prescriber, prunes isolated nodes, logs component statistics, and writes
    the largest connected component to ./computed-data/final-graph.pickle.
    """
    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)
    logging.info("begin script")

    at_risk_patients = _load_pickle("./computed-data/at-risk-patients.pickle")
    print("there are %i patients in this at risk object" % len(at_risk_patients))

    provider_subset = _load_pickle("./computed-data/subset-providers.pickle")
    print("there are %i providers in this subset: " % len(provider_subset))

    # Go back through the original full data and build {provider: set(patient_ids)}.
    structured_dataset = _load_pickle("./computed-data/structured-dataset.pickle")
    dictionary_for_graph, providers_not_matched = _group_patients_by_provider(
        structured_dataset, provider_subset)
    logging.info("finished with scanning data.")

    for k, v in dictionary_for_graph.items():
        logging.info("key is %s and set is %s ", k, v)
    logging.info("length of prescriber matches: %i ", len(dictionary_for_graph))
    logging.info("length of non-matches: %i ", len(providers_not_matched))

    g = _build_graph(dictionary_for_graph, at_risk_patients)

    print("number of nodes before pruning solitary: %i" % g.number_of_nodes())
    # Thank you to Dan Schult for the idea behind this pruning (updated method names)
    # https://groups.google.com/d/msg/networkx-discuss/XmP5wZhrDMI/tCPul0GI_LwJ
    # nx.isolates replaces the degree_iter() scan (removed in networkx 2.x).
    g.remove_nodes_from(list(nx.isolates(g)))
    print("number of nodes after pruning solitary: %i" % g.number_of_nodes())

    if nx.is_connected(g):
        logging.info("graph is connected")
    else:
        logging.info("graph is not connected")

    # connected_component_subgraphs was removed in networkx 2.4; work from
    # the component node sets instead.
    component_sizes = [len(c) for c in nx.connected_components(g)]
    if component_sizes:  # guard: empty graph would divide by zero
        logging.info("found %i total subgraphs, with average node count of %f ",
                     len(component_sizes),
                     float(sum(component_sizes)) / len(component_sizes))

    logging.info("getting largest connected component...")
    largest_nodes = max(nx.connected_components(g), key=len)
    G = g.subgraph(largest_nodes).copy()

    logging.info("pickling graph object...")
    with open("./computed-data/final-graph.pickle", "wb") as outfile:
        pickle.dump(G, outfile)
    logging.info("finished pickling graph object.")

    # this part was wrong before. Must use int not string if that's what I
    # built graph with in first place
    print("number of nodes: %i" % G.number_of_nodes())
    print("number of edges: %i" % G.number_of_edges())
    logging.info("end of script")
def parse_args(argv=None):
    """Parse command-line options.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:].
              (Backward-compatible addition — existing no-arg callers are
              unaffected — that makes the parser testable.)

    Returns:
        argparse.Namespace with `log` and `out` attributes.
    """
    parser = argparse.ArgumentParser()
    # fixed: the original help text had an unbalanced paren "(default=stderr"
    parser.add_argument('--log', default='/dev/stderr',
                        help='log file (default=stderr)')
    parser.add_argument('--out', default='/dev/stdout',
                        help='output. default=stdout')
    return parser.parse_args(argv)
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()