-
Notifications
You must be signed in to change notification settings - Fork 0
/
wl.py
167 lines (146 loc) · 6.36 KB
/
wl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""Weisfeiler_Lehman graph kernel.
Python implementation based on: "Weisfeiler-Lehman Graph Kernels", by:
Nino Shervashidze, Pascal Schweitzer, Erik J. van Leeuwen, Kurt
Mehlhorn, Karsten M. Borgwardt, JMLR, 2012.
http://jmlr.csail.mit.edu/papers/v12/shervashidze11a.html
Author : Sandro Vega-Pons, Emanuele Olivetti
"""
import numpy as np
import networkx as nx
import copy
class GK_WL():
"""
Weisfeiler_Lehman graph kernel.
"""
def compare_list(self, graph_list, h=1, node_label=True):
"""Compute the all-pairs kernel values for a list of graphs.
This function can be used to directly compute the kernel
matrix for a list of graphs. The direct computation of the
kernel matrix is faster than the computation of all individual
pairwise kernel values.
Parameters
----------
graph_list: list
A list of graphs (list of networkx graphs)
h : interger
Number of iterations.
node_label : boolean
Whether to use original node labels. True for using node labels
saved in the attribute 'node_label'. False for using the node
degree of each node as node attribute.
Return
------
K: numpy.array, shape = (len(graph_list), len(graph_list))
The similarity matrix of all graphs in graph_list.
"""
self.graphs = graph_list
n = len(graph_list)
lists = [0] * n
k = [0] * (h + 1)
n_nodes = 0
n_max = 0
# Compute adjacency lists and n_nodes, the total number of
# nodes in the dataset.
for i in range(n):
lists[i] = graph_list[i].adjacency_list()
n_nodes = n_nodes + graph_list[i].number_of_nodes()
# Computing the maximum number of nodes in the graphs. It
# will be used in the computation of vectorial
# representation.
if(n_max < graph_list[i].number_of_nodes()):
n_max = graph_list[i].number_of_nodes()
phi = np.zeros((n_max, n), dtype=np.uint64)
# INITIALIZATION: initialize the nodes labels for each graph
# with their labels or with degrees (for unlabeled graphs)
labels = [0] * n
label_lookup = {}
label_counter = 0
# label_lookup is an associative array, which will contain the
# mapping from multiset labels (strings) to short labels
# (integers)
if node_label is True:
for i in range(n):
l_aux = list(nx.get_node_attributes(graph_list[i],
'attr_name').values())
# It is assumed that the graph has an attribute
# 'node_label'
labels[i] = np.zeros(len(l_aux), dtype=np.int32)
for j in range(len(l_aux)):
if not (l_aux[j] in label_lookup):
label_lookup[l_aux[j]] = label_counter
labels[i][j] = label_counter
label_counter += 1
else:
labels[i][j] = label_lookup[l_aux[j]]
# labels are associated to a natural number
# starting with 0.
phi[labels[i][j], i] += 1
else:
for i in range(n):
labels[i] = np.array(graph_list[i].degree().values())
for j in range(len(labels[i])):
phi[labels[i][j], i] += 1
# Simplified vectorial representation of graphs (just taking
# the vectors before the kernel iterations), i.e., it is just
# the original nodes degree.
self.vectors = np.copy(phi.transpose())
k = np.dot(phi.transpose(), phi)
# MAIN LOOP
it = 0
new_labels = copy.deepcopy(labels)
while it < h:
# create an empty lookup table
label_lookup = {}
label_counter = 0
phi = np.zeros((n_nodes, n), dtype=np.uint64)
for i in range(n):
for v in range(len(lists[i])):
# form a multiset label of the node v of the i'th graph
# and convert it to a string
long_label = np.concatenate((np.array([labels[i][v]]),
np.sort(labels[i]
[lists[i][v]])))
long_label_string = str(long_label)
# if the multiset label has not yet occurred, add it to the
# lookup table and assign a number to it
if not (long_label_string in label_lookup):
label_lookup[long_label_string] = label_counter
new_labels[i][v] = label_counter
label_counter += 1
else:
new_labels[i][v] = label_lookup[long_label_string]
# fill the column for i'th graph in phi
aux = np.bincount(new_labels[i])
phi[new_labels[i], i]=np.add(phi[new_labels[i], i], aux[new_labels[i]], out=phi[new_labels[i], i], casting="unsafe")
#phi[new_labels[i], i] += aux[new_labels[i]]
k += np.dot(phi.transpose(), phi)
labels = copy.deepcopy(new_labels)
it = it + 1
# Compute the normalized version of the kernel
k_norm = np.zeros(k.shape)
for i in range(k.shape[0]):
for j in range(k.shape[1]):
k_norm[i, j] = k[i, j] / np.sqrt(k[i, i] * k[j, j])
return k_norm
def compare(self, g_1, g_2, h=1, node_label=True):
"""Compute the kernel value (similarity) between two graphs.
The kernel is normalized to [0,1] by the equation:
k_norm(g1, g2) = k(g1, g2) / sqrt(k(g1,g1) * k(g2,g2))
Parameters
----------
g_1 : networkx.Graph
First graph.
g_2 : networkx.Graph
Second graph.
h : interger
Number of iterations.
node_label : boolean
Whether to use the values under the graph attribute 'node_label'
as node labels. If False, the degree of the nodes are used as
labels.
Returns
-------
k : The similarity value between g1 and g2.
"""
gl = [g_1, g_2]
return self.compare_list(gl, h, node_label)[0, 1]