-
Notifications
You must be signed in to change notification settings - Fork 7
/
RNTN_utils.py
101 lines (80 loc) · 2.26 KB
/
RNTN_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 24 17:55:03 2015
"""
import ast
import re

import numpy as np
def load_vocabulary(fn):
    """Read a vocabulary file and return its lines as a list.

    Parameters
    ----------
    fn : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The lines of the file in file order, without line terminators.
        An empty file yields an empty list.
    """
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(fn) as f:
        return f.read().splitlines()
def extract_vocabulary(fn):
    """Collect the distinct leaf words found in a file of bracketed trees.

    Every line of ``fn`` is scanned for innermost parenthesised groups
    (groups that contain no further parentheses).  For each such group the
    first three characters (e.g. ``"(2 "``) and the closing ``")"`` are
    dropped and the remainder is recorded as a word.

    Parameters
    ----------
    fn : str
        Path of the text file to scan.

    Returns
    -------
    list of str
        The distinct words, sorted so the result is the same on every run
        (plain set iteration order is not stable between runs).
    """
    # An innermost group is "(" + anything without parentheses + ")".
    # The earlier pattern '[^(^)]' also excluded '^' by accident, which
    # silently dropped every word containing that character.
    leaf_pattern = re.compile(r'\([^()]*\)')
    vocabulary = set()
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(fn) as f:
        for line in f:
            vocabulary.update(leaf[3:-1] for leaf in leaf_pattern.findall(line))
    return sorted(vocabulary)
def save_vocabulary(fn_out, vocabulary):
    """Write a vocabulary to a text file, one entry per line.

    Parameters
    ----------
    fn_out : str
        Path of the file to create; an existing file is overwritten.
    vocabulary : iterable
        Entries to write.  Each one is formatted with ``%s`` and followed
        by a newline.
    """
    # ``with`` guarantees the file is closed even if a write fails.
    with open(fn_out, 'w') as f:
        f.writelines('%s\n' % word for word in vocabulary)
def _parse_proto_line(line):
    """Convert one bracketed tree line into ``(nested_list, node_count)``."""
    # Every node, leaf or internal, opens with "(<score> ", so counting
    # those openers counts the nodes.
    N = len(re.findall(r'\([0-4] ', line))

    # "(2 " -> "(2 ,": the score becomes the first element of a list.
    text = re.sub(r'\(([0-4]) ', r'(\1 ,', line)

    def quote_leaf(match):
        leaf = match.group(0)  # e.g. '(2 ,word)'
        # repr() escapes quotes and backslashes, so any word becomes a
        # valid string literal.
        return '%s%r%s' % (leaf[:4], leaf[4:-1], leaf[-1])

    # Leaves are the innermost groups, i.e. those without parentheses.
    text = re.sub(r'\([^()]*\)', quote_leaf, text)

    # Separate sibling nodes with commas, then turn groups into lists.
    text = text.replace(') (', '), (')
    text = text.replace(')', ']').replace('(', '[')

    # literal_eval only accepts literals, so text coming from the data
    # file can never execute code (a plain eval() could).
    return ast.literal_eval(text), N


def get_proto_tree(f):
    """Read the next tree from a file of bracketed (PTB-style) trees.

    Each tree occupies one line, and every node is written as
    ``(<score> ...)`` with a score between 0 and 4.  For example the line ::

        (1 (2 not) (4 (2 very) (3 good)))

    is returned as ::

        ([1, [2, 'not'], [4, [2, 'very'], [3, 'good']]], 5)

    Parameters
    ----------
    f : file object or str
        An open text file, from which the next non-blank line is consumed,
        or a file name, in which case the file is opened, its first
        non-blank line is parsed, and the file is closed again.

    Returns
    -------
    tuple or None
        ``(tree, N)`` where ``tree`` is a nested list whose first element is
        the node's score, followed by either the word (leaf) or the child
        nodes, and ``N`` is the number of nodes in the tree.  ``None`` is
        returned when the end of the file has been reached.

    Raises
    ------
    SyntaxError, ValueError
        If the line is not a well-formed tree.
    """
    if isinstance(f, str):
        # The file is opened here, so it is closed here as well.
        with open(f) as handle:
            return get_proto_tree(handle)

    while True:
        line = f.readline()
        if not line:
            # readline() returns '' only at end of file.
            return None
        line = line.strip()
        if line:
            return _parse_proto_line(line)
        # Blank line: skip it instead of failing to parse an empty string.
def read_proto_trees(fn_data):
    """Parse every tree stored in a file.

    ``get_proto_tree`` is called repeatedly on the open file until it
    returns ``None``.

    Parameters
    ----------
    fn_data : str
        Path of the text file to read.

    Returns
    -------
    trees : list
        First element of each result returned by ``get_proto_tree``, in
        file order.
    Ns : list
        Second element of each result, aligned with ``trees``.
    """
    trees = []
    Ns = []
    # ``with`` closes the file even when a line fails to parse.
    with open(fn_data) as f:
        while True:
            parsed = get_proto_tree(f)
            if parsed is None:
                break
            trees.append(parsed[0])
            Ns.append(parsed[1])
    return trees, Ns