-
Notifications
You must be signed in to change notification settings - Fork 0
/
Searcher.py
124 lines (103 loc) · 3.85 KB
/
Searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#15786064(Zijian Chen), 13713641(Qingshuang Su), 70518431(Lingxin Li), 90277259(Jiahao(Kylin) Guo)
import os
import json
import time
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
# Module-level lookup table indexed by document ID (RankTop prints its
# values as result URLs). Starts empty; read_docID() replaces it.
docid = {}
def read_docID(root):
    """Load ``docID.json`` from *root* into the module-level ``docid``.

    Args:
        root: Directory that contains the ``docID.json`` file.

    The parsed JSON replaces whatever ``docid`` held before; nothing is
    returned.
    """
    global docid
    docid_path = os.path.join(root, 'docID.json')
    with open(docid_path, 'r', encoding='utf-8') as handle:
        docid = json.loads(handle.read())
def Load_json(query_token, root):
    """Return the JSON content stored on disk for one query token.

    Token files live at ``<root>/<first character of token>/<token>.json``.
    The token ``"aux"`` is the one exception: it is read from ``aux_.json``.

    Args:
        query_token: The (already stemmed) query term to look up.
        root: Root directory of the on-disk index.

    Returns:
        The parsed JSON content of the token's file, or an empty dict when
        the token is empty, the file is missing/unreadable, or its content
        is not valid JSON.
    """
    if not query_token:
        # An empty token has no first character to pick a sub-directory from.
        return {}
    # NOTE(review): the "aux" special case presumably exists because AUX is
    # a reserved file name on Windows -- confirm against the indexer.
    file_stem = query_token + '_' if query_token == "aux" else query_token
    json_file = os.path.join(root, query_token[0], file_stem + '.json')
    try:
        with open(json_file, 'r', encoding='utf-8') as idx:
            return json.load(idx)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: malformed JSON or
        # undecodable bytes. Either way the term is treated as not indexed,
        # while unrelated errors (e.g. KeyboardInterrupt) still propagate.
        return {}
def tokenize(content):
    """Turn a query string into a list of stemmed terms.

    The text is split with NLTK's ``word_tokenize``; tokens that are not
    purely alphanumeric are dropped and the remaining ones are stemmed with
    the English Snowball stemmer.

    Args:
        content: Raw query text.

    Returns:
        The stemmed tokens, in their original order.
    """
    english_stemmer = SnowballStemmer("english")
    stems = []
    for word in word_tokenize(content):
        if not word.isalnum():
            continue
        stems.append(english_stemmer.stem(word))
    return stems
def Query_search(query, root):
    """Collect the stored postings for every term of a query.

    Args:
        query: Raw query text; it is tokenized with ``tokenize``.
        root: Root directory of the on-disk index, passed to ``Load_json``.

    Returns:
        One ``Load_json`` result per query token, ordered from the smallest
        to the largest number of entries.
    """
    postings = [Load_json(term, root) for term in tokenize(query)]
    postings.sort(key=len)
    return postings
def Merge_query(query_lst):
    """Intersect the postings of all query terms and rank the shared IDs.

    Args:
        query_lst: List of posting dicts, one per query term, each mapping
            an ID to a numeric value (presumably a relevance score).

    Returns:
        The IDs present in every posting dict, ordered by the sum of their
        values across all dicts, highest first. Ties keep the order in which
        the IDs appear in the first dict. An empty list is returned when
        ``query_lst`` is empty or the dicts share no ID.
    """
    if not query_lst:
        return []
    # Walk the first dict so the candidate order matches its key order.
    shared_ids = [doc for doc in query_lst[0]
                  if all(doc in posting for posting in query_lst)]
    totals = {doc: sum(posting[doc] for posting in query_lst)
              for doc in shared_ids}
    return sorted(totals, key=totals.get, reverse=True)
def RankTop(sortPost, top_k=5):
    """Print and return the URLs of the best-ranked documents.

    Args:
        sortPost: Document IDs ordered from best to worst match.
        top_k: Maximum number of results to report. Defaults to 5, the
            limit that used to be hard-coded; values below 0 act like 0.

    Returns:
        The module-level ``docid`` entries for the first ``top_k`` IDs, in
        ranking order (fewer when ``sortPost`` is shorter).

    Raises:
        KeyError: If one of the reported IDs is missing from ``docid``.
    """
    top_urls = []
    # max() guards against a negative top_k turning into a "drop the tail"
    # slice.
    for doc in sortPost[:max(top_k, 0)]:
        url = docid[doc]
        print(url)
        top_urls.append(url)
    return top_urls
def run(root):
    """Run the text user interface of the search engine.

    Prompts for queries until an empty line is entered (or standard input
    is closed). For every query it prints the elapsed search time, the
    number of matching URLs and the top results.

    Args:
        root: Root directory of the on-disk index; it must contain
            ``docID.json``, which is loaded on the first non-empty query.
    """
    docid_loaded = False
    while True:
        try:
            query = input("Query (ENTER Key to exit):\n")
        except EOFError:
            # Closed or exhausted stdin: exit cleanly instead of crashing.
            query = ""
        if query == "":
            print("EXIT THE SEARCH ENGINE")
            break
        # perf_counter() is a wall clock, so time spent blocked on reading
        # the index files is included; process_time() only counts CPU time.
        start = time.perf_counter()
        if not docid_loaded:
            read_docID(root)
            docid_loaded = True
        resultUrl = Merge_query(Query_search(query, root))
        end = time.perf_counter()
        if not resultUrl:
            print("")
            print("No result found")
        else:
            print(f"\nSearch time: {end - start} seconds. Total {len(resultUrl)} Urls are found.")
            if len(resultUrl) >= 5:
                print("Top 5 results:")
            else:
                print(f"Top {len(resultUrl)} results (results found are less than 5):")
            RankTop(resultUrl)
        # NOTE(review): the original indentation was lost; this trailing
        # blank line is assumed to follow every query -- confirm.
        print("")
def UInterface(query, root):
    """Answer a single query and return the output lines instead of printing.

    Args:
        query: Raw query text.
        root: Root directory of the on-disk index; ``docID.json`` is
            reloaded from it on every call.

    Returns:
        ``["No result found"]`` when nothing matches. Otherwise a list with
        a timing/count summary line, a "Top N results" header line, and then
        the values returned by ``RankTop`` (which also prints them).
    """
    # perf_counter() is a wall clock, so time spent blocked on reading the
    # index files is included; process_time() only counts CPU time.
    start = time.perf_counter()
    read_docID(root)
    resultUrl = Merge_query(Query_search(query, root))
    if not resultUrl:
        return ["No result found"]
    elapsed = time.perf_counter() - start
    infolst = [f"\nSearch time: {elapsed} seconds. Total {len(resultUrl)} Urls are found."]
    if len(resultUrl) >= 5:
        infolst.append("Top 5 results:")
    else:
        infolst.append(f"Top {len(resultUrl)} results (results found are less than 5):")
    infolst.extend(RankTop(resultUrl))
    return infolst
if __name__ == '__main__':
    # Raw string: "\C", "\A" and "\T" are not valid escape sequences, so the
    # plain literal triggers an invalid-escape warning. The path value
    # itself is unchanged.
    root = r'B:\CS 121\Assignment3M3\TEST'
    run(root)