# Source: summarizer-model.py — extractive web-page text summarizer (103 lines, 2.84 KB)
# Imports and one-time NLTK setup for the summarizer script.
import string
import re
import urllib.request as url

import bs4
#!pip3 install nltk
import nltk

# The tokenizer models and stopword corpus must exist locally before
# sent_tokenize / word_tokenize / stopwords.words can be used.
nltk.download('punkt')
nltk.download('stopwords')

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# A set gives O(1) membership tests in the frequency loop below;
# the original list made every stopword lookup O(n).
stop_word = set(stopwords.words('english'))
# Fetch the page and concatenate the text of every <p> tag into one string.
url_name = input("Enter the URL of the page you want to summarize:\n")
# Parsing the HTML file and storing a list of all paragraph tags in it.
# The context manager closes the HTTP response (the original leaked it).
with url.urlopen(url_name) as web:
    page = bs4.BeautifulSoup(web, 'html.parser')
# ''.join avoids the quadratic += concatenation of the original loop.
article = ''.join(p.text for p in page.find_all('p'))
#Processing the data to remove irrelavant text
processed = article.replace(r'^\s+|\s+?$','')
processed = processed.replace('\n',' ')
processed = processed.replace("\\",'')
processed = processed.replace('===','')
processed = processed.replace(",",'')
processed = processed.replace('"','')
processed = processed.replace(';','')
processed = processed.replace('\t','')
processed = processed.replace('{','')
processed = processed.replace('}','')
processed = re.sub(r'\[[0-9]*\]','',processed)
processed
# Function to clean the sentences.
def cleanSentence(sentences):
    """Return *sentences* with parenthesis-heavy entries removed.

    A sentence containing 10 or more '(' characters is assumed to be
    code/markup noise rather than prose and is dropped.

    BUG FIX: the original popped from the list while iterating over it
    and kept advancing its index after each pop, so it skipped elements
    and could remove the wrong sentence. Building a new list avoids
    both problems and leaves the caller's list untouched.
    """
    return [sentence for sentence in sentences if sentence.count('(') < 10]
# Split the cleaned text into sentences, drop the parenthesis-heavy
# ones, and rejoin the survivors with periods.
dirty_sentences = sent_tokenize(processed)
cleaned = cleanSentence(dirty_sentences)
cleaned_string = ".".join(cleaned)
# Counting the frequency of all the tokenized words, then normalising
# each count to [0, 1] by dividing by the most frequent word's count.
frequency = {}
processed1 = cleaned_string.lower()
for word in word_tokenize(processed1):
    if word not in stop_word:
        # dict.get collapses the original's membership-test-then-index
        # pattern into a single lookup.
        frequency[word] = frequency.get(word, 0) + 1
# Calculating and storing the importance values of the words.
# default=1 guards against an empty article: max() on an empty
# sequence raised ValueError in the original.
max_fre = max(frequency.values(), default=1)
for word in frequency:
    frequency[word] = frequency[word] / max_fre
# Calculating the sentence scores: each (short) sentence scores the sum
# of the normalised importance values of its words.
sentence_score = {}
for sent in cleaned:
    # Only reasonably short sentences are summary candidates; hoisted out
    # of the word loop since it does not depend on the word.
    if len(sent.split(' ')) >= 30:
        continue
    for word in word_tokenize(sent):
        # BUG FIX: frequency was built from lowercased text, but the
        # original compared raw tokens against it, so capitalised words
        # (e.g. sentence-initial ones) never matched and scored nothing.
        token = word.lower()
        if token in frequency:
            sentence_score[sent] = sentence_score.get(sent, 0) + frequency[token]
import heapq

# Report how many candidate sentences were scored, ask the user for the
# desired summary length, then print the highest-scoring sentences.
print("\nTotal number of sentences in the dictionary={0}\n".format(len(sentence_score)))
number = int(input("Enter the number of sentences you want the summary to contain:\n"))
# Storing the summary according to the sentence importances.
top_sentences = heapq.nlargest(number, sentence_score, key=sentence_score.get)
summary = ' '.join(top_sentences)
final = "SUMMARY:- \n " + summary
print("\n===============SUMMARY===============\n")
print("\n")
print(summary)
print("\n\n=====================================\n")