This repository has been archived by the owner on Aug 30, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
HtmlParser.py
96 lines (75 loc) · 3.1 KB
/
HtmlParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# -*- coding: utf-8 -*-
import re
import urlparse
from bs4 import BeautifulSoup
# Parser that analyses HTML pages (links, title and summary extraction).
class HtmlParser(object):
    """Extract links, the title and the summary from an HTML page.

    Expected call order: ``set_content(html)``, then ``parser()``, then any
    of the ``get_*`` accessors.

    Attributes:
        minLength: Minimum number of characters a summary must contain,
            after whitespace and punctuation are stripped, to be accepted.
        hostName: Base URL that relative links are resolved against.
    """

    # Whitespace and ASCII/CJK punctuation that does not count as content.
    # A unicode literal is used so that, on Python 2, the non-ASCII
    # characters are matched as whole characters instead of as individual
    # UTF-8 bytes; backslashes are doubled so the literal contains no
    # invalid string escapes on Python 3.
    __PUNCTUATION = re.compile(
        u"[\\s+.!/_,$%^*(\"']+|[+——!,。?、~@#¥%……&*()]+",
        re.UNICODE)

    def __init__(self, hostName):
        """Create a parser whose relative links resolve against hostName."""
        self.minLength = 1
        self.hostName = hostName
        self.__content = None
        self.__beautifulSoup = None
        # Links of interest look like "/view/<digits>.htm".
        self.__url_feature = r'/view/\d+\.htm'

    def set_content(self, content):
        """Store the raw HTML that parser() will analyse.

        Any previously parsed tree is discarded so the accessors can never
        return data belonging to an older page; call parser() again after
        setting new content.

        Raises:
            ValueError: If content is None or empty.
        """
        if not content:
            raise ValueError("Content is not valid !")
        self.__content = content
        self.__beautifulSoup = None

    # Parse the page so that URL links (and other fields) can be extracted.
    def parser(self):
        """Build the BeautifulSoup tree from the stored content.

        Raises:
            ValueError: If no content has been set.
        """
        if not self.__content:
            raise ValueError("You must first set content in using set_content method")
        options = {}
        # from_encoding is only meaningful for byte input; bs4 ignores it
        # (with a warning) when the markup is already decoded text.
        if isinstance(self.__content, bytes):
            options["from_encoding"] = "utf-8"
        self.__beautifulSoup = BeautifulSoup(
            self.__content, "html.parser", **options)

    def get_new_urls(self):
        """Return absolute URLs of the <a> tags whose href matches the
        "/view/<digits>.htm" pattern, resolved against hostName.

        Raises:
            ValueError: If parser() has not been called.
        """
        soup = self.__require_soup()
        anchors = soup.find_all('a', href=re.compile(self.__url_feature))
        hrefs = (anchor.get('href') for anchor in anchors)
        return [urlparse.urljoin(self.hostName, href)
                for href in hrefs if href]

    def get_context(self):
        """Return the raw content last passed to set_content (or None)."""
        return self.__content

    @property
    def get_title(self):
        """Title text of the page with newlines removed.

        NOTE: this is a property, so it is read as ``obj.get_title``
        (no parentheses).

        Raises:
            ValueError: If no content is set, parser() has not been called,
                or the title elements are missing from the page.
        """
        if self.__content is None:
            raise ValueError("Content is not valid")
        soup = self.__require_soup()
        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        titleClass = soup.find("dd", class_="lemmaWgt-lemmaTitle-title")
        if titleClass is None:
            raise ValueError("Title class is not exist !")
        titleTag = titleClass.find("h1")
        if titleTag is None:
            raise ValueError("Title tag is not exist !")
        return titleTag.get_text().replace('\n', '')

    def get_summary(self):
        """Return the summary text of the page with newlines removed.

        The "lemma-summary" div is preferred; when it is missing or has too
        little real content, the first "para" div is used instead.

        Returns:
            The summary text, or "" when no content has been set.

        Raises:
            ValueError: If parser() has not been called, or neither div
                yields at least minLength characters of content.
        """
        if self.__content is None:
            return ""
        soup = self.__require_soup()
        # <div class="lemma-summary" label-module="lemmaSummary">
        # <div class="para" label-module="para">
        for cssClass in ("lemma-summary", "para"):
            summaryTag = soup.find("div", class_=cssClass)
            if summaryTag is None:
                continue
            text = summaryTag.get_text()
            if not self.__is_shorter_than_minLength(text):
                return text.replace('\n', '')
        raise ValueError("Summary Tag is not exist !")

    def get_beautiful_soup(self):
        """Return the parsed tree, or None if parser() has not run since
        the content was last set."""
        return self.__beautifulSoup

    # def set_host_name(self, host):
    #     self.hostName = host

    def __require_soup(self):
        """Return the parsed tree, raising ValueError if it is missing."""
        if self.__beautifulSoup is None:
            raise ValueError("You must first parse the content using parser method")
        return self.__beautifulSoup

    def __is_shorter_than_minLength(self, content):
        """Tell whether content has fewer than minLength characters once
        whitespace and punctuation are removed."""
        return len(self.__PUNCTUATION.sub(u"", content)) < self.minLength
# html_doc = """
# soup = BeautifulSoup(html_doc, "html.parser")
# print soup.a
# print soup.title.name
# print soup.title.parent.name
# print soup.p
# print soup.p['class']
#
# links = soup.find_all('a')
# for link in links:
# print link.get_text()
# print soup.find(id="link3")
# print soup.p.get_text()