-
Notifications
You must be signed in to change notification settings - Fork 1
/
spider.py
155 lines (128 loc) · 4.89 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import sqlite3
import urllib
import ssl
from urlparse import urljoin
from urlparse import urlparse
from BeautifulSoup import *
# Deal with SSL certificate anomalies Python > 2.7
# scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
scontext = None
s_no=1
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()
cur.execute('''CREATE TABLE IF NOT EXISTS Pages
(id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
error INTEGER, old_rank REAL, new_rank REAL)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Links
(from_id INTEGER, to_id INTEGER)''')
cur.execute('''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''')
# Check to see if we are already in progress...
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()
if row is not None:
print "Restarting existing crawl. Remove spider.sqlite to start a fresh crawl."
else :
starturl = raw_input('Enter web url or enter: ')
if ( len(starturl) < 1 ) : starturl = 'http://python-data.dr-chuck.net/'
if ( starturl.endswith('/') ) : starturl = starturl[:-1]
web = starturl
if ( starturl.endswith('.htm') or starturl.endswith('.html') ) :
pos = starturl.rfind('/')
web = starturl[:pos]
if ( len(web) > 1 ) :
cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', ( web, ) )
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( starturl, ) )
conn.commit()
# Get the current webs
cur.execute('''SELECT url FROM Webs''')
webs = list()
for row in cur:
webs.append(str(row[0]))
print webs
many = 0
while True:
if ( many < 1 ) :
sval = raw_input('How many pages:')
if ( len(sval) < 1 ) : break
many = int(sval)
many = many - 1
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
try:
row = cur.fetchone()
# print row
fromid = row[0]
url = row[1]
except:
print 'No unretrieved HTML pages found'
many = 0
break
print s_no,fromid, url,
s_no+=1
# If we are retrieving this page, there should be no links from it
cur.execute('DELETE from Links WHERE from_id=?', (fromid, ) )
try:
# Deal with SSL certificate anomalies Python > 2.7
# scontext = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
# document = urllib.urlopen(url, context=scontext)
# Normal Unless you encounter certificate problems
document = urllib.urlopen(url)
html = document.read()
if document.getcode() != 200 :
print "Error on page: ",document.getcode()
cur.execute('UPDATE Pages SET error=? WHERE url=?', (document.getcode(), url) )
if 'text/html' != document.info().gettype() :
print "Ignore non text/html page"
cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
conn.commit()
continue
print '('+str(len(html))+')',
soup = BeautifulSoup(html)
except KeyboardInterrupt:
print ''
print 'Program interrupted by user...'
break
except:
print "Unable to retrieve or parse page"
cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url, ) )
conn.commit()
continue
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( url, ) )
cur.execute('UPDATE Pages SET html=? WHERE url=?', (buffer(html), url ) )
conn.commit()
# Retrieve all of the anchor tags
tags = soup('a')
count = 0
for tag in tags:
href = tag.get('href', None)
if ( href is None ) : continue
# Resolve relative references like href="/contact"
up = urlparse(href)
if ( len(up.scheme) < 1 ) :
href = urljoin(url, href)
ipos = href.find('#')
if ( ipos > 1 ) : href = href[:ipos]
if ( href.endswith('.png') or href.endswith('.jpg') or href.endswith('.gif') or not href.endswith('.html')) : continue
if ( href.endswith('/') ) : href = href[:-1]
if ( len(href) < 1 ) : continue
# Check if the URL is in any of the webs
# found = False
# for web in webs:
# if ( href.startswith(web) ) :
# found = True
# break
# if not found : continue
# print href
cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', ( href, ) )
count = count + 1
conn.commit()
cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', ( href, ))
try:
row = cur.fetchone()
toid = row[0]
except:
print 'Could not retrieve id'
continue
# print fromid, toid
cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', ( fromid, toid ) )
print count
cur.close()