Merge pull request #394 from nsapa/nico_fix_1
Nico's fixes (dumping wiki.dystify.com/CI fixes)
nemobis authored Aug 28, 2020
2 parents 9b1996d + 5986467 commit 0cfde9e
Showing 2 changed files with 70 additions and 29 deletions.
67 changes: 54 additions & 13 deletions dumpgenerator.py
@@ -299,16 +299,22 @@ def getPageTitlesScraper(config={}, session=None):
else:
pass # perhaps no subpages

# 3 is the current deep of English Wikipedia for Special:Allpages
deep = 3
# Should be enought subpages on Special:Allpages
deep = 50
c = 0
oldfr = ''
checked_suballpages = []
rawacum = raw
while r_suballpages and re.search(r_suballpages, raw) and c < deep:
# load sub-Allpages
m = re.compile(r_suballpages).finditer(raw)
for i in m:
fr = i.group('from')
currfr = fr

if oldfr == currfr:
# We are looping, exit the loop
pass

if r_suballpages == r_suballpages1:
to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
config['index'], name, namespace)



if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
print ' Reading', name, len(raw2), 'bytes', \
len(re.findall(r_suballpages, raw2)), 'subpages', \
len(re.findall(r_title, raw2)), 'pages'
r = session.get(url=url, timeout=10)
#print 'Fetching URL: ', url
raw = r.text
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print ' Reading', name, len(raw), 'bytes', \
len(re.findall(r_suballpages, raw)), 'subpages', \
len(re.findall(r_title, raw)), 'pages'

delay(config=config, session=session)
oldfr = currfr
c += 1

c = 0
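
The two hunks above change the Special:Allpages scraper: the crawl depth goes from 3 to 50, the previous 'from' value is remembered in oldfr, and already-downloaded subpage listings are skipped via checked_suballpages. A simplified, hypothetical sketch of that walk (the function name, regex and parameters below are illustrative, not the committed code):

import re
import requests

# Hypothetical sketch of the Special:Allpages walk touched above: follow
# "from=" subpage links, skip ranges already visited, and stop when the first
# "from" value of a round repeats, i.e. the wiki keeps serving the same listing.
def walk_allpages(index_url, namespace=0, max_rounds=50):
    session = requests.Session()
    r_subpage = r'from=(?P<from>[^&"]+)'
    params = {'title': 'Special:Allpages', 'namespace': namespace}
    raw = session.get(index_url, params=params, timeout=10).text
    accumulated = raw
    seen = set()
    previous_from = None
    for _ in range(max_rounds):
        froms = [m.group('from') for m in re.finditer(r_subpage, raw)]
        if not froms or froms[0] == previous_from:
            break  # no more subpages, or the same range came back: stop
        previous_from = froms[0]
        for fr in froms:
            if fr in seen:
                continue  # do not re-download a subpage range we already have
            seen.add(fr)
            raw = session.get(index_url, params=dict(params, **{'from': fr}),
                              timeout=10).text
            accumulated += raw
    return accumulated

The seen set plays the role of checked_suballpages in the real function.
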
@@ -497,8 +507,9 @@ def getUserAgent():
""" Return a cool user-agent to hide Python user-agent """
useragents = [
# firefox
'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
#'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
#'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'
]
return useragents[0]

@@ -574,6 +585,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
except requests.exceptions.ReadTimeout as e:
print ' Read timeout: %s'%(str(e[0]))
xml = ''
c += 1

return xml
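
The getXMLPageCore hunk adds requests.exceptions.ReadTimeout alongside the existing ConnectionError handler, so a slow read is retried instead of aborting the dump. A minimal, self-contained sketch of that retry pattern (the function name, retry count and return convention are assumptions, not the committed code):

import requests

# Illustrative retry helper: a read timeout is treated like a connection
# error, the attempt counter keeps advancing, and an empty string is returned
# when every attempt fails, leaving the decision to the caller.
def fetch_with_retries(url, retries=3, timeout=10):
    session = requests.Session()
    for _ in range(retries):
        try:
            return session.get(url, timeout=timeout).text
        except requests.exceptions.ConnectionError as e:
            print('    Connection error: %s' % str(e))
        except requests.exceptions.ReadTimeout as e:
            print('    Read timeout: %s' % str(e))
    return ''
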
@@ -1471,7 +1485,29 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
print 'Filename is too long, truncating. Now it is:', filename2
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
r = requests.get(url=url)

r = session.head(url=url, allow_redirects=True)
original_url_redirected = len(r.history) > 0

if original_url_redirected:
#print 'Site is redirecting us to: ', r.url
original_url = url
url = r.url

r = session.get(url=url, allow_redirects=False)

# Try to fix a broken HTTP to HTTPS redirect
if r.status_code == 404 and original_url_redirected:
if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
url = 'https://' + original_url.split("://")[1]
#print 'Maybe a broken http to https redirect, trying ', url
r = session.get(url=url, allow_redirects=False)

if r.status_code == 404:
logerror(
config=config,
text=u'File %s at URL %s is missing' % (filename2,url))

imagefile.write(r.content)
imagefile.close()
# saving description if any
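
The generateImageDump hunk above replaces the plain requests.get with a HEAD probe that records whether the file URL is redirected, then retries an http URL as https when the redirected download returns 404, a common symptom of a broken http-to-https redirect. A standalone sketch of that fallback under those assumptions (the function name and error handling are illustrative):

import requests

# Hypothetical sketch of the http->https fallback added for image downloads:
# probe with HEAD to see whether the URL redirects, download without following
# redirects, and if a redirected URL 404s, retry the original URL over https.
def download_image(session, url):
    head = session.head(url, allow_redirects=True, timeout=10)
    was_redirected = len(head.history) > 0
    original_url = url
    if was_redirected:
        url = head.url
    r = session.get(url, allow_redirects=False, timeout=10)
    if (r.status_code == 404 and was_redirected
            and original_url.startswith('http://')
            and url.startswith('https://')):
        url = 'https://' + original_url.split('://', 1)[1]
        r = session.get(url, allow_redirects=False, timeout=10)
    if r.status_code == 404:
        raise IOError('file at URL %s is missing' % url)
    return r.content

With a requests.Session it could be called as download_image(session, 'http://example.org/images/File.png'); the caller writes the returned bytes to disk.
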
@@ -1494,9 +1530,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):

f = open('%s/%s.desc' % (imagepath, filename2), 'w')
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
if not re.search(r'</mediawiki>', xmlfiledesc):
if not re.search(r'</page>', xmlfiledesc):
# failure when retrieving desc? then save it as empty .desc
xmlfiledesc = ''

# Fixup the XML
if xmlfiledesc is not '' and not re.search(r'</mediawiki>', xmlfiledesc):
xmlfiledesc += '</mediawiki>'

f.write(xmlfiledesc.encode('utf-8'))
f.close()
delay(config=config, session=session)
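The last dumpgenerator.py hunk relaxes the description check: a missing </page> now marks a failed fetch, while a description that only lacks </mediawiki> gets the closing tag appended before being written to the .desc file. A hedged sketch of that fixup as a standalone helper (the name fixup_description is made up):

import re

# Illustrative helper: an export with no </page> is treated as a failed fetch
# and saved as an empty .desc, while one that only lacks </mediawiki> gets its
# root element closed before being written out.
def fixup_description(xmlfiledesc):
    if not re.search(r'</page>', xmlfiledesc):
        return ''
    if not re.search(r'</mediawiki>', xmlfiledesc):
        xmlfiledesc += '</mediawiki>'
    return xmlfiledesc
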
32 changes: 16 additions & 16 deletions testing/test_dumpgenerator.py
@@ -62,7 +62,7 @@ def test_getImages(self):
tests = [
# Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],

# Editthis wikifarm
@@ -146,7 +146,7 @@ def test_getPageTitles(self):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [
# Alone wikis
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'April Fools\' Day'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

# Test old allpages API behaviour
@@ -206,7 +206,7 @@ def test_getWikiEngine(self):
tests = [
['https://www.dokuwiki.org', 'DokuWiki'],
#['http://wiki.openwrt.org', 'DokuWiki'],
['http://skilledtests.com/wiki/', 'MediaWiki'],
#['http://skilledtests.com/wiki/', 'MediaWiki'],
#['http://moinmo.in', 'MoinMoin'],
['https://wiki.debian.org', 'MoinMoin'],
['http://twiki.org/cgi-bin/view/', 'TWiki'],
@@ -219,42 +219,42 @@ def test_getWikiEngine(self):
['http://www.wasteflake.com/', 'TikiWiki'],
['http://foswiki.org/', 'FosWiki'],
['http://www.w3c.br/Home/WebHome', 'FosWiki'],
['http://mojomojo.org/', 'MojoMojo'],
['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
#['http://mojomojo.org/', 'MojoMojo'],
#['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
#['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
#['https://web.archive.org/web/20080517021020id_/http://berlin.xwiki.com/xwiki/bin/view/Main/WebHome', 'XWiki'],
['http://www.xwiki.org/xwiki/bin/view/Main/WebHome', 'XWiki'],
['https://confluence.atlassian.com/', 'Confluence'],
#['https://wiki.hybris.com/dashboard.action', 'Confluence'],
['https://confluence.sakaiproject.org/', 'Confluence'],
#['http://demo.bananadance.org/', 'Banana Dance'],
['http://wagn.org/', 'Wagn'],
['http://wiki.ace-mod.net/', 'Wagn'],
#['http://wiki.ace-mod.net/', 'Wagn'],
#['https://success.mindtouch.com/', 'MindTouch'],
#['https://jspwiki.apache.org/', 'JSPWiki'],
['http://www.ihear.com/FreeCLAS/', 'JSPWiki'],
['http://www.wikkawiki.org/HomePage', 'WikkaWiki'],
['http://puppylinux.org/wikka/', 'WikkaWiki'],
['http://cs.netsville.com/wiki/wikka.php', 'WikkaWiki'],
#['http://puppylinux.org/wikka/', 'WikkaWiki'],
['https://www.cybersphere.net/', 'MediaWiki'],
#['http://web.archive.org/web/20060717202033id_/http://www.comawiki.org/CoMa.php?CoMa=startseite', 'CoMaWiki'],
['http://bootbook.de/CoMa.php', 'CoMaWiki'],
#['http://wikini.net/wakka.php', 'WikiNi'],
['http://wiki.raydium.org/wiki/', 'WikiNi'],
['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
#['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
#['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
['http://www.sw4me.com/wiki/', 'WackoWiki'],
['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
#['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
['http://kw.pm.org/wiki/index.cgi', 'Kwiki'],
['http://wiki.wubi.org/index.cgi', 'Kwiki'],
#['http://perl.bristolbath.org/index.cgi', 'Kwiki'],
['http://www.anwiki.com/', 'Anwiki'],
['http://www.anw.fr/', 'Anwiki'],
#['http://www.anwiki.com/', 'Anwiki'],
#['http://www.anw.fr/', 'Anwiki'],
['http://www.aneuch.org/', 'Aneuch'],
['http://doc.myunixhost.com/', 'Aneuch'],
['http://www.bitweaver.org/wiki/index.php', 'bitweaver'],
['http://wiki.e-shell.org/Home', 'Zwiki'],
['http://leo.zwiki.org/', 'Zwiki'],
['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
#['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
['http://darksouls.wikidot.com/', 'Wikidot'],
['http://www.wikifoundrycentral.com/', 'Wetpaint'],
['http://wiki.openid.net/', 'PBworks'],
@@ -273,7 +273,7 @@ def test_mwGetAPIAndIndex(self):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [
# Alone wikis
['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
['https://archiveteam.org', 'https://archiveteam.org/api.php', 'https://archiveteam.org/index.php'],
#['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],

# Editthis wikifarm
