Merge pull request #394 from nsapa/nico_fix_1
Nico's fixes (dumping wiki.dystify.com/CI fixes)
nemobis authored Aug 28, 2020
2 parents 9b1996d + 5986467 commit 0cfde9e
Showing 2 changed files with 70 additions and 29 deletions.
67 changes: 54 additions & 13 deletions dumpgenerator.py
@@ -299,16 +299,22 @@ def getPageTitlesScraper(config={}, session=None):
else:
pass # perhaps no subpages

# 3 is the current deep of English Wikipedia for Special:Allpages
deep = 3
# Should be enought subpages on Special:Allpages
deep = 50
c = 0
oldfr = ''
checked_suballpages = []
rawacum = raw
while r_suballpages and re.search(r_suballpages, raw) and c < deep:
# load sub-Allpages
m = re.compile(r_suballpages).finditer(raw)
for i in m:
fr = i.group('from')
currfr = fr

if oldfr == currfr:
# We are looping, exit the loop
pass

if r_suballpages == r_suballpages1:
to = i.group('to')
@@ -329,19 +335,23 @@ def getPageTitlesScraper(config={}, session=None):
url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
config['index'], name, namespace)



if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
delay(config=config, session=session)
r2 = session.get(url=url, timeout=10)
raw2 = r2.text
raw2 = cleanHTML(raw2)
rawacum += raw2 # merge it after removed junk
print ' Reading', name, len(raw2), 'bytes', \
len(re.findall(r_suballpages, raw2)), 'subpages', \
len(re.findall(r_title, raw2)), 'pages'
r = session.get(url=url, timeout=10)
#print 'Fetching URL: ', url
raw = r.text
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print ' Reading', name, len(raw), 'bytes', \
len(re.findall(r_suballpages, raw)), 'subpages', \
len(re.findall(r_title, raw)), 'pages'

delay(config=config, session=session)
oldfr = currfr
c += 1

c = 0
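
The two hunks above change the Special:Allpages scraper: the crawl depth goes from 3 to 50, the previous 'from' value is remembered in oldfr, and already-downloaded subpage listings are skipped via checked_suballpages. A simplified, hypothetical sketch of that walk (the function name, regex and parameters below are illustrative, not the committed code):

import re
import requests

# Hypothetical sketch of the Special:Allpages walk touched above: follow
# "from=" subpage links, skip ranges already visited, and stop when the first
# "from" value of a round repeats, i.e. the wiki keeps serving the same listing.
def walk_allpages(index_url, namespace=0, max_rounds=50):
    session = requests.Session()
    r_subpage = r'from=(?P<from>[^&"]+)'
    params = {'title': 'Special:Allpages', 'namespace': namespace}
    raw = session.get(index_url, params=params, timeout=10).text
    accumulated = raw
    seen = set()
    previous_from = None
    for _ in range(max_rounds):
        froms = [m.group('from') for m in re.finditer(r_subpage, raw)]
        if not froms or froms[0] == previous_from:
            break  # no more subpages, or the same range came back: stop
        previous_from = froms[0]
        for fr in froms:
            if fr in seen:
                continue  # do not re-download a subpage range we already have
            seen.add(fr)
            raw = session.get(index_url, params=dict(params, **{'from': fr}),
                              timeout=10).text
            accumulated += raw
    return accumulated

The seen set plays the role of checked_suballpages in the real function.
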
@@ -497,8 +507,9 @@ def getUserAgent():
""" Return a cool user-agent to hide Python user-agent """
useragents = [
# firefox
'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
#'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
#'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
'Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0'
]
return useragents[0]

@@ -574,6 +585,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
except requests.exceptions.ConnectionError as e:
print ' Connection error: %s'%(str(e[0]))
xml = ''
except requests.exceptions.ReadTimeout as e:
print ' Read timeout: %s'%(str(e[0]))
xml = ''
c += 1

return xml
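
The getXMLPageCore hunk adds requests.exceptions.ReadTimeout alongside the existing ConnectionError handler, so a slow read is retried instead of aborting the dump. A minimal, self-contained sketch of that retry pattern (the function name, retry count and return convention are assumptions, not the committed code):

import requests

# Illustrative retry helper: a read timeout is treated like a connection
# error, the attempt counter keeps advancing, and an empty string is returned
# when every attempt fails, leaving the decision to the caller.
def fetch_with_retries(url, retries=3, timeout=10):
    session = requests.Session()
    for _ in range(retries):
        try:
            return session.get(url, timeout=timeout).text
        except requests.exceptions.ConnectionError as e:
            print('    Connection error: %s' % str(e))
        except requests.exceptions.ReadTimeout as e:
            print('    Read timeout: %s' % str(e))
    return ''
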
@@ -1471,7 +1485,29 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
print 'Filename is too long, truncating. Now it is:', filename2
filename3 = u'%s/%s' % (imagepath, filename2)
imagefile = open(filename3, 'wb')
r = requests.get(url=url)

r = session.head(url=url, allow_redirects=True)
original_url_redirected = len(r.history) > 0

if original_url_redirected:
#print 'Site is redirecting us to: ', r.url
original_url = url
url = r.url

r = session.get(url=url, allow_redirects=False)

# Try to fix a broken HTTP to HTTPS redirect
if r.status_code == 404 and original_url_redirected:
if original_url.split("://")[0] == "http" and url.split("://")[0] == "https":
url = 'https://' + original_url.split("://")[1]
#print 'Maybe a broken http to https redirect, trying ', url
r = session.get(url=url, allow_redirects=False)

if r.status_code == 404:
logerror(
config=config,
text=u'File %s at URL %s is missing' % (filename2,url))

imagefile.write(r.content)
imagefile.close()
# saving description if any
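
The generateImageDump hunk above replaces the plain requests.get with a HEAD probe that records whether the file URL is redirected, then retries an http URL as https when the redirected download returns 404, a common symptom of a broken http-to-https redirect. A standalone sketch of that fallback under those assumptions (the function name and error handling are illustrative):

import requests

# Hypothetical sketch of the http->https fallback added for image downloads:
# probe with HEAD to see whether the URL redirects, download without following
# redirects, and if a redirected URL 404s, retry the original URL over https.
def download_image(session, url):
    head = session.head(url, allow_redirects=True, timeout=10)
    was_redirected = len(head.history) > 0
    original_url = url
    if was_redirected:
        url = head.url
    r = session.get(url, allow_redirects=False, timeout=10)
    if (r.status_code == 404 and was_redirected
            and original_url.startswith('http://')
            and url.startswith('https://')):
        url = 'https://' + original_url.split('://', 1)[1]
        r = session.get(url, allow_redirects=False, timeout=10)
    if r.status_code == 404:
        raise IOError('file at URL %s is missing' % url)
    return r.content

With a requests.Session it could be called as download_image(session, 'http://example.org/images/File.png'); the caller writes the returned bytes to disk.
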
@@ -1494,9 +1530,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):

f = open('%s/%s.desc' % (imagepath, filename2), 'w')
# <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
if not re.search(r'</mediawiki>', xmlfiledesc):
if not re.search(r'</page>', xmlfiledesc):
# failure when retrieving desc? then save it as empty .desc
xmlfiledesc = ''

# Fixup the XML
if xmlfiledesc is not '' and not re.search(r'</mediawiki>', xmlfiledesc):
xmlfiledesc += '</mediawiki>'

f.write(xmlfiledesc.encode('utf-8'))
f.close()
delay(config=config, session=session)
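The last dumpgenerator.py hunk relaxes the description check: a missing </page> now marks a failed fetch, while a description that only lacks </mediawiki> gets the closing tag appended before being written to the .desc file. A hedged sketch of that fixup as a standalone helper (the name fixup_description is made up):

import re

# Illustrative helper: an export with no </page> is treated as a failed fetch
# and saved as an empty .desc, while one that only lacks </mediawiki> gets its
# root element closed before being written out.
def fixup_description(xmlfiledesc):
    if not re.search(r'</page>', xmlfiledesc):
        return ''
    if not re.search(r'</mediawiki>', xmlfiledesc):
        xmlfiledesc += '</mediawiki>'
    return xmlfiledesc
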
32 changes: 16 additions & 16 deletions testing/test_dumpgenerator.py
@@ -62,7 +62,7 @@ def test_getImages(self):
tests = [
# Alone wikis
#['http://wiki.annotation.jp/index.php', 'http://wiki.annotation.jp/api.php', u'かずさアノテーション - ソーシャル・ゲノム・アノテーション.jpg'],
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'Archive-is 2013-07-02 17-05-40.png'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Benham\'s disc (animated).gif'],

# Editthis wikifarm
@@ -146,7 +146,7 @@ def test_getPageTitles(self):
print '\n', '#'*73, '\n', 'test_getPageTitles', '\n', '#'*73
tests = [
# Alone wikis
['https://www.archiveteam.org/index.php', 'https://www.archiveteam.org/api.php', u'April Fools\' Day'],
['https://archiveteam.org/index.php', 'https://archiveteam.org/api.php', u'April Fools\' Day'],
#['http://skilledtests.com/wiki/index.php', 'http://skilledtests.com/wiki/api.php', u'Conway\'s Game of Life'],

# Test old allpages API behaviour
@@ -206,7 +206,7 @@ def test_getWikiEngine(self):
tests = [
['https://www.dokuwiki.org', 'DokuWiki'],
#['http://wiki.openwrt.org', 'DokuWiki'],
['http://skilledtests.com/wiki/', 'MediaWiki'],
#['http://skilledtests.com/wiki/', 'MediaWiki'],
#['http://moinmo.in', 'MoinMoin'],
['https://wiki.debian.org', 'MoinMoin'],
['http://twiki.org/cgi-bin/view/', 'TWiki'],
@@ -219,42 +219,42 @@ def test_getWikiEngine(self):
['http://www.wasteflake.com/', 'TikiWiki'],
['http://foswiki.org/', 'FosWiki'],
['http://www.w3c.br/Home/WebHome', 'FosWiki'],
['http://mojomojo.org/', 'MojoMojo'],
['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
#['http://mojomojo.org/', 'MojoMojo'],
#['http://wiki.catalystframework.org/wiki/', 'MojoMojo'],
#['https://www.ictu.nl/archief/wiki.noiv.nl/xwiki/bin/view/Main', 'XWiki'],
#['https://web.archive.org/web/20080517021020id_/http://berlin.xwiki.com/xwiki/bin/view/Main/WebHome', 'XWiki'],
['http://www.xwiki.org/xwiki/bin/view/Main/WebHome', 'XWiki'],
['https://confluence.atlassian.com/', 'Confluence'],
#['https://wiki.hybris.com/dashboard.action', 'Confluence'],
['https://confluence.sakaiproject.org/', 'Confluence'],
#['http://demo.bananadance.org/', 'Banana Dance'],
['http://wagn.org/', 'Wagn'],
['http://wiki.ace-mod.net/', 'Wagn'],
#['http://wiki.ace-mod.net/', 'Wagn'],
#['https://success.mindtouch.com/', 'MindTouch'],
#['https://jspwiki.apache.org/', 'JSPWiki'],
['http://www.ihear.com/FreeCLAS/', 'JSPWiki'],
['http://www.wikkawiki.org/HomePage', 'WikkaWiki'],
['http://puppylinux.org/wikka/', 'WikkaWiki'],
['http://cs.netsville.com/wiki/wikka.php', 'WikkaWiki'],
#['http://puppylinux.org/wikka/', 'WikkaWiki'],
['https://www.cybersphere.net/', 'MediaWiki'],
#['http://web.archive.org/web/20060717202033id_/http://www.comawiki.org/CoMa.php?CoMa=startseite', 'CoMaWiki'],
['http://bootbook.de/CoMa.php', 'CoMaWiki'],
#['http://wikini.net/wakka.php', 'WikiNi'],
['http://wiki.raydium.org/wiki/', 'WikiNi'],
['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
#['http://wiki.cs.cityu.edu.hk/CitiWiki/SourceCode', 'CitiWiki'],
#['http://wackowiki.sourceforge.net/test/', 'WackoWiki'],
['http://www.sw4me.com/wiki/', 'WackoWiki'],
['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
#['http://lslwiki.net/lslwiki/wakka.php', 'WakkaWiki'],
['http://kw.pm.org/wiki/index.cgi', 'Kwiki'],
['http://wiki.wubi.org/index.cgi', 'Kwiki'],
#['http://perl.bristolbath.org/index.cgi', 'Kwiki'],
['http://www.anwiki.com/', 'Anwiki'],
['http://www.anw.fr/', 'Anwiki'],
#['http://www.anwiki.com/', 'Anwiki'],
#['http://www.anw.fr/', 'Anwiki'],
['http://www.aneuch.org/', 'Aneuch'],
['http://doc.myunixhost.com/', 'Aneuch'],
['http://www.bitweaver.org/wiki/index.php', 'bitweaver'],
['http://wiki.e-shell.org/Home', 'Zwiki'],
['http://leo.zwiki.org/', 'Zwiki'],
['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
#['http://accessibility4all.wikispaces.com/', 'Wikispaces'],
['http://darksouls.wikidot.com/', 'Wikidot'],
['http://www.wikifoundrycentral.com/', 'Wetpaint'],
['http://wiki.openid.net/', 'PBworks'],
@@ -273,7 +273,7 @@ def test_mwGetAPIAndIndex(self):
print '\n', '#'*73, '\n', 'test_mwGetAPIAndIndex', '\n', '#'*73
tests = [
# Alone wikis
['https://www.archiveteam.org', 'https://www.archiveteam.org/api.php', 'https://www.archiveteam.org/index.php'],
['https://archiveteam.org', 'https://archiveteam.org/api.php', 'https://archiveteam.org/index.php'],
#['http://skilledtests.com/wiki/', 'http://skilledtests.com/wiki/api.php', 'http://skilledtests.com/wiki/index.php'],

# Editthis wikifarm
