benchmark: several post-review improvements

- download_sites: fix encoding unconditionally if it is missing
- download_sites: add base/href only if it is missing
- download_sites: remove scripts unconditionally
- benchmark: specify pre-existing splash instance with --splash-server HOST:PORT
scrapinghub · Mar 2, 2015 · 70128ec · kmike · Mar 2, 2015 · kmike
1 parent 7510550
commit 70128ec
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 17 deletions.
diff --git a/splash/benchmark/benchmark.py b/splash/benchmark/benchmark.py
@@ -87,6 +87,8 @@ def make_render_png_lua_req(splash, params):
                     help='Benchmark request count')
 parser.add_argument('--sites-dir', type=str, default='sites',
                     help='Directory with downloaded sites')
+parser.add_argument('--splash-server', metavar='HOST:PORT',
+                    help='Use existing Splash instance available at HOST:PORT')
 
 
 def generate_requests(splash, args):
@@ -139,16 +141,36 @@ def invoke_request(invoke_args):
             'height': kwargs['params']['height']}
 
 
+class ExistingSplashWrapper(object):
+    """Wrapper for pre-existing Splash instance.
+
+    ``main()`` builds this instead of ``SplashServer`` when the
+    ``--splash-server`` option is given, and then uses it in a ``with``
+    statement. The wrapper provides ``url()`` plus ``__enter__`` /
+    ``__exit__`` methods that do nothing beyond returning ``self``.
+    """
+    def __init__(self, server):
+        """Store ``server``, prepending 'http://' when that prefix is absent."""
+        self.server = server
+        # Only a literal 'http://' prefix is recognised: any other value
+        # (including one that starts with 'https://') gets 'http://' prepended.
+        if not self.server.startswith('http://'):
+            self.server = 'http://' + self.server
+
+    def url(self, endpoint):
+        """Return ``self.server + '/' + endpoint``; slashes are not normalised."""
+        return self.server + '/' + endpoint
+
+    def __enter__(self):
+        """Return the wrapper itself; nothing is started on entry."""
+        return self
+
+    def __exit__(self, *args):
+        """Do nothing on exit; the implicit ``None`` lets exceptions propagate."""
+        pass
+
+
 def main():
     log = logging.getLogger("benchmark")
     args = parser.parse_args()
     logging.basicConfig(level=logging.DEBUG)
 
-    splash = SplashServer(
-        logfile=SPLASH_LOG,
-        extra_args=['--disable-lua-sandbox',
-                    '--disable-xvfb',
-                    '--max-timeout=600'])
+    if args.splash_server:
+        splash = ExistingSplashWrapper(args.splash_server)
+    else:
+        splash = SplashServer(
+            logfile=SPLASH_LOG,
+            extra_args=['--disable-lua-sandbox',
+                        '--disable-xvfb',
+                        '--max-timeout=600'])
 
     with splash, serve_files(PORT, args.sites_dir):
         start_time = time()

diff --git a/splash/benchmark/download_sites.py b/splash/benchmark/download_sites.py
@@ -39,26 +39,33 @@
 
 
 def preprocess_main_page(sites_dir, url):
+    """
+    This function does several things:
+    - strip javascript so that downloaded pages look exactly the same
+    - add baseurl to resolve relative links properly (if it is missing)
+    - add meta charset description (if it is missing)
+    """
     out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
                                  splash_args=['--disable-lua-sandbox',
                                               '--disable-xvfb',
                                               '--max-timeout=600'],
                                  timeout=600.,))
     final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
-    if not w3lib.html.get_base_url(out['html']):
-        out['html'] = w3lib.html.remove_tags_with_content(
-            out['html'], ('script',))
-        root = html.fromstring(out['html'], parser=html.HTMLParser(),
-                               base_url=final_url)
-        try:
-            head = root.xpath('./head')[0]
-        except IndexError:
-            head = html.Element('head')
-            root.insert(0, head)
+    # Ensure there are no scripts to be executed.
+    out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
+    root = html.fromstring(out['html'], parser=html.HTMLParser(),
+                           base_url=final_url)
+    try:
+        head = root.xpath('./head')[0]
+    except IndexError:
+        head = html.Element('head')
+        root.insert(0, head)
+    if not head.xpath('./base/@href'):
         head.insert(0, html.Element('base', {'href': final_url}))
+    if not head.xpath('./meta/@charset'):
         head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
-        out['html'] = html.tostring(root, encoding='utf-8',
-                                    doctype='<!DOCTYPE html>')
+    out['html'] = html.tostring(root, encoding='utf-8',
+                                doctype='<!DOCTYPE html>')
     filename = re.sub(r'[^\w]+', '_', url) + '.html'
     with open(os.path.join(sites_dir, filename), 'w') as f:
         f.write(out['html'])