Skip to content

Commit

Permalink
benchmark: several post-review improvements
Browse files Browse the repository at this point in the history
- download_sites: fix encoding unconditionally if it is missing
- download_sites: add base/href only if it is missing
- download_sites: remove scripts unconditionally
- benchmark: specify pre-existing splash instance with --splash-server HOST:PORT
  • Loading branch information
immerrr committed Mar 2, 2015
1 parent 7510550 commit 70128ec
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 17 deletions.
32 changes: 27 additions & 5 deletions splash/benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ def make_render_png_lua_req(splash, params):
help='Benchmark request count')
parser.add_argument('--sites-dir', type=str, default='sites',
help='Directory with downloaded sites')
parser.add_argument('--splash-server', metavar='HOST:PORT',
help='Use existing Splash instance available at HOST:PORT')


def generate_requests(splash, args):
Expand Down Expand Up @@ -139,16 +141,36 @@ def invoke_request(invoke_args):
'height': kwargs['params']['height']}


class ExistingSplashWrapper(object):
    """Wrapper for pre-existing Splash instance.

    Implements the context-manager protocol as a no-op so that an
    already-running Splash can be used in a ``with`` statement in place
    of a server started by the benchmark itself.

    :param server: address of the running instance, either a bare
        ``HOST:PORT`` or a full ``http://`` / ``https://`` URL.
    """

    def __init__(self, server):
        # Drop trailing slashes so url() never produces "//" in the path.
        server = server.rstrip('/')
        # Only prepend the default scheme when none was given; an explicit
        # https:// address must be kept as-is rather than turned into
        # "http://https://...".
        if not server.startswith(('http://', 'https://')):
            server = 'http://' + server
        self.server = server

    def url(self, endpoint):
        """Return the absolute URL of ``endpoint`` on the wrapped server."""
        return self.server + '/' + endpoint.lstrip('/')

    def __enter__(self):
        # Nothing to start: the instance is managed outside this process.
        return self

    def __exit__(self, *args):
        """Leave the external instance running; exceptions propagate."""


def main():
log = logging.getLogger("benchmark")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG)

splash = SplashServer(
logfile=SPLASH_LOG,
extra_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'])
if args.splash_server:
splash = ExistingSplashWrapper(args.splash_server)
else:
splash = SplashServer(
logfile=SPLASH_LOG,
extra_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'])

with splash, serve_files(PORT, args.sites_dir):
start_time = time()
Expand Down
31 changes: 19 additions & 12 deletions splash/benchmark/download_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,33 @@


def preprocess_main_page(sites_dir, url):
"""
This function does several things:
- strip javascript so that downloaded pages look exactly the same
- add baseurl to resolve relative links properly (if it is missing)
- add meta charset description (if it is missing)

This comment has been minimized.

Copy link
@kmike

kmike Mar 2, 2015

Member

This is interesting... Splash always returns the result in UTF-8 encoding, even if the page originally used some other encoding. Could you please check whether webkit fixes <meta charset> tags itself? If not, we should fix the meta charset unconditionally.

This comment has been minimized.

Copy link
@kmike

kmike Mar 2, 2015

Member

By the way, is fixing the meta charset needed for httrack?

This comment has been minimized.

Copy link
@immerrr

immerrr Mar 2, 2015

Author Contributor

No. If a page has <meta charset="foobar"/>, webkit downloads it and converts it from the foobar encoding to UTF-16, and Splash then converts it to UTF-8; the <meta .../> element is left unchanged, though, so the next time a browser tries to show the page, it may display incorrect characters.

This seems like a bug in Splash itself, but you won't see it until you try to open the resulting HTML in a browser that interprets meta (which realistically may never happen in real life). In this benchmark suite, the benchmarked instance of Splash will be that browser, and it almost certainly will be run. During these runs the downloaded pages will be consistent with each other, but will differ from their online counterparts.

This comment has been minimized.

Copy link
@immerrr

immerrr Mar 2, 2015

Author Contributor

Also, my detection of meta charset is incomplete; another option, according to this SO question, is

<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
"""
out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
splash_args=['--disable-lua-sandbox',
'--disable-xvfb',
'--max-timeout=600'],
timeout=600.,))
final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
if not w3lib.html.get_base_url(out['html']):
out['html'] = w3lib.html.remove_tags_with_content(
out['html'], ('script',))
root = html.fromstring(out['html'], parser=html.HTMLParser(),
base_url=final_url)
try:
head = root.xpath('./head')[0]
except IndexError:
head = html.Element('head')
root.insert(0, head)
# Ensure there are no scripts to be executed.
out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
root = html.fromstring(out['html'], parser=html.HTMLParser(),
base_url=final_url)
try:
head = root.xpath('./head')[0]
except IndexError:
head = html.Element('head')
root.insert(0, head)
if not head.xpath('./base/@href'):
head.insert(0, html.Element('base', {'href': final_url}))
if not head.xpath('./meta/@charset'):
head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
out['html'] = html.tostring(root, encoding='utf-8',
doctype='<!DOCTYPE html>')
out['html'] = html.tostring(root, encoding='utf-8',
doctype='<!DOCTYPE html>')
filename = re.sub(r'[^\w]+', '_', url) + '.html'
with open(os.path.join(sites_dir, filename), 'w') as f:
f.write(out['html'])
Expand Down

0 comments on commit 70128ec

Please sign in to comment.