-
Notifications
You must be signed in to change notification settings - Fork 512
Commit
- download_sites: fix encoding unconditionally if it is missing
- download_sites: add base/href only if it is missing
- download_sites: remove scripts unconditionally
- benchmark: specify pre-existing splash instance with --splash-server HOST:PORT
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,26 +39,33 @@ | |
|
||
|
||
def preprocess_main_page(sites_dir, url):
    """
    Fetch the rendered HTML for ``url``, clean it up and save it to disk.

    This function does several things:
    - strip javascript so that downloaded pages look exactly the same
    - add baseurl to resolve relative links properly (if it is missing)
    - add meta charset description (if it is missing)

    The result is written to ``sites_dir`` in a file named after ``url``
    with every run of non-word characters replaced by ``_`` and ``.html``
    appended.

    :param sites_dir: directory the resulting ``.html`` file is written to.
    :param url: address of the page to download; also used to derive the
        output file name.
    """
    # The script output is JSON; only its 'url' and 'html' keys are used here.
    out = json.loads(lua_runonce(SCRIPT_HTML, url=url,
                                 splash_args=['--disable-lua-sandbox',
                                              '--disable-xvfb',
                                              '--max-timeout=600'],
                                 timeout=600.,))
    # Drop query string and fragment from the reported URL before using it
    # as the document base.
    final_url = urlsplit(out['url'])._replace(query='', fragment='').geturl()
    # Ensure there are no scripts to be executed.
    out['html'] = w3lib.html.remove_tags_with_content(out['html'], ('script',))
    root = html.fromstring(out['html'], parser=html.HTMLParser(),
                           base_url=final_url)
    try:
        head = root.xpath('./head')[0]
    except IndexError:
        # No <head> in the document: create one so the tags below have a home.
        head = html.Element('head')
        root.insert(0, head)
    if not head.xpath('./base/@href'):
        head.insert(0, html.Element('base', {'href': final_url}))
    # NOTE(review): only <meta charset=...> is detected here. An existing
    # declaration of a different encoding is left untouched even though the
    # document is serialized as UTF-8 below -- confirm whether the charset
    # should be overwritten unconditionally.
    if not head.xpath('./meta/@charset'):
        head.insert(0, html.Element('meta', {'charset': 'utf-8'}))
    out['html'] = html.tostring(root, encoding='utf-8',
                                doctype='<!DOCTYPE html>')
    filename = re.sub(r'[^\w]+', '_', url) + '.html'
    # tostring() with an explicit encoding yields bytes, so the file must be
    # opened in binary mode (text mode would reject bytes on Python 3).
    with open(os.path.join(sites_dir, filename), 'wb') as f:
        f.write(out['html'])
|
This is interesting: Splash always returns the result in UTF-8 encoding, even if the page originally used some other encoding. Could you please check that WebKit fixes `<meta charset>` tags itself? If not, we should fix the meta charset unconditionally.