diff --git a/csbot/plugins/linkinfo.py b/csbot/plugins/linkinfo.py index ba020b40..758bd576 100644 --- a/csbot/plugins/linkinfo.py +++ b/csbot/plugins/linkinfo.py @@ -214,67 +214,70 @@ def scrape_html_title(self, url): make_error = partial(LinkInfoResult, url.geturl(), is_error=True) # Let's see what's on the other end... - with closing(simple_http_get(url.geturl(), stream=True)) as r: - # Only bother with 200 OK - if r.status_code != requests.codes.ok: - return make_error('HTTP request failed: {}' - .format(r.status_code)) - # Only process HTML-ish responses - if 'Content-Type' not in r.headers: - return make_error('No Content-Type header') - elif 'html' not in r.headers['Content-Type']: - return make_error('Content-Type not HTML-ish: {}' - .format(r.headers['Content-Type'])) - # Don't try to process massive responses - if 'Content-Length' in r.headers: - max_size = int(self.config_get('max_response_size')) - if int(r.headers['Content-Length']) > max_size: - return make_error('Content-Length too large: {} bytes, >{}' - .format(r.headers['Content-Length'], - self.config_get('max_response_size'))) - - # Get the correct parser - if 'charset=' in r.headers['content-type']: - # If present, HTTP Content-Type header charset takes precedence - parser = lxml.html.HTMLParser( - encoding=r.headers['content-type'].rsplit('=', 1)[1]) - else: - parser = lxml.html.html_parser - - # In case Content-Length is absent on a massive file, get only a - # reasonable chunk instead. We don't just get the first chunk - # because chunk-encoded responses iterate over chunks rather than - # the size we request... 
- chunk = b'' - for next_chunk in r.iter_content(self.config_get('max_response_size')): - chunk += next_chunk - if len(chunk) >= self.config_get('max_response_size'): - break - # Try to trim chunk to a tag end to help the HTML parser out - try: - chunk = chunk[:chunk.rindex(b'>') + 1] - except ValueError: - pass - - # Attempt to parse as an HTML document - html = lxml.etree.fromstring(chunk, parser) - if html is None: - return make_error('Response not usable as HTML') - - # Attempt to get the <title> tag - title = html.findtext('.//title') or '' - # Normalise title whitespace - title = ' '.join(title.strip().split()) - - if not title: - return make_error('Missing or empty <title> tag') - - # Build result - result = LinkInfoResult(url, title, - nsfw=url.netloc.endswith('.xxx')) - # See if the title is redundant, i.e. appears in the URL - result.is_redundant = self._filter_title_in_url(url, title) - return result + try: + with closing(simple_http_get(url.geturl(), stream=True)) as r: + # Only bother with 200 OK + if r.status_code != requests.codes.ok: + return make_error('HTTP request failed: {}' + .format(r.status_code)) + # Only process HTML-ish responses + if 'Content-Type' not in r.headers: + return make_error('No Content-Type header') + elif 'html' not in r.headers['Content-Type']: + return make_error('Content-Type not HTML-ish: {}' + .format(r.headers['Content-Type'])) + # Don't try to process massive responses + if 'Content-Length' in r.headers: + max_size = int(self.config_get('max_response_size')) + if int(r.headers['Content-Length']) > max_size: + return make_error('Content-Length too large: {} bytes, >{}' + .format(r.headers['Content-Length'], + self.config_get('max_response_size'))) + + # Get the correct parser + if 'charset=' in r.headers['content-type']: + # If present, HTTP Content-Type header charset takes precedence + parser = lxml.html.HTMLParser( + encoding=r.headers['content-type'].rsplit('=', 1)[1]) + else: + parser = lxml.html.html_parser + + # In case 
Content-Length is absent on a massive file, get only a + # reasonable chunk instead. We don't just get the first chunk + # because chunk-encoded responses iterate over chunks rather than + # the size we request... + chunk = b'' + for next_chunk in r.iter_content(self.config_get('max_response_size')): + chunk += next_chunk + if len(chunk) >= self.config_get('max_response_size'): + break + # Try to trim chunk to a tag end to help the HTML parser out + try: + chunk = chunk[:chunk.rindex(b'>') + 1] + except ValueError: + pass + + # Attempt to parse as an HTML document + html = lxml.etree.fromstring(chunk, parser) + if html is None: + return make_error('Response not usable as HTML') + + # Attempt to get the <title> tag + title = html.findtext('.//title') or '' + # Normalise title whitespace + title = ' '.join(title.strip().split()) + + if not title: + return make_error('Missing or empty <title> tag') + + # Build result + result = LinkInfoResult(url, title, + nsfw=url.netloc.endswith('.xxx')) + # See if the title is redundant, i.e. appears in the URL + result.is_redundant = self._filter_title_in_url(url, title) + return result + except requests.exceptions.ConnectionError: + return make_error('Connection error') def _filter_title_in_url(self, url, title): """See if *title* is represented in *url*.