diff --git a/csbot/plugins/linkinfo.py b/csbot/plugins/linkinfo.py
index ba020b40..758bd576 100644
--- a/csbot/plugins/linkinfo.py
+++ b/csbot/plugins/linkinfo.py
@@ -214,67 +214,70 @@ def scrape_html_title(self, url):
make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
# Let's see what's on the other end...
- with closing(simple_http_get(url.geturl(), stream=True)) as r:
- # Only bother with 200 OK
- if r.status_code != requests.codes.ok:
- return make_error('HTTP request failed: {}'
- .format(r.status_code))
- # Only process HTML-ish responses
- if 'Content-Type' not in r.headers:
- return make_error('No Content-Type header')
- elif 'html' not in r.headers['Content-Type']:
- return make_error('Content-Type not HTML-ish: {}'
- .format(r.headers['Content-Type']))
- # Don't try to process massive responses
- if 'Content-Length' in r.headers:
- max_size = int(self.config_get('max_response_size'))
- if int(r.headers['Content-Length']) > max_size:
- return make_error('Content-Length too large: {} bytes, >{}'
- .format(r.headers['Content-Length'],
- self.config_get('max_response_size')))
-
- # Get the correct parser
- if 'charset=' in r.headers['content-type']:
- # If present, HTTP Content-Type header charset takes precedence
- parser = lxml.html.HTMLParser(
- encoding=r.headers['content-type'].rsplit('=', 1)[1])
- else:
- parser = lxml.html.html_parser
-
- # In case Content-Length is absent on a massive file, get only a
- # reasonable chunk instead. We don't just get the first chunk
- # because chunk-encoded responses iterate over chunks rather than
- # the size we request...
- chunk = b''
- for next_chunk in r.iter_content(self.config_get('max_response_size')):
- chunk += next_chunk
- if len(chunk) >= self.config_get('max_response_size'):
- break
- # Try to trim chunk to a tag end to help the HTML parser out
- try:
- chunk = chunk[:chunk.rindex(b'>') + 1]
- except ValueError:
- pass
-
- # Attempt to parse as an HTML document
- html = lxml.etree.fromstring(chunk, parser)
- if html is None:
- return make_error('Response not usable as HTML')
-
-            # Attempt to get the <title> tag
- title = html.findtext('.//title') or ''
- # Normalise title whitespace
- title = ' '.join(title.strip().split())
-
- if not title:
-                return make_error('Missing or empty <title> tag')
-
- # Build result
- result = LinkInfoResult(url, title,
- nsfw=url.netloc.endswith('.xxx'))
- # See if the title is redundant, i.e. appears in the URL
- result.is_redundant = self._filter_title_in_url(url, title)
- return result
+ try:
+ with closing(simple_http_get(url.geturl(), stream=True)) as r:
+ # Only bother with 200 OK
+ if r.status_code != requests.codes.ok:
+ return make_error('HTTP request failed: {}'
+ .format(r.status_code))
+ # Only process HTML-ish responses
+ if 'Content-Type' not in r.headers:
+ return make_error('No Content-Type header')
+ elif 'html' not in r.headers['Content-Type']:
+ return make_error('Content-Type not HTML-ish: {}'
+ .format(r.headers['Content-Type']))
+ # Don't try to process massive responses
+ if 'Content-Length' in r.headers:
+ max_size = int(self.config_get('max_response_size'))
+ if int(r.headers['Content-Length']) > max_size:
+ return make_error('Content-Length too large: {} bytes, >{}'
+ .format(r.headers['Content-Length'],
+ self.config_get('max_response_size')))
+
+ # Get the correct parser
+ if 'charset=' in r.headers['content-type']:
+ # If present, HTTP Content-Type header charset takes precedence
+ parser = lxml.html.HTMLParser(
+ encoding=r.headers['content-type'].rsplit('=', 1)[1])
+ else:
+ parser = lxml.html.html_parser
+
+ # In case Content-Length is absent on a massive file, get only a
+ # reasonable chunk instead. We don't just get the first chunk
+ # because chunk-encoded responses iterate over chunks rather than
+ # the size we request...
+ chunk = b''
+ for next_chunk in r.iter_content(self.config_get('max_response_size')):
+ chunk += next_chunk
+ if len(chunk) >= self.config_get('max_response_size'):
+ break
+ # Try to trim chunk to a tag end to help the HTML parser out
+ try:
+ chunk = chunk[:chunk.rindex(b'>') + 1]
+ except ValueError:
+ pass
+
+ # Attempt to parse as an HTML document
+ html = lxml.etree.fromstring(chunk, parser)
+ if html is None:
+ return make_error('Response not usable as HTML')
+
+                # Attempt to get the <title> tag
+ title = html.findtext('.//title') or ''
+ # Normalise title whitespace
+ title = ' '.join(title.strip().split())
+
+ if not title:
+                    return make_error('Missing or empty <title> tag')
+
+ # Build result
+ result = LinkInfoResult(url, title,
+ nsfw=url.netloc.endswith('.xxx'))
+ # See if the title is redundant, i.e. appears in the URL
+ result.is_redundant = self._filter_title_in_url(url, title)
+ return result
+ except requests.exceptions.ConnectionError:
+ return make_error('Connection error')
def _filter_title_in_url(self, url, title):
"""See if *title* is represented in *url*.