Catch connection errors from linkinfo.
Not pretty, but it works. Fixes #118.
LordAro committed Jun 30, 2018
1 parent a1d3bb9 commit 0a9c370
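In essence, the change wraps the entire streamed fetch-and-parse sequence in a try block and converts requests' ConnectionError (DNS failure, refused connection, socket dropped mid-transfer) into the plugin's usual error result instead of letting it propagate. A minimal sketch of the pattern, with hypothetical stand-in names (fetch_title and the ('error', ...) tuples are simplifications, not the plugin's API):

import requests

def fetch_title(url):
    # Stand-in for scrape_html_title; tuples replace LinkInfoResult.
    try:
        # stream=True defers downloading the body, but the TCP/TLS
        # connection is opened here, so the request itself can raise.
        with requests.get(url, stream=True, timeout=5) as r:
            if r.status_code != requests.codes.ok:
                return ('error', 'HTTP request failed: {}'.format(r.status_code))
            # ... content-type checks, bounded read, HTML parsing ...
            return ('ok', r.headers.get('Content-Type', ''))
    except requests.exceptions.ConnectionError:
        # DNS errors, refused connections and resets all land here.
        return ('error', 'Connection error')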
Showing 1 changed file with 64 additions and 61 deletions.
csbot/plugins/linkinfo.py (125 changes: 64 additions & 61 deletions)
@@ -214,67 +214,70 @@ def scrape_html_title(self, url):
         make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
 
         # Let's see what's on the other end...
-        with closing(simple_http_get(url.geturl(), stream=True)) as r:
-            # Only bother with 200 OK
-            if r.status_code != requests.codes.ok:
-                return make_error('HTTP request failed: {}'
-                                  .format(r.status_code))
-            # Only process HTML-ish responses
-            if 'Content-Type' not in r.headers:
-                return make_error('No Content-Type header')
-            elif 'html' not in r.headers['Content-Type']:
-                return make_error('Content-Type not HTML-ish: {}'
-                                  .format(r.headers['Content-Type']))
-            # Don't try to process massive responses
-            if 'Content-Length' in r.headers:
-                max_size = int(self.config_get('max_response_size'))
-                if int(r.headers['Content-Length']) > max_size:
-                    return make_error('Content-Length too large: {} bytes, >{}'
-                                      .format(r.headers['Content-Length'],
-                                              self.config_get('max_response_size')))
-
-            # Get the correct parser
-            if 'charset=' in r.headers['content-type']:
-                # If present, HTTP Content-Type header charset takes precedence
-                parser = lxml.html.HTMLParser(
-                    encoding=r.headers['content-type'].rsplit('=', 1)[1])
-            else:
-                parser = lxml.html.html_parser
-
-            # In case Content-Length is absent on a massive file, get only a
-            # reasonable chunk instead. We don't just get the first chunk
-            # because chunk-encoded responses iterate over chunks rather than
-            # the size we request...
-            chunk = b''
-            for next_chunk in r.iter_content(self.config_get('max_response_size')):
-                chunk += next_chunk
-                if len(chunk) >= self.config_get('max_response_size'):
-                    break
-            # Try to trim chunk to a tag end to help the HTML parser out
-            try:
-                chunk = chunk[:chunk.rindex(b'>') + 1]
-            except ValueError:
-                pass
-
-            # Attempt to parse as an HTML document
-            html = lxml.etree.fromstring(chunk, parser)
-            if html is None:
-                return make_error('Response not usable as HTML')
-
-            # Attempt to get the <title> tag
-            title = html.findtext('.//title') or ''
-            # Normalise title whitespace
-            title = ' '.join(title.strip().split())
-
-            if not title:
-                return make_error('Missing or empty <title> tag')
-
-            # Build result
-            result = LinkInfoResult(url, title,
-                                    nsfw=url.netloc.endswith('.xxx'))
-            # See if the title is redundant, i.e. appears in the URL
-            result.is_redundant = self._filter_title_in_url(url, title)
-            return result
+        try:
+            with closing(simple_http_get(url.geturl(), stream=True)) as r:
+                # Only bother with 200 OK
+                if r.status_code != requests.codes.ok:
+                    return make_error('HTTP request failed: {}'
+                                      .format(r.status_code))
+                # Only process HTML-ish responses
+                if 'Content-Type' not in r.headers:
+                    return make_error('No Content-Type header')
+                elif 'html' not in r.headers['Content-Type']:
+                    return make_error('Content-Type not HTML-ish: {}'
+                                      .format(r.headers['Content-Type']))
+                # Don't try to process massive responses
+                if 'Content-Length' in r.headers:
+                    max_size = int(self.config_get('max_response_size'))
+                    if int(r.headers['Content-Length']) > max_size:
+                        return make_error('Content-Length too large: {} bytes, >{}'
+                                          .format(r.headers['Content-Length'],
+                                                  self.config_get('max_response_size')))
+
+                # Get the correct parser
+                if 'charset=' in r.headers['content-type']:
+                    # If present, HTTP Content-Type header charset takes precedence
+                    parser = lxml.html.HTMLParser(
+                        encoding=r.headers['content-type'].rsplit('=', 1)[1])
+                else:
+                    parser = lxml.html.html_parser
+
+                # In case Content-Length is absent on a massive file, get only a
+                # reasonable chunk instead. We don't just get the first chunk
+                # because chunk-encoded responses iterate over chunks rather than
+                # the size we request...
+                chunk = b''
+                for next_chunk in r.iter_content(self.config_get('max_response_size')):
+                    chunk += next_chunk
+                    if len(chunk) >= self.config_get('max_response_size'):
+                        break
+                # Try to trim chunk to a tag end to help the HTML parser out
+                try:
+                    chunk = chunk[:chunk.rindex(b'>') + 1]
+                except ValueError:
+                    pass
+
+                # Attempt to parse as an HTML document
+                html = lxml.etree.fromstring(chunk, parser)
+                if html is None:
+                    return make_error('Response not usable as HTML')
+
+                # Attempt to get the <title> tag
+                title = html.findtext('.//title') or ''
+                # Normalise title whitespace
+                title = ' '.join(title.strip().split())
+
+                if not title:
+                    return make_error('Missing or empty <title> tag')
+
+                # Build result
+                result = LinkInfoResult(url, title,
+                                        nsfw=url.netloc.endswith('.xxx'))
+                # See if the title is redundant, i.e. appears in the URL
+                result.is_redundant = self._filter_title_in_url(url, title)
+                return result
+        except requests.exceptions.ConnectionError:
+            return make_error('Connection error')
 
     def _filter_title_in_url(self, url, title):
         """See if *title* is represented in *url*.
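On the error path the caller still receives an ordinary LinkInfoResult rather than an exception, so downstream code needs no changes. Given the make_error partial defined at the top of the hunk, the new except clause is equivalent to the following (the URL is a placeholder):

# make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
# so make_error('Connection error') builds:
result = LinkInfoResult('http://unreachable.example/', 'Connection error',
                        is_error=True)

One caveat from requests' exception hierarchy: ConnectTimeout subclasses ConnectionError and is caught here, but ReadTimeout subclasses only Timeout, so a server that accepts the connection and then stalls would still raise.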
