Catch connection errors from linkinfo.
Not pretty, but it works. Fixes #118.
LordAro committed Jun 30, 2018
1 parent a1d3bb9 commit 0a9c370
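In essence, the change wraps the entire streamed fetch-and-parse sequence in a try block and converts requests' ConnectionError (DNS failure, refused connection, socket dropped mid-transfer) into the plugin's usual error result instead of letting it propagate. A minimal sketch of the pattern, with hypothetical stand-in names (fetch_title and the ('error', ...) tuples are simplifications, not the plugin's API):

import requests

def fetch_title(url):
    # Stand-in for scrape_html_title; tuples replace LinkInfoResult.
    try:
        # stream=True defers downloading the body, but the TCP/TLS
        # connection is opened here, so the request itself can raise.
        with requests.get(url, stream=True, timeout=5) as r:
            if r.status_code != requests.codes.ok:
                return ('error', 'HTTP request failed: {}'.format(r.status_code))
            # ... content-type checks, bounded read, HTML parsing ...
            return ('ok', r.headers.get('Content-Type', ''))
    except requests.exceptions.ConnectionError:
        # DNS errors, refused connections and resets all land here.
        return ('error', 'Connection error')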
Showing 1 changed file with 64 additions and 61 deletions.
csbot/plugins/linkinfo.py (125 changes: 64 additions & 61 deletions)
@@ -214,67 +214,70 @@ def scrape_html_title(self, url):
         make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
 
         # Let's see what's on the other end...
-        with closing(simple_http_get(url.geturl(), stream=True)) as r:
-            # Only bother with 200 OK
-            if r.status_code != requests.codes.ok:
-                return make_error('HTTP request failed: {}'
-                                  .format(r.status_code))
-            # Only process HTML-ish responses
-            if 'Content-Type' not in r.headers:
-                return make_error('No Content-Type header')
-            elif 'html' not in r.headers['Content-Type']:
-                return make_error('Content-Type not HTML-ish: {}'
-                                  .format(r.headers['Content-Type']))
-            # Don't try to process massive responses
-            if 'Content-Length' in r.headers:
-                max_size = int(self.config_get('max_response_size'))
-                if int(r.headers['Content-Length']) > max_size:
-                    return make_error('Content-Length too large: {} bytes, >{}'
-                                      .format(r.headers['Content-Length'],
-                                              self.config_get('max_response_size')))
-
-            # Get the correct parser
-            if 'charset=' in r.headers['content-type']:
-                # If present, HTTP Content-Type header charset takes precedence
-                parser = lxml.html.HTMLParser(
-                    encoding=r.headers['content-type'].rsplit('=', 1)[1])
-            else:
-                parser = lxml.html.html_parser
-
-            # In case Content-Length is absent on a massive file, get only a
-            # reasonable chunk instead. We don't just get the first chunk
-            # because chunk-encoded responses iterate over chunks rather than
-            # the size we request...
-            chunk = b''
-            for next_chunk in r.iter_content(self.config_get('max_response_size')):
-                chunk += next_chunk
-                if len(chunk) >= self.config_get('max_response_size'):
-                    break
-            # Try to trim chunk to a tag end to help the HTML parser out
-            try:
-                chunk = chunk[:chunk.rindex(b'>') + 1]
-            except ValueError:
-                pass
-
-            # Attempt to parse as an HTML document
-            html = lxml.etree.fromstring(chunk, parser)
-            if html is None:
-                return make_error('Response not usable as HTML')
-
-            # Attempt to get the <title> tag
-            title = html.findtext('.//title') or ''
-            # Normalise title whitespace
-            title = ' '.join(title.strip().split())
-
-            if not title:
-                return make_error('Missing or empty <title> tag')
-
-            # Build result
-            result = LinkInfoResult(url, title,
-                                    nsfw=url.netloc.endswith('.xxx'))
-            # See if the title is redundant, i.e. appears in the URL
-            result.is_redundant = self._filter_title_in_url(url, title)
-            return result
+        try:
+            with closing(simple_http_get(url.geturl(), stream=True)) as r:
+                # Only bother with 200 OK
+                if r.status_code != requests.codes.ok:
+                    return make_error('HTTP request failed: {}'
+                                      .format(r.status_code))
+                # Only process HTML-ish responses
+                if 'Content-Type' not in r.headers:
+                    return make_error('No Content-Type header')
+                elif 'html' not in r.headers['Content-Type']:
+                    return make_error('Content-Type not HTML-ish: {}'
+                                      .format(r.headers['Content-Type']))
+                # Don't try to process massive responses
+                if 'Content-Length' in r.headers:
+                    max_size = int(self.config_get('max_response_size'))
+                    if int(r.headers['Content-Length']) > max_size:
+                        return make_error('Content-Length too large: {} bytes, >{}'
+                                          .format(r.headers['Content-Length'],
+                                                  self.config_get('max_response_size')))
+
+                # Get the correct parser
+                if 'charset=' in r.headers['content-type']:
+                    # If present, HTTP Content-Type header charset takes precedence
+                    parser = lxml.html.HTMLParser(
+                        encoding=r.headers['content-type'].rsplit('=', 1)[1])
+                else:
+                    parser = lxml.html.html_parser
+
+                # In case Content-Length is absent on a massive file, get only a
+                # reasonable chunk instead. We don't just get the first chunk
+                # because chunk-encoded responses iterate over chunks rather than
+                # the size we request...
+                chunk = b''
+                for next_chunk in r.iter_content(self.config_get('max_response_size')):
+                    chunk += next_chunk
+                    if len(chunk) >= self.config_get('max_response_size'):
+                        break
+                # Try to trim chunk to a tag end to help the HTML parser out
+                try:
+                    chunk = chunk[:chunk.rindex(b'>') + 1]
+                except ValueError:
+                    pass
+
+                # Attempt to parse as an HTML document
+                html = lxml.etree.fromstring(chunk, parser)
+                if html is None:
+                    return make_error('Response not usable as HTML')
+
+                # Attempt to get the <title> tag
+                title = html.findtext('.//title') or ''
+                # Normalise title whitespace
+                title = ' '.join(title.strip().split())
+
+                if not title:
+                    return make_error('Missing or empty <title> tag')
+
+                # Build result
+                result = LinkInfoResult(url, title,
+                                        nsfw=url.netloc.endswith('.xxx'))
+                # See if the title is redundant, i.e. appears in the URL
+                result.is_redundant = self._filter_title_in_url(url, title)
+                return result
+        except requests.exceptions.ConnectionError:
+            return make_error('Connection error')
 
     def _filter_title_in_url(self, url, title):
         """See if *title* is represented in *url*.
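On the error path the caller still receives an ordinary LinkInfoResult rather than an exception, so downstream code needs no changes. Given the make_error partial defined at the top of the hunk, the new except clause is equivalent to the following (the URL is a placeholder):

# make_error = partial(LinkInfoResult, url.geturl(), is_error=True)
# so make_error('Connection error') builds:
result = LinkInfoResult('http://unreachable.example/', 'Connection error',
                        is_error=True)

One caveat from requests' exception hierarchy: ConnectTimeout subclasses ConnectionError and is caught here, but ReadTimeout subclasses only Timeout, so a server that accepts the connection and then stalls would still raise.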
