From 6aada2e83b13315416d7f40d4fefd7fd74799a93 Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Sun, 30 Jul 2017 16:13:11 +0100 Subject: [PATCH] Fix requests guessing the wrong charset sometimes --- dave/modules/title.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/dave/modules/title.py b/dave/modules/title.py index f5abad1..450f5e9 100644 --- a/dave/modules/title.py +++ a/dave/modules/title.py @@ -21,17 +21,22 @@ res = get(match, timeout=3, headers={'user-agent': 'irc bot (https://github.com/w4)'}) + # sometimes requests guesses the charset wrong + if res.encoding == 'ISO-8859-1' and not 'ISO-8859-1' in \ + res.headers.get('Content-Type', ''): + res.encoding = res.apparent_encoding + soup = BeautifulSoup(res.text, "html.parser") - title = soup.title + title = soup.title.string if title is not None: title = re.sub(r"(\r?\n|\r| )+", " ", - title.string.strip()) + title.strip()) title = title[:140] + (title[140:] and '...') dave.config.redis.setex("site:{}".format(match), 300, title) else: - title = dave.config.redis.get("site:{}".format(match)).decode('utf-8') + title = str(dave.config.redis.get("site:{}".format(match)), 'utf8') if title is not None: titles.append(assembleFormattedText(A.bold[title])) -- rgit 0.1.3