🏡 index : ~doyle/dave.git

author Jordan Doyle <jordan@doyle.wf> 2017-07-30 15:13:11.0 +00:00:00
committer Jordan Doyle <jordan@doyle.wf> 2017-07-30 15:13:11.0 +00:00:00
commit
6aada2e83b13315416d7f40d4fefd7fd74799a93 [patch]
tree
b8bcea05732d8e22b6bbf9c380e13582afc2a323
parent
2c8ecc4ef2d5a465802d3367ca07fa50aa9cf126
download
6aada2e83b13315416d7f40d4fefd7fd74799a93.tar.gz

Fix requests guessing the wrong charset sometimes



Diff

 dave/modules/title.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dave/modules/title.py b/dave/modules/title.py
index f5abad1..450f5e9 100644
--- a/dave/modules/title.py
+++ b/dave/modules/title.py
@@ -21,17 +21,22 @@ def link_parse(bot, args, sender, source):
             res = get(match, timeout=3,
                       headers={'user-agent': 'irc bot (https://github.com/w4)'})
 
+            # sometimes requests guesses the charset wrong
+            if res.encoding == 'ISO-8859-1' and not 'ISO-8859-1' in \
+                    res.headers.get('Content-Type', ''):
+                res.encoding = res.apparent_encoding
+
             soup = BeautifulSoup(res.text, "html.parser")
-            title = soup.title
+            title = soup.title.string
 
             if title is not None:
                 title = re.sub(r"(\r?\n|\r| )+",
                                " ",
-                               title.string.strip())
+                               title.strip())
                 title = title[:140] + (title[140:] and '...')
                 dave.config.redis.setex("site:{}".format(match), 300, title)
         else:
-            title = dave.config.redis.get("site:{}".format(match)).decode('utf-8')
+            title = str(dave.config.redis.get("site:{}".format(match)), 'utf8')
 
         if title is not None:
             titles.append(assembleFormattedText(A.bold[title]))