diff options
Diffstat (limited to 'src/scrape_lexicanum.py')
| -rw-r--r-- | src/scrape_lexicanum.py | 11 |
1 file changed, 8 insertions, 3 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py index 2252ba7..d327811 100644 --- a/src/scrape_lexicanum.py +++ b/src/scrape_lexicanum.py @@ -3,6 +3,7 @@ from markdownify import markdownify from requests.exceptions import HTTPError import requests +import textwrap class WikiError(Exception): pass @@ -60,15 +61,19 @@ def get_page_content(config, page_name): decompose(parser_output, "div.portal") decompose(parser_output, "table.nottemplate") decompose(parser_output, "table.Anpassen") + decompose(parser_output, "div.quotebox") - image = parser_output.find("a.image") + image = parser_output.find("a", {"class": "image"}) image_link = None if image: + if image["href"] == "/wiki/File:Targetdrone.gif": + image = parser_output.find_all("a", {"class": "image"})[1] + + if image: if image.contents[0]: fix_link(config, image.contents[0], "src") image_link = image.contents[0]["src"] - decompose(parser_output, "div.quotebox") decompose(parser_output, "br") decompose(parser_output, "table") @@ -81,7 +86,7 @@ def get_page_content(config, page_name): fix_link(config, a, "href") header_md = markdownify(str(soup.find(id="firstHeading").contents[0])) - content_md = markdownify(str(content)) + content_md = textwrap.shorten(markdownify(str(content)), width=1000, placeholder="...") return header_md, content_md, image_link |
