From 7e2e39ef6cc4ca15d49afe9bbc48745710b64d6e Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Sat, 10 Jan 2026 01:26:58 -0500 Subject: Correctly decompose other incorrect images --- src/scrape_lexicanum.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'src/scrape_lexicanum.py') diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py index 2252ba7..d327811 100644 --- a/src/scrape_lexicanum.py +++ b/src/scrape_lexicanum.py @@ -3,6 +3,7 @@ from markdownify import markdownify from requests.exceptions import HTTPError import requests +import textwrap class WikiError(Exception): pass @@ -60,15 +61,19 @@ def get_page_content(config, page_name): decompose(parser_output, "div.portal") decompose(parser_output, "table.nottemplate") decompose(parser_output, "table.Anpassen") + decompose(parser_output, "div.quotebox") - image = parser_output.find("a.image") + image = parser_output.find("a", {"class": "image"}) image_link = None + if image: + if image["href"] == "/wiki/File:Targetdrone.gif": + image = parser_output.find_all("a", {"class": "image"})[1] + if image: if image.contents[0]: fix_link(config, image.contents[0], "src") image_link = image.contents[0]["src"] - decompose(parser_output, "div.quotebox") decompose(parser_output, "br") decompose(parser_output, "table") @@ -81,7 +86,7 @@ def get_page_content(config, page_name): fix_link(config, a, "href") header_md = markdownify(str(soup.find(id="firstHeading").contents[0])) - content_md = markdownify(str(content)) + content_md = textwrap.shorten(markdownify(str(content)), width=1000, placeholder="...") return header_md, content_md, image_link -- cgit v1.2.3