From ec09b0e5dccdabf52f7c55a42527a29ff1016943 Mon Sep 17 00:00:00 2001
From: Samuel Johnson
Date: Fri, 9 Jan 2026 13:25:44 -0500
Subject: Allow images to be embedded from document

---
 src/scrape_lexicanum.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'src/scrape_lexicanum.py')

diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index a719fb1..88001a3 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -7,6 +7,9 @@ import requests
 class WikiError(Exception):
     pass
 
+def fix_link(config, node, field):
+    node[field] = node[field].replace("/wiki", config["site"] + "/wiki")
+
 def get_search_response(config, query):
     try:
         response = requests.get(config['site'] + '/mediawiki/api.php?action=opensearch&format=json&formatversion=2&search=' + query + '&namespace=0&limit=6&suggest=true', timeout=7)
@@ -49,10 +52,17 @@ def get_page_content(config, page_name):
     parser_output = soup.find(id="mw-content-text").contents[0]
     content = parser_output.find("p")
     for a in content.findAll("a"):
-        a["href"] = a["href"].replace("/wiki", config["site"] + "/wiki")
+        fix_link(config, a, "href")
+
+    image = parser_output.find("a", {"class": "image"})
+    image_link = None
+    if image:
+        if image.contents[0]:
+            fix_link(config, image.contents[0], "src")
+            image_link = image.contents[0]["src"]
 
     header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
     content_md = markdownify(str(content))
 
-    return header_md, content_md
+    return header_md, content_md, image_link
 
-- 
cgit v1.2.3