diff options
| author | Samuel Johnson <[email protected]> | 2026-01-09 14:24:08 -0500 |
|---|---|---|
| committer | Samuel Johnson <[email protected]> | 2026-01-09 14:24:08 -0500 |
| commit | 1e983de7023b97a443b608c529bc1c9a424e5346 (patch) | |
| tree | 7b2d3f3a212eeafbed6d039163254f39e8bab24a | |
| parent | 669cb5a53d2a95905e6b556b4a2613359cde0288 (diff) | |
Fix images
| -rw-r--r-- | src/main.py | 10 | ||||
| -rw-r--r-- | src/scrape_lexicanum.py | 21 |
2 files changed, 22 insertions, 9 deletions
```diff
diff --git a/src/main.py b/src/main.py
index af53021..c773f4f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -25,13 +25,15 @@ async def explain(ctx, *args):
     if args[0].lower() == "whfb":
         args.pop(0)
         config["site"] = "https://whfb.lexicanum.com"
-    if args[0].lower() == "wh40k":
+    elif args[0].lower() == "wh40k":
         args.pop(0)
-        config["site"] = "https://wh40k.lexicanum/com"
-    if args[0].lower() == "aos":
+        config["site"] = "https://wh40k.lexicanum.com"
+    elif args[0].lower() == "aos":
         args.pop(0)
         config["site"] = "https://ageofsigmar.lexicanum.com"
-
+    else:
+        config["site"] = "https://ageofsigmar.lexicanum.com"
+
     query = " ".join([x.replace('"', "") for x in args])
 
     try:
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 20bc0da..34e9726 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -52,12 +52,12 @@ def get_page_content(config, page_name):
 
     parser_output = soup.find(id="mw-content-text").contents[0]
 
-    for unwanted in parser_output.select("table"):
+    for unwanted in parser_output.select("div.portal"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.nottemplate"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.Anpassen"):
         unwanted.decompose()
-
-    content = parser_output.find("p")
-    for a in content.findAll("a"):
-        fix_link(config, a, "href")
 
     image = parser_output.find("a", {"class": "image"})
     image_link = None
@@ -66,6 +66,17 @@ def get_page_content(config, page_name):
         fix_link(config, image.contents[0], "src")
         image_link = image.contents[0]["src"]
 
+    for unwanted in parser_output.select("div.quotebox"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("p > br"):
+        unwanted.parent.decompose()
+    for unwanted in parser_output.select("table"):
+        unwanted.decompose()
+
+    content = parser_output.find("p")
+    for a in content.findAll("a"):
+        fix_link(config, a, "href")
+
     header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
     content_md = markdownify(str(content))
 
```
