Fix images: find the page image before stripping tables; fix wh40k URL and site selection

author: Samuel Johnson <[email protected]> 2026-01-09 14:24:08 -0500
committer: Samuel Johnson <[email protected]> 2026-01-09 14:24:08 -0500
commit: 1e983de7023b97a443b608c529bc1c9a424e5346 (patch)
tree: 7b2d3f3a212eeafbed6d039163254f39e8bab24a
parent: 669cb5a53d2a95905e6b556b4a2613359cde0288 (diff)
2 files changed, 22 insertions, 9 deletions
diff --git a/src/main.py b/src/main.py
index af53021..c773f4f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -25,13 +25,15 @@ async def explain(ctx, *args):
     if args[0].lower() == "whfb":
         args.pop(0)
         config["site"] = "https://whfb.lexicanum.com"
-    if args[0].lower() == "wh40k":
+    elif args[0].lower() == "wh40k":
         args.pop(0)
-        config["site"] = "https://wh40k.lexicanum/com"
-    if args[0].lower() == "aos":
+        config["site"] = "https://wh40k.lexicanum.com"
+    elif args[0].lower() == "aos":
         args.pop(0)
         config["site"] = "https://ageofsigmar.lexicanum.com"
-
+    else:
+        config["site"] = "https://ageofsigmar.lexicanum.com"
+    
     query = " ".join([x.replace('"', "") for x in args])
 
     try:
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 20bc0da..34e9726 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -52,12 +52,12 @@ def get_page_content(config, page_name):
     
     parser_output = soup.find(id="mw-content-text").contents[0]
 
-    for unwanted in parser_output.select("table"):
+    for unwanted in parser_output.select("div.portal"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.nottemplate"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.Anpassen"):
         unwanted.decompose()
-
-    content = parser_output.find("p")
-    for a in content.findAll("a"):
-        fix_link(config, a, "href")
 
     image = parser_output.find("a", {"class": "image"})
     image_link = None
@@ -66,6 +66,17 @@ def get_page_content(config, page_name):
             fix_link(config, image.contents[0], "src")
             image_link = image.contents[0]["src"]
 
+    for unwanted in parser_output.select("div.quotebox"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("p > br"):
+        unwanted.parent.decompose()
+    for unwanted in parser_output.select("table"):
+        unwanted.decompose()
+
+    content = parser_output.find("p")
+    for a in content.findAll("a"):
+        fix_link(config, a, "href")
+
     header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
     content_md = markdownify(str(content))
author	Samuel Johnson <[email protected]>	2026-01-09 14:24:08 -0500
committer	Samuel Johnson <[email protected]>	2026-01-09 14:24:08 -0500
commit	1e983de7023b97a443b608c529bc1c9a424e5346 (patch)
tree	7b2d3f3a212eeafbed6d039163254f39e8bab24a
parent	669cb5a53d2a95905e6b556b4a2613359cde0288 (diff)