summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Samuel Johnson <[email protected]> 2026-01-09 14:24:08 -0500
committer: Samuel Johnson <[email protected]> 2026-01-09 14:24:08 -0500
commit: 1e983de7023b97a443b608c529bc1c9a424e5346 (patch)
tree: 7b2d3f3a212eeafbed6d039163254f39e8bab24a
parent: 669cb5a53d2a95905e6b556b4a2613359cde0288 (diff)
Fix images
-rw-r--r-- src/main.py | 10
-rw-r--r-- src/scrape_lexicanum.py | 21
2 files changed, 22 insertions, 9 deletions
diff --git a/src/main.py b/src/main.py
index af53021..c773f4f 100644
--- a/src/main.py
+++ b/src/main.py
@@ -25,13 +25,15 @@ async def explain(ctx, *args):
if args[0].lower() == "whfb":
args.pop(0)
config["site"] = "https://whfb.lexicanum.com"
- if args[0].lower() == "wh40k":
+ elif args[0].lower() == "wh40k":
args.pop(0)
- config["site"] = "https://wh40k.lexicanum/com"
- if args[0].lower() == "aos":
+ config["site"] = "https://wh40k.lexicanum.com"
+ elif args[0].lower() == "aos":
args.pop(0)
config["site"] = "https://ageofsigmar.lexicanum.com"
-
+ else:
+ config["site"] = "https://ageofsigmar.lexicanum.com"
+
query = " ".join([x.replace('"', "") for x in args])
try:
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 20bc0da..34e9726 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -52,12 +52,12 @@ def get_page_content(config, page_name):
parser_output = soup.find(id="mw-content-text").contents[0]
- for unwanted in parser_output.select("table"):
+ for unwanted in parser_output.select("div.portal"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("table.nottemplate"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("table.Anpassen"):
unwanted.decompose()
-
- content = parser_output.find("p")
- for a in content.findAll("a"):
- fix_link(config, a, "href")
image = parser_output.find("a", {"class": "image"})
image_link = None
@@ -66,6 +66,17 @@ def get_page_content(config, page_name):
fix_link(config, image.contents[0], "src")
image_link = image.contents[0]["src"]
+ for unwanted in parser_output.select("div.quotebox"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("p > br"):
+ unwanted.parent.decompose()
+ for unwanted in parser_output.select("table"):
+ unwanted.decompose()
+
+ content = parser_output.find("p")
+ for a in content.findAll("a"):
+ fix_link(config, a, "href")
+
header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
content_md = markdownify(str(content))