1 file changed, 16 insertions, 5 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 20bc0da..34e9726 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -52,12 +52,12 @@ def get_page_content(config, page_name):
     
     parser_output = soup.find(id="mw-content-text").contents[0]
 
-    for unwanted in parser_output.select("table"):
+    for unwanted in parser_output.select("div.portal"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.nottemplate"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("table.Anpassen"):
         unwanted.decompose()
-
-    content = parser_output.find("p")
-    for a in content.findAll("a"):
-        fix_link(config, a, "href")
 
     image = parser_output.find("a", {"class": "image"})
     image_link = None
@@ -66,6 +66,17 @@ def get_page_content(config, page_name):
             fix_link(config, image.contents[0], "src")
             image_link = image.contents[0]["src"]
 
+    for unwanted in parser_output.select("div.quotebox"):
+        unwanted.decompose()
+    for unwanted in parser_output.select("p > br"):
+        unwanted.parent.decompose()
+    for unwanted in parser_output.select("table"):
+        unwanted.decompose()
+
+    content = parser_output.find("p")
+    for a in content.findAll("a"):
+        fix_link(config, a, "href")
+
     header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
     content_md = markdownify(str(content))