summary | refs | log | tree | commit | diff
path: root/src/scrape_lexicanum.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/scrape_lexicanum.py')
-rw-r--r-- src/scrape_lexicanum.py | 21
1 files changed, 16 insertions, 5 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 20bc0da..34e9726 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -52,12 +52,12 @@ def get_page_content(config, page_name):
parser_output = soup.find(id="mw-content-text").contents[0]
- for unwanted in parser_output.select("table"):
+ for unwanted in parser_output.select("div.portal"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("table.nottemplate"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("table.Anpassen"):
unwanted.decompose()
-
- content = parser_output.find("p")
- for a in content.findAll("a"):
- fix_link(config, a, "href")
image = parser_output.find("a", {"class": "image"})
image_link = None
@@ -66,6 +66,17 @@ def get_page_content(config, page_name):
fix_link(config, image.contents[0], "src")
image_link = image.contents[0]["src"]
+ for unwanted in parser_output.select("div.quotebox"):
+ unwanted.decompose()
+ for unwanted in parser_output.select("p > br"):
+ unwanted.parent.decompose()
+ for unwanted in parser_output.select("table"):
+ unwanted.decompose()
+
+ content = parser_output.find("p")
+ for a in content.findAll("a"):
+ fix_link(config, a, "href")
+
header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
content_md = markdownify(str(content))