summary | refs | log | tree | commit | diff
path: root/src/scrape_lexicanum.py
diff options
context:
space:
mode:
author: Samuel Johnson <[email protected]> 2026-01-09 13:52:51 -0500
committer: Samuel Johnson <[email protected]> 2026-01-09 13:52:51 -0500
commit: a41f159731b580852661bd8f222e730dc737e664 (patch)
tree: 9b28e6d107d518ca6e2dbeb0fab42477098eabfe /src/scrape_lexicanum.py
parent: 513acd702142808a1b3257f1cabbc775f4dfaaf7 (diff)
Remove annoying initial tables
Diffstat (limited to 'src/scrape_lexicanum.py')
-rw-r--r-- src/scrape_lexicanum.py 4
1 file changed, 4 insertions, 0 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 6853455..20bc0da 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -51,6 +51,10 @@ def get_page_content(config, page_name):
soup = BeautifulSoup(response.content, "html.parser")
parser_output = soup.find(id="mw-content-text").contents[0]
+
+ for unwanted in parser_output.select("table"):
+ unwanted.decompose()
+
content = parser_output.find("p")
for a in content.findAll("a"):
fix_link(config, a, "href")