diff options
| author | Samuel Johnson <[email protected]> | 2026-01-09 13:52:51 -0500 |
|---|---|---|
| committer | Samuel Johnson <[email protected]> | 2026-01-09 13:52:51 -0500 |
| commit | a41f159731b580852661bd8f222e730dc737e664 (patch) | |
| tree | 9b28e6d107d518ca6e2dbeb0fab42477098eabfe /src/scrape_lexicanum.py | |
| parent | 513acd702142808a1b3257f1cabbc775f4dfaaf7 (diff) | |
Remove annoying initial tables
Diffstat (limited to 'src/scrape_lexicanum.py')
| -rw-r--r-- | src/scrape_lexicanum.py | 4 |
1 file changed, 4 insertions, 0 deletions
```diff
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 6853455..20bc0da 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -51,6 +51,10 @@ def get_page_content(config, page_name):
 
     soup = BeautifulSoup(response.content, "html.parser")
     parser_output = soup.find(id="mw-content-text").contents[0]
+
+    for unwanted in parser_output.select("table"):
+        unwanted.decompose()
+
     content = parser_output.find("p")
     for a in content.findAll("a"):
         fix_link(config, a, "href")
```
