diff options
| author | Samuel Johnson <[email protected]> | 2026-01-10 00:56:42 -0500 |
|---|---|---|
| committer | Samuel Johnson <[email protected]> | 2026-01-10 00:56:42 -0500 |
| commit | 7be755c22ce30ec842eb78226fbc27deb004610f (patch) | |
| tree | f719bbf069f277f4a3df688834ec87d8dbcd9f5c | |
| parent | 88feb9f5e9444c9f8f27d2b74fa1fa30876d7385 (diff) | |
Fix accidental decomposing of p tags
| -rw-r--r-- | src/scrape_lexicanum.py | 27 |
1 file changed, 15 insertions, 12 deletions
```diff
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index a873f43..2252ba7 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -11,6 +11,10 @@ def fix_link(config, node, field):
     node[field] = node[field].replace("/wiki", config["site"] + "/wiki")
     node[field] = node[field].replace("/mediawiki", config["site"] + "/mediawiki")
 
+def decompose(node, field):
+    for unwanted in node.select(field):
+        unwanted.decompose()
+
 def get_search_response(config, query):
     try:
         response = requests.get(config['site'] + '/mediawiki/api.php?action=opensearch&format=json&formatversion=2&search=' + query + '&namespace=0&limit=6&suggest=true', timeout=7)
@@ -52,12 +56,10 @@ def get_page_content(config, page_name):
 
     parser_output = soup.find(id="mw-content-text").contents[0]
 
-    for unwanted in parser_output.select("div.portal"):
-        unwanted.decompose()
-    for unwanted in parser_output.select("table.nottemplate"):
-        unwanted.decompose()
-    for unwanted in parser_output.select("table.Anpassen"):
-        unwanted.decompose()
+    decompose(parser_output, "sup")
+    decompose(parser_output, "div.portal")
+    decompose(parser_output, "table.nottemplate")
+    decompose(parser_output, "table.Anpassen")
 
     image = parser_output.find("a.image")
     image_link = None
@@ -66,12 +68,13 @@ def get_page_content(config, page_name):
         fix_link(config, image.contents[0], "src")
         image_link = image.contents[0]["src"]
 
-    for unwanted in parser_output.select("div.quotebox"):
-        unwanted.decompose()
-    for unwanted in parser_output.select("p > br"):
-        unwanted.parent.decompose()
-    for unwanted in parser_output.select("table"):
-        unwanted.decompose()
+    decompose(parser_output, "div.quotebox")
+    decompose(parser_output, "br")
+    decompose(parser_output, "table")
+
+    for p in parser_output.find_all("p"):
+        if not p.get_text(strip=True):
+            p.decompose()
 
     content = parser_output.find("p")
     for a in content.findAll("a"):
```
