summary | refs | log | tree | commit | diff
path: root/src/scrape_lexicanum.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/scrape_lexicanum.py')
-rw-r--r-- src/scrape_lexicanum.py 27
1 files changed, 15 insertions, 12 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index a873f43..2252ba7 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -11,6 +11,10 @@ def fix_link(config, node, field):
node[field] = node[field].replace("/wiki", config["site"] + "/wiki")
node[field] = node[field].replace("/mediawiki", config["site"] + "/mediawiki")
+def decompose(node, field):
+ for unwanted in node.select(field):
+ unwanted.decompose()
+
def get_search_response(config, query):
try:
response = requests.get(config['site'] + '/mediawiki/api.php?action=opensearch&format=json&formatversion=2&search=' + query + '&namespace=0&limit=6&suggest=true', timeout=7)
@@ -52,12 +56,10 @@ def get_page_content(config, page_name):
parser_output = soup.find(id="mw-content-text").contents[0]
- for unwanted in parser_output.select("div.portal"):
- unwanted.decompose()
- for unwanted in parser_output.select("table.nottemplate"):
- unwanted.decompose()
- for unwanted in parser_output.select("table.Anpassen"):
- unwanted.decompose()
+ decompose(parser_output, "sup")
+ decompose(parser_output, "div.portal")
+ decompose(parser_output, "table.nottemplate")
+ decompose(parser_output, "table.Anpassen")
image = parser_output.find("a.image")
image_link = None
@@ -66,12 +68,13 @@ def get_page_content(config, page_name):
fix_link(config, image.contents[0], "src")
image_link = image.contents[0]["src"]
- for unwanted in parser_output.select("div.quotebox"):
- unwanted.decompose()
- for unwanted in parser_output.select("p > br"):
- unwanted.parent.decompose()
- for unwanted in parser_output.select("table"):
- unwanted.decompose()
+ decompose(parser_output, "div.quotebox")
+ decompose(parser_output, "br")
+ decompose(parser_output, "table")
+
+ for p in parser_output.find_all("p"):
+ if not p.get_text(strip=True):
+ p.decompose()
content = parser_output.find("p")
for a in content.findAll("a"):