summary | refs | log | tree | commit | diff
path: root/src/scrape_lexicanum.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/scrape_lexicanum.py')
-rw-r--r-- src/scrape_lexicanum.py 11
1 files changed, 8 insertions, 3 deletions
diff --git a/src/scrape_lexicanum.py b/src/scrape_lexicanum.py
index 2252ba7..d327811 100644
--- a/src/scrape_lexicanum.py
+++ b/src/scrape_lexicanum.py
@@ -3,6 +3,7 @@ from markdownify import markdownify
from requests.exceptions import HTTPError
import requests
+import textwrap
class WikiError(Exception):
pass
@@ -60,15 +61,19 @@ def get_page_content(config, page_name):
decompose(parser_output, "div.portal")
decompose(parser_output, "table.nottemplate")
decompose(parser_output, "table.Anpassen")
+ decompose(parser_output, "div.quotebox")
- image = parser_output.find("a.image")
+ image = parser_output.find("a", {"class": "image"})
image_link = None
if image:
+ if image["href"] == "/wiki/File:Targetdrone.gif":
+ image = parser_output.find_all("a", {"class": "image"})[1]
+
+ if image:
if image.contents[0]:
fix_link(config, image.contents[0], "src")
image_link = image.contents[0]["src"]
- decompose(parser_output, "div.quotebox")
decompose(parser_output, "br")
decompose(parser_output, "table")
@@ -81,7 +86,7 @@ def get_page_content(config, page_name):
fix_link(config, a, "href")
header_md = markdownify(str(soup.find(id="firstHeading").contents[0]))
- content_md = markdownify(str(content))
+ content_md = textwrap.shorten(markdownify(str(content)), width=1000, placeholder="...")
return header_md, content_md, image_link