"""Helpers for searching a MediaWiki-style site and summarising one page.

``get_search_response`` queries the site's ``api.php`` opensearch endpoint and
``get_page_content`` downloads a page and extracts a title, the first
non-empty paragraph (as Markdown) and an image URL.
"""

import textwrap
from urllib.parse import quote, urlparse

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import HTTPError

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0"
}

# Seconds to wait for the wiki before giving up on a request.
_REQUEST_TIMEOUT = 7

# Only values that start with one of these site-root-relative prefixes are
# turned into absolute URLs by ``fix_link``.
_SITE_RELATIVE_PREFIXES = ("/wiki", "/mediawiki")

# When the first image link on a page points at this file, the second image
# link is used instead.
# NOTE(review): presumably a decorative/banner image - confirm.
_SKIPPED_FIRST_IMAGE_HREF = "/wiki/File:Targetdrone.gif"

# Elements removed before the image is looked up.
_PRE_IMAGE_SELECTORS = (
    "sup",
    "div.portal",
    "table.nottemplate",
    "table.Anpassen",
    "div.quotebox",
    "div.floatleft",
)

# Elements removed after the image is looked up, before the first paragraph
# is selected.
_POST_IMAGE_SELECTORS = ("br", "table", "aside")


class WikiError(Exception):
    """Raised for any failure while talking to the wiki or parsing its reply."""


def fix_link(config, node, field):
    """Make a site-relative link stored in ``node[field]`` absolute, in place.

    Args:
        config: Mapping with a ``"site"`` entry (the site's base URL, without
            a trailing slash).
        node: Element (or mapping) holding the attribute.
        field: Attribute name, e.g. ``"href"`` or ``"src"``.

    The value is only changed when it *starts* with ``/wiki`` or
    ``/mediawiki``. Values that are already absolute, and nodes that lack the
    attribute, are left untouched.
    """
    value = node.get(field)
    if not isinstance(value, str):
        # Attribute missing (e.g. an <a> without href): nothing to fix.
        return
    # Anchor the match at the start: a plain substring replace would also hit
    # "/wiki" inside absolute URLs (including hosts such as "//wiki.example")
    # and splice the site name into the middle of them.
    if value.startswith(_SITE_RELATIVE_PREFIXES):
        node[field] = config["site"] + value


def decompose(node, field):
    """Remove every descendant of ``node`` matching the CSS selector ``field``."""
    for unwanted in node.select(field):
        unwanted.decompose()


def _fetch(url):
    """GET ``url`` with the module headers and return the response.

    Raises:
        WikiError: On timeout, any other request failure, or an HTTP error
            status.
    """
    try:
        response = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    return response


def get_search_response(config, api_root, query):
    """Run an opensearch query against the wiki and return its result links.

    Args:
        config: Mapping with a ``"site"`` entry (base URL of the wiki).
        api_root: Path between the site root and ``/api.php``.
        query: Search text.

    Returns:
        The fourth element of the JSON response (for the opensearch action
        this is the list of result URLs).

    Raises:
        WikiError: On request failure, a malformed response, or no results.
    """
    # Escape characters such as "&", "#" and "=" so they cannot split or
    # truncate the query string. "%" and "+" are deliberately left alone so a
    # caller that already passes a URL-encoded query keeps working unchanged.
    search_term = quote(query, safe="%+")
    url = (
        config["site"]
        + api_root
        + "/api.php?action=opensearch&format=json&formatversion=2&search="
        + search_term
        + "&namespace=0&limit=6&suggest=true"
    )
    response = _fetch(url)

    try:
        data = response.json()
        links = data[3]
    except (ValueError, IndexError, KeyError, TypeError) as e:
        # Not JSON at all, or JSON of a different shape (e.g. an API error
        # object instead of the four-element opensearch array).
        raise WikiError("Unexpected response from wiki.") from e

    if not links:
        raise WikiError("No such wiki entry found.")
    return links


def _has_host(url):
    """Return True when ``url`` parses as a URL that includes a network location."""
    if not url:
        return False
    try:
        return bool(urlparse(url).netloc)
    except ValueError:
        return False


def _extract_image_link(config, parser_output):
    """Return the URL of the page's first usable image link, or None."""
    anchors = parser_output.find_all("a", {"class": "image"})
    if not anchors:
        return None

    anchor = anchors[0]
    if anchor.get("href") == _SKIPPED_FIRST_IMAGE_HREF:
        if len(anchors) < 2:
            # The skipped image is the only one on the page.
            return None
        anchor = anchors[1]

    img = anchor.find("img")
    if img is None:
        return None

    fix_link(config, img, "src")
    link = img.get("src")
    if not _has_host(link):
        # "src" is not an absolute URL; fall back to "data-src".
        # NOTE(review): presumably covers lazy-loaded images whose real URL
        # lives in data-src - confirm.
        fix_link(config, img, "data-src")
        link = img.get("data-src")
    return link


def _extract_title(soup):
    """Return the page title as Markdown, or an empty string if none is found."""
    title = ""
    heading = soup.find(id="firstHeading")
    if heading is not None and heading.contents:
        title = markdownify(str(heading.contents[0]))
    if not title:
        main_title = soup.select_one(".mw-page-title-main")
        if main_title is not None:
            title = markdownify(str(main_title))
    return title


def get_page_content(config, page_name):
    """Download a wiki page and extract its title, summary and image.

    Args:
        config: Mapping with a ``"site"`` entry, used to make relative links
            absolute.
        page_name: URL of the page to fetch (passed to the HTTP client as-is).

    Returns:
        A ``(header_md, content_md, image_link)`` tuple: the title as
        Markdown, the first non-empty paragraph as Markdown shortened to at
        most 1000 characters, and an image URL or ``None``.

    Raises:
        WikiError: On request failure, or when the page has no parser output
            or no non-empty paragraph.
    """
    response = _fetch(page_name)

    soup = BeautifulSoup(response.content, "html.parser")
    parser_output = soup.select_one(".mw-parser-output")
    if parser_output is None:
        raise WikiError("Could not find page content.")

    for selector in _PRE_IMAGE_SELECTORS:
        decompose(parser_output, selector)

    # Look for the image before tables/asides are stripped below: removing
    # them also removes any image links they contain.
    image_link = _extract_image_link(config, parser_output)

    for selector in _POST_IMAGE_SELECTORS:
        decompose(parser_output, selector)

    # Drop empty paragraphs so the first remaining <p> carries actual text.
    for paragraph in parser_output.find_all("p"):
        if not paragraph.get_text(strip=True):
            paragraph.decompose()

    content = parser_output.find("p")
    if content is None:
        raise WikiError("No readable content found on page.")

    for anchor in content.find_all("a"):
        fix_link(config, anchor, "href")

    header_md = _extract_title(soup)
    content_md = textwrap.shorten(
        markdownify(str(content)), width=1000, placeholder="..."
    )

    return header_md, content_md, image_link