"""Helpers for searching a MediaWiki site and extracting a page summary."""

from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import HTTPError
import requests

# Seconds to wait for the wiki server before giving up on a request.
_REQUEST_TIMEOUT = 7

# Site-relative path prefixes that get ``config["site"]`` prepended.
_SITE_RELATIVE_PREFIXES = ("/wiki", "/mediawiki")


class WikiError(Exception):
    """Raised for any failed wiki request or unusable wiki response."""


def fix_link(config, node, field):
    """Turn a site-relative link stored in ``node[field]`` into an absolute one.

    Only values that *start with* ``/wiki`` or ``/mediawiki`` are rewritten, by
    prepending ``config["site"]``.  Values that are already absolute (or that
    merely contain ``/wiki`` somewhere in the middle) are left untouched, and
    a missing or empty attribute is ignored instead of raising ``KeyError``.

    Args:
        config: Mapping that provides the base URL under the ``"site"`` key.
        node: Mapping-like object (e.g. a BeautifulSoup tag) holding the link.
        field: Name of the attribute to rewrite, such as ``"href"`` or ``"src"``.
    """
    value = node.get(field)
    if not isinstance(value, str) or not value:
        return
    if value.startswith(_SITE_RELATIVE_PREFIXES):
        node[field] = config["site"] + value


def decompose(node, field):
    """Remove every element below ``node`` that matches the CSS selector ``field``."""
    for unwanted in node.select(field):
        unwanted.decompose()


def _get(url, params=None):
    """Perform a GET request and translate every failure into ``WikiError``."""
    try:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    return response


def get_search_response(config, query):
    """Run an opensearch query against the wiki and return element 3 of the reply.

    Args:
        config: Mapping that provides the base URL under the ``"site"`` key.
        query: Search text; it is URL-encoded by ``requests``.

    Returns:
        The non-empty value found at index 3 of the decoded JSON response.

    Raises:
        WikiError: On timeout, connection failure, bad HTTP status, a response
            that cannot be decoded or indexed, or an empty result.
    """
    # Passing the query through ``params`` keeps characters such as ``&`` or
    # ``#`` from altering the request.
    params = {
        "action": "opensearch",
        "format": "json",
        "formatversion": "2",
        "search": query,
        "namespace": "0",
        "limit": "6",
        "suggest": "true",
    }
    response = _get(config["site"] + "/mediawiki/api.php", params=params)

    try:
        results = response.json()[3]
    except (ValueError, LookupError, TypeError) as e:
        # Invalid JSON, or a payload that has no index 3 (e.g. an error object).
        print(f"Unexpected search response: {e}")
        raise WikiError("Unexpected wiki response.") from e

    if not results:
        raise WikiError("No such wiki entry found.")
    return results


def _extract_image_link(config, parser_output):
    """Return the absolute ``src`` of the first ``a.image`` picture, or ``None``."""
    # ``find`` matches on tag name, so a CSS selector needs ``select_one``.
    image = parser_output.select_one("a.image")
    if image is None:
        return None

    img = image.find("img")
    if img is None or not img.get("src"):
        return None

    fix_link(config, img, "src")
    return img["src"]


def get_page_content(config, page_name):
    """Fetch a wiki page and return its heading, first paragraph and image.

    Args:
        config: Mapping that provides the base URL under the ``"site"`` key.
        page_name: URL of the page; it is passed unchanged to ``requests.get``.

    Returns:
        A ``(header_md, content_md, image_link)`` tuple: the first child of the
        ``firstHeading`` element and the first non-empty paragraph, both
        converted with ``markdownify``, plus the image URL or ``None``.

    Raises:
        WikiError: On timeout, connection failure, bad HTTP status, or when the
            page lacks the expected content, paragraph or heading elements.
    """
    response = _get(page_name)
    soup = BeautifulSoup(response.content, "html.parser")

    content_root = soup.find(id="mw-content-text")
    if content_root is None or not content_root.contents:
        raise WikiError("Could not parse wiki page.")
    parser_output = content_root.contents[0]

    for selector in ("sup", "div.portal", "table.nottemplate", "table.Anpassen"):
        decompose(parser_output, selector)

    # The image is looked up before the remaining tables are stripped, so a
    # picture that lives inside a table can still be found.
    image_link = _extract_image_link(config, parser_output)

    for selector in ("div.quotebox", "br", "table"):
        decompose(parser_output, selector)

    # Drop paragraphs that hold no text so the first one left has content.
    for paragraph in parser_output.find_all("p"):
        if not paragraph.get_text(strip=True):
            paragraph.decompose()

    content = parser_output.find("p")
    if content is None:
        raise WikiError("Could not parse wiki page.")
    for anchor in content.find_all("a"):
        fix_link(config, anchor, "href")

    heading = soup.find(id="firstHeading")
    if heading is None or not heading.contents:
        raise WikiError("Could not parse wiki page.")

    header_md = markdownify(str(heading.contents[0]))
    content_md = markdownify(str(content))
    return header_md, content_md, image_link