# Recovered from a git patch: commit f041cdfb2e86f742ab0f1b470d43de8659995cfe
# ("Add basic scraping and markdown", Samuel Johnson,
# Fri, 9 Jan 2026 00:29:45 -0500), which creates src/scrape_lexicanum.py.
"""Search a MediaWiki site and turn a page's lead paragraph into Markdown.

Every failure a caller is expected to handle is reported as ``WikiError``
with a short human-readable message.
"""

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import HTTPError

# Seconds to wait on the wiki before a request is abandoned.
_REQUEST_TIMEOUT = 7


class WikiError(Exception):
    """Raised when a wiki search or page fetch cannot be completed."""


def _fetch(url, params=None):
    """GET *url* and return the response, mapping failures to ``WikiError``.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters; ``requests``
            percent-encodes the values.

    Raises:
        WikiError: On timeout, on any other transport failure, or when the
            server answers with an HTTP error status.
    """
    try:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    return response


def get_search_response(config, query):
    """Run an opensearch query against the wiki and return its matches.

    Args:
        config: Mapping whose ``'site'`` entry is the wiki's base URL.
        query: Raw (not pre-encoded) search text. It is sent as a proper
            query parameter so characters such as ``&`` or ``#`` cannot
            corrupt the request.

    Returns:
        The non-empty fourth element of the decoded JSON response
        (presumably the list of result URLs, since callers hand an entry
        to ``get_page_content`` -- verify against the opensearch format).

    Raises:
        WikiError: If the request fails, the body is not the expected JSON
            array, or the search produced no matches.
    """
    response = _fetch(
        config['site'] + '/mediawiki/api.php',
        params={
            'action': 'opensearch',
            'format': 'json',
            'formatversion': '2',
            'search': query,
            'namespace': '0',
            'limit': '6',
            'suggest': 'true',
        },
    )

    try:
        data = response.json()
    except ValueError as e:  # JSON decode errors derive from ValueError
        print(f"Failed to decode search response: {e}")
        raise WikiError("Bad wiki response.") from e

    # An API-level error comes back as a JSON object rather than the usual
    # array; indexing it with [3] would raise instead of reporting cleanly.
    if not isinstance(data, list) or len(data) < 4:
        raise WikiError("Unexpected wiki response.")

    if not data[3]:
        raise WikiError("No such wiki entry found.")

    return data[3]


def get_page_content(config, page_name):
    """Fetch a wiki page and return its title and first paragraph as Markdown.

    Args:
        config: Mapping whose ``'site'`` entry is the wiki's base URL; it is
            prefixed to root-relative links found in the paragraph.
        page_name: Full URL of the page to fetch (it is requested as-is).

    Returns:
        A ``(header_md, content_md)`` tuple of Markdown strings built from
        the ``firstHeading`` element and the first ``<p>`` inside
        ``mw-content-text``.

    Raises:
        WikiError: If the request fails or the page lacks the expected
            heading or paragraph elements.
    """
    response = _fetch(page_name)
    soup = BeautifulSoup(response.content, "html.parser")

    # Search the whole content container: its first child may be a bare
    # text node, which has no usable tag-search method.
    body = soup.find(id="mw-content-text")
    content = body.find("p") if body is not None else None
    if content is None:
        raise WikiError("No readable content on wiki page.")

    # Only anchors that actually carry an href, and only root-relative ones:
    # absolute, protocol-relative and fragment links must stay untouched.
    for a in content.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/") and not href.startswith("//"):
            a["href"] = config["site"] + href

    heading = soup.find(id="firstHeading")
    if heading is None or not heading.contents:
        raise WikiError("Wiki page has no heading.")

    header_md = markdownify(str(heading.contents[0]))
    content_md = markdownify(str(content))

    return header_md, content_md