diff options
Diffstat (limited to 'src/scrape_lexicanum.py')
| -rw-r--r-- | src/scrape_lexicanum.py | 58 |
1 files changed, 58 insertions, 0 deletions
from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import HTTPError

import requests

# Seconds to wait for the wiki before giving up on a single request.
_REQUEST_TIMEOUT = 7


class WikiError(Exception):
    """Raised when a wiki lookup fails.

    The message is a short description of what went wrong.
    """


def _fetch(url, params=None):
    """GET *url* and return the response, turning any failure into WikiError.

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters; requests
            URL-encodes these itself.

    Returns:
        The ``requests.Response`` for a successful (non-4xx/5xx) request.

    Raises:
        WikiError: On timeout, on any other request failure, or on an HTTP
            error status.
    """
    try:
        response = requests.get(url, params=params, timeout=_REQUEST_TIMEOUT)
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    return response


def get_search_response(config, query):
    """Search the wiki's opensearch API for *query*.

    Args:
        config: Mapping with a ``'site'`` entry holding the wiki's base URL.
        query: Free-text search term; it is URL-encoded before sending.

    Returns:
        The fourth element of the opensearch JSON response. In the MediaWiki
        opensearch format this is the list of matching page URLs.

    Raises:
        WikiError: If the request fails, the response is not the expected
            JSON shape, or there are no matches.
    """
    # Passing the query through ``params`` lets requests escape characters
    # such as '&', '#' and '+' that would otherwise corrupt the URL.
    params = {
        "action": "opensearch",
        "format": "json",
        "formatversion": 2,
        "search": query,
        "namespace": 0,
        "limit": 6,
        "suggest": "true",
    }
    response = _fetch(config["site"] + "/mediawiki/api.php", params=params)

    try:
        data = response.json()
    except ValueError as e:  # body was not valid JSON
        raise WikiError("Error with wiki request.") from e

    try:
        results = data[3]
    except (IndexError, KeyError, TypeError) as e:
        # An API error payload is a dict (or a short list), not the
        # four-element opensearch array.
        raise WikiError("Unexpected wiki response.") from e

    if not results:
        raise WikiError("No such wiki entry found.")

    return results


def get_page_content(config, page_name):
    """Fetch a wiki page and return its title and first paragraph as Markdown.

    Args:
        config: Mapping with a ``'site'`` entry holding the wiki's base URL,
            used to turn site-relative ``/wiki...`` links into absolute ones.
        page_name: URL of the page to fetch (it is requested as-is).

    Returns:
        A ``(header_md, content_md)`` tuple: the Markdown of the first child
        of the ``firstHeading`` element and the Markdown of the first ``<p>``
        inside the ``mw-content-text`` element.

    Raises:
        WikiError: If the request fails or the page lacks the expected
            heading or paragraph elements.
    """
    response = _fetch(page_name)
    soup = BeautifulSoup(response.content, "html.parser")

    # Search the whole content container rather than its first child: the
    # first child may be a whitespace text node instead of the parser output.
    body = soup.find(id="mw-content-text")
    content = body.find("p") if body is not None else None
    heading = soup.find(id="firstHeading")
    if content is None or heading is None or not heading.contents:
        raise WikiError("Could not parse wiki page.")

    site = config["site"]
    # href=True skips anchors without an href (e.g. named anchors).
    for a in content.find_all("a", href=True):
        # Only prefix site-relative links; absolute URLs and later "/wiki"
        # substrings inside a path must be left untouched.
        if a["href"].startswith("/wiki"):
            a["href"] = site + a["href"]

    header_md = markdownify(str(heading.contents[0]))
    content_md = markdownify(str(content))

    return header_md, content_md
