summaryrefslogtreecommitdiff
path: root/src/scrape_lexicanum.py
blob: 7920a0d2d76a3c8af7103c45f5632f2bcb0081af (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from bs4 import BeautifulSoup
from markdownify import markdownify
from requests.exceptions import HTTPError
from urllib.parse import urlparse

import requests
import textwrap

# Request headers passed to every requests.get() call in this module; the
# User-Agent value is a desktop Firefox identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) "
        "Gecko/20100101 Firefox/135.0"
    ),
}

class WikiError(Exception):
    """Error raised by this module when a wiki request or lookup fails.

    The message passed to the constructor is a short description of the
    failure (for example a timeout or an empty search result).
    """

def fix_link(config, node, field):
    """Turn a site-relative wiki link stored on ``node`` into an absolute URL.

    Args:
        config: Mapping that provides the wiki base URL under ``"site"``.
        node: Object exposing ``.get(field)`` and item assignment; the
            callers in this file pass BeautifulSoup tags.
        field: Name of the attribute to rewrite (e.g. ``"href"``, ``"src"``).

    The attribute is modified in place and nothing is returned. Values that
    do not start with ``/wiki`` or ``/mediawiki`` (including URLs that are
    already absolute) are left untouched, and a missing attribute is a no-op.
    """
    value = node.get(field)
    if not isinstance(value, str):
        # Attribute absent (or not a plain string): nothing to rewrite.
        return

    # Only prefix paths that are relative to the site root. A blanket
    # str.replace() would also hit "/wiki" inside an already-absolute URL
    # and splice the site into the middle of it.
    if value.startswith(("/wiki", "/mediawiki")):
        node[field] = config["site"] + value

def decompose(node, field):
    """Remove every element under ``node`` that matches the selector ``field``.

    Calls ``node.select(field)`` and then ``decompose()`` on each result, so
    ``node`` is modified in place. Returns nothing.
    """
    matches = node.select(field)
    for element in matches:
        element.decompose()

def get_search_response(config, api_root, query):
    """Search the wiki through its ``opensearch`` API and return the hits.

    Args:
        config: Mapping that provides the wiki base URL under ``"site"``.
        api_root: Path between the site and ``/api.php``.
        query: Search text. It is sent as a query-string parameter and
            URL-encoded by ``requests``.
            NOTE(review): assumes callers pass raw, not pre-encoded, text.

    Returns:
        The non-empty fourth element of the JSON reply (per the MediaWiki
        opensearch format, the list of matching page URLs).

    Raises:
        WikiError: On timeout, connection failure, an HTTP error status, a
            reply that is not the expected JSON array, or an empty result.
    """
    # Passing the values via ``params`` lets requests escape them, so a query
    # containing "&", "#" or "+" cannot truncate or alter the API call.
    params = {
        "action": "opensearch",
        "format": "json",
        "formatversion": 2,
        "search": query,
        "namespace": 0,
        "limit": 6,
        "suggest": "true",
    }

    try:
        response = requests.get(
            config["site"] + api_root + "/api.php",
            params=params,
            headers=headers,
            timeout=7,
        )
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    try:
        data = response.json()
    except ValueError as e:
        # JSON decoding errors from requests are ValueError subclasses.
        print(f"Failed to decode wiki response: {e}")
        raise WikiError("Bad wiki response.") from e

    # Anything other than an array with at least four entries (for example an
    # API error object) carries no result list at index 3.
    if not isinstance(data, list) or len(data) < 4:
        raise WikiError("Bad wiki response.")

    results = data[3]
    if not results:
        raise WikiError("No such wiki entry found.")

    return results

def _first_image_link(config, parser_output):
    """Return an absolute URL for the first usable page image, or ``None``."""
    anchors = parser_output.find_all("a", {"class": "image"})
    if not anchors:
        return None

    anchor = anchors[0]
    # When the first image link is this specific file, use the second image
    # link instead (if the page has one).
    if anchor.get("href") == "/wiki/File:Targetdrone.gif":
        if len(anchors) < 2:
            return None
        anchor = anchors[1]

    img = anchor.find("img")
    if img is None:
        return None

    if img.get("src"):
        fix_link(config, img, "src")
        src = img["src"]
        try:
            if urlparse(src).netloc:
                return src
        except ValueError:
            # Unparseable src: treat it like a host-less one and try data-src.
            pass

    # src is missing or has no host even after fix_link; fall back to
    # data-src (presumably where lazy-loaded images keep the real URL).
    if img.get("data-src"):
        fix_link(config, img, "data-src")
        return img["data-src"]

    return None


def _page_title(soup):
    """Return the page title as markdown, or ``""`` when none is found."""
    title_md = ""

    heading = soup.find(id="firstHeading")
    if heading is not None and heading.contents:
        title_md = markdownify(str(heading.contents[0]))

    if not title_md:
        fallback = soup.select_one(".mw-page-title-main")
        if fallback is not None:
            title_md = markdownify(str(fallback))

    return title_md


def get_page_content(config, page_name):
    """Fetch a wiki page and extract its title, lead paragraph and image.

    Args:
        config: Mapping that provides the wiki base URL under ``"site"``;
            used to make relative links absolute.
        page_name: Passed unchanged to ``requests.get``, so it must be the
            full URL of the page.

    Returns:
        A ``(header_md, content_md, image_link)`` tuple:
        ``header_md`` is the page title converted to markdown (``""`` if the
        page has no recognisable title); ``content_md`` is the first
        non-empty paragraph as markdown, shortened to at most 1000 characters
        (``""`` if there is no such paragraph); ``image_link`` is the URL of
        the first usable image, or ``None``.

    Raises:
        WikiError: On timeout, connection failure, an HTTP error status, or
            when the page has no ``.mw-parser-output`` element.
    """
    try:
        response = requests.get(page_name, headers=headers, timeout=7)
    except requests.exceptions.Timeout as e:
        raise WikiError("Request timed out.") from e
    except requests.exceptions.RequestException as e:
        print(f"Failed to complete request: {e}")
        raise WikiError("Error with wiki request.") from e

    try:
        response.raise_for_status()
    except HTTPError as e:
        print(f"HTTP error occured: {e}")
        raise WikiError("Bad HTTP response.") from e

    soup = BeautifulSoup(response.content, "html.parser")

    parser_output = soup.select_one(".mw-parser-output")
    if parser_output is None:
        raise WikiError("Unexpected wiki page layout.")

    # Removed before the image lookup so images inside these elements are
    # never picked.
    for selector in (
        "sup",
        "div.portal",
        "table.nottemplate",
        "table.Anpassen",
        "div.quotebox",
        "div.floatleft",
    ):
        decompose(parser_output, selector)

    # Must run before the remaining tables are dropped below: images that
    # live inside those tables are still eligible here.
    image_link = _first_image_link(config, parser_output)

    for selector in ("br", "table", "aside"):
        decompose(parser_output, selector)

    # Drop paragraphs with no visible text so the first remaining <p> is the
    # first one that actually says something.
    for p in parser_output.find_all("p"):
        if not p.get_text(strip=True):
            p.decompose()

    content_md = ""
    content = parser_output.find("p")
    if content is not None:
        # href=True skips anchors that carry no href attribute.
        for a in content.find_all("a", href=True):
            fix_link(config, a, "href")
        content_md = textwrap.shorten(
            markdownify(str(content)), width=1000, placeholder="..."
        )

    header_md = _page_title(soup)

    return header_md, content_md, image_link