some basic functionality
parent 197b85d8e6
commit f8b33e1d6b

2 changed files with 112 additions and 0 deletions
crawler.py (new file, +107)

@@ -0,0 +1,107 @@
import requests
import json
from bs4 import BeautifulSoup
from time import sleep

BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures


def main():
    links, names = get_links_and_names()
    # names and Biography are not wired up yet; get_bio currently only prints.
    bios = [get_bio(link) for link in links]


def get_links_and_names():
    # Fetch the full member list and collect the link and display name of
    # every anchor on the page.
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.find_all("a")]
    names = [(a.get("title") or "").strip().split("\n\n\n") for a in soup.find_all("a")]

    return (links, names)


def get_bio(url):
    print(f"Getting {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()
    # Each biography page embeds two lazily loaded panels: one for roll-call
    # votes ("abstimmung") and one for speeches.
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        if "abstimmung" in div.get("x-data", ""):
            vote_div = div
        else:
            speech_div = div
    speech = get_ajax(speech_div)
    speech_titles, speech_infos = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
    print(cv, speech_infos, speech_titles, votes)
    sleep(10)  # be polite to the server between requests


def parse_speech(page):
    if not page:
        return (None, None)
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    # Drop labels carrying a BEM "--hidden" modifier class, then keep every
    # other remaining label.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if not any("--hidden" in cls for cls in title.get("class", []))
    ][::2]
    return (titles, infos)


def parse_vote(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    # Skip the header row, then collect the cell texts of every vote row.
    rows = soup.find_all("tr")[1:]
    parsed = []
    for row in rows:
        cols = row.find_all("td")
        parsed.append([col.text.strip() for col in cols])
    return parsed


def get_ajax(elem):
    if not elem:
        return None
    # The x-data attribute wraps a JSON payload in a dynamicTemplateOutput()
    # call; removeprefix/removesuffix (unlike lstrip/rstrip, which strip
    # character sets) remove the exact wrapper and leave the JSON intact.
    inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")")
    data = json.loads(inner)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    # Escape the characters that actually occur in the filter values.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(url)


if __name__ == "__main__":
    main()
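For context, get_ajax assumes that each lazily loaded panel carries an x-data attribute wrapping a JSON payload in a dynamicTemplateOutput(...) call, with "endpoint" and "filters" keys. A minimal sketch of the decoding step, using a made-up payload (the endpoint and filter values here are illustrative, not taken from bundestag.de):

    import json

    # Hypothetical x-data value; real endpoints and filters come from the
    # site's markup and will differ.
    x_data = 'dynamicTemplateOutput({"endpoint": "/ajax/filterlist/de/example", "filters": {"limit": "5"}})'

    inner = x_data.removeprefix("dynamicTemplateOutput(").removesuffix(")")
    data = json.loads(inner)
    print(data["endpoint"])  # /ajax/filterlist/de/example
    print(data["filters"])   # {'limit': '5'}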
pyproject.toml (+5)

@@ -10,3 +10,8 @@ maintainers = [
]

description = "Crawls the website of the German parliament and tracks any changes in a separate repository."

dependencies = [
    "beautifulsoup4",
    "requests",
]
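With beautifulsoup4 and requests declared as dependencies, the list-fetching half of the crawler can be tried out on its own; a minimal sketch, assuming the packages are installed and crawler.py is on the import path:

    # Fetch the member list once; links and names are parallel lists built
    # from the anchors on the filterlist page.
    from crawler import get_links_and_names

    links, names = get_links_and_names()
    print(len(links), "biography links found")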