"""Crawler for the biography pages of German Bundestag members.

Fetches the full member list, then each member's biography page, and pulls
the CV text plus the AJAX-loaded speech and vote tables.
"""

import json
from time import sleep

import requests
from bs4 import BeautifulSoup

# AJAX endpoint returning the complete list of member biography links.
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """Container for all scraped data of a single member of parliament."""

    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        # BUG FIX: `speeches` was accepted but never stored on the instance.
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures


def main():
    """Fetch every member's biography page and print the extracted data."""
    links, names = get_links_and_names()
    bios = [get_bio(link) for link in links]
    # NOTE(review): `names` and `bios` are collected but not yet persisted;
    # wire them into Biography objects once storage is decided.


def get_links_and_names():
    """Return ``(links, names)`` for all member anchors on the list page.

    ``links`` is a list of hrefs, ``names`` the corresponding title strings
    split on blank-line separators.  Anchors missing an ``href`` or
    ``title`` attribute (e.g. navigation links) are skipped so both lists
    stay aligned.
    """
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = []
    names = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        title = anchor.get("title")
        # BUG FIX: Tag.get() returns None for absent attributes, which
        # previously produced None links and crashed on None.strip().
        if not href or not title:
            continue
        links.append(href)
        names.append(title.strip().split("\n\n\n"))
    return (links, names)


def get_bio(url):
    """Scrape one biography page: CV text, speech list and vote records.

    Returns a dict with keys ``"cv"``, ``"speeches"`` (a ``(titles, infos)``
    pair) and ``"votes"``; individual values are None when the page lacks
    the corresponding section.
    """
    # The member list yields site-relative hrefs; make them absolute.
    # NOTE(review): assumption based on the site layout — confirm against
    # the markup actually returned by the list endpoint.
    if url.startswith("/"):
        url = BUNDESTAG_BASE_URL + url
    print(f"Getting {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cv_elem = soup.find(class_="m-biography__biography")
    # BUG FIX: pages without a CV section previously raised AttributeError.
    cv = cv_elem.text.strip() if cv_elem else None
    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        # BUG FIX: x-data may be absent; `"..." in None` raised TypeError.
        if "abstimmung" in (div.get("x-data") or ""):
            vote_div = div
        else:
            speech_div = div
    # BUG FIX: parse_speech returns (titles, infos); the unpacking order
    # was swapped before, silently exchanging the two lists.
    speech_titles, speech_infos = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))
    print(cv, speech_infos, speech_titles, votes)
    sleep(10)  # be polite to the server: throttle between page fetches
    return {"cv": cv, "speeches": (speech_titles, speech_infos), "votes": votes}


def parse_speech(page):
    """Extract ``(titles, infos)`` from the AJAX speech-list response.

    Returns ``(None, None)`` when the member has no speech section.
    """
    if not page:
        return (None, None)
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    # BUG FIX: `class` is a LIST of class tokens and "--hidden" is a BEM
    # modifier *suffix* (e.g. "a-link__label--hidden"), so the old exact
    # list-membership test never excluded anything.  Check each token for
    # the modifier substring, and tolerate a missing class attribute.
    titles = [
        label.text
        for label in soup.find_all(class_="a-link__label")
        if not any("--hidden" in cls for cls in (label.get("class") or []))
    ][::2]  # NOTE(review): labels appear duplicated in the markup — confirm
    return (titles, infos)


def parse_vote(page):
    """Parse the AJAX vote-table response into a list of cell-text rows.

    Returns None when the member has no vote section.
    """
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    rows = soup.find_all("tr")[1:]  # first row is the table header
    return [[cell.text.strip() for cell in row.find_all("td")] for row in rows]


def get_ajax(elem):
    """Fetch the AJAX content described by an element's ``x-data`` attribute.

    The attribute holds ``dynamicTemplateOutput({...json...})``; the JSON
    carries the endpoint path and the query filters.  Returns the requests
    Response, or None when *elem* is None.
    """
    if not elem:
        return None
    raw = elem.get("x-data")
    # BUG FIX: lstrip()/rstrip() strip *character sets*, not literal
    # prefixes/suffixes, and could eat leading or trailing JSON characters.
    # Slice the wrapper off explicitly instead.
    prefix = "dynamicTemplateOutput("
    if raw.startswith(prefix):
        raw = raw[len(prefix):]
    if raw.endswith(")"):
        raw = raw[:-1]
    data = json.loads(raw)
    # Minimal query-string escaping: the site's filters only need spaces
    # ('+') and '#' ('%23') encoded.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in data["filters"].items()
    ]
    query = "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(BUNDESTAG_BASE_URL + data["endpoint"] + "?" + query)


if __name__ == "__main__":
    main()