"""Scrape the biography pages linked from the Bundestag member list."""

import json
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so results can differ between machines.
HTML_PARSER = "html.parser"
# Upper bound (seconds) for any single HTTP request; requests has no default
# timeout and would otherwise wait indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30
# Pause (seconds) after each biography page has been processed.
REQUEST_DELAY_SECONDS = 10


class Biography:
    """Plain container for the data collected about one member.

    Every constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        # Previously accepted but silently dropped.
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        return f"{type(self).__name__}(name={self.name!r})"


def main():
    """Fetch every biography linked from the member list.

    Returns:
        A list of ``Biography`` objects, one per link, in list order.
    """
    links, names = get_links_and_names()
    return [get_bio(link, name=name) for link, name in zip(links, names)]


def get_links_and_names():
    """Download the member list and collect its links and names.

    Anchors without an ``href`` are skipped so both lists stay aligned and
    contain only usable URLs. Relative links are resolved against
    ``BUNDESTAG_BASE_URL``; absolute links are returned unchanged.

    Returns:
        A ``(links, names)`` tuple. ``links`` is a list of URL strings and
        ``names`` a parallel list in which each entry is the anchor's
        ``title`` attribute, stripped and split on three consecutive
        newlines (``[""]`` when the anchor has no title).

    Raises:
        requests.RequestException: if the list cannot be downloaded.
    """
    response = requests.get(BUNDESTAG_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, HTML_PARSER)

    links = []
    names = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        links.append(urljoin(BUNDESTAG_BASE_URL, href))
        names.append((anchor.get("title") or "").strip().split("\n\n\n"))
    return (links, names)


def get_bio(url, name=None):
    """Download one biography page together with its speech and vote data.

    Prints progress and the scraped values, then sleeps for
    ``REQUEST_DELAY_SECONDS`` before returning.

    Args:
        url: Address of the biography page.
        name: Optional value stored as ``Biography.name``.

    Returns:
        A ``Biography`` with ``name``, ``cv``, ``speeches`` (the
        ``(titles, infos)`` tuple from ``parse_speech``) and ``votes``
        filled in. The remaining fields are ``None`` because this scraper
        does not collect them.

    Raises:
        requests.RequestException: if the biography page cannot be
            downloaded.
    """
    print(f"Getting {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, HTML_PARSER)

    cv_elem = soup.find(class_="m-biography__biography")
    cv = cv_elem.text.strip() if cv_elem is not None else None

    # The page carries lazily loaded sections; the one whose x-data mentions
    # "abstimmung" is treated as the vote list, any other as the speech list.
    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        if "abstimmung" in (div.get("x-data") or ""):
            vote_div = div
        else:
            speech_div = div

    # parse_speech returns (titles, infos); the names here follow that order.
    speeches = parse_speech(get_ajax(speech_div))
    speech_titles, speech_infos = speeches
    votes = parse_vote(get_ajax(vote_div))

    print(cv, speech_titles, speech_infos, votes)
    sleep(REQUEST_DELAY_SECONDS)

    return Biography(
        name=name,
        cv=cv,
        speeches=speeches,
        votes=votes,
        functions=None,
        additional_functions=None,
        mandate=None,
        disclosures=None,
    )


def parse_speech(page):
    """Extract speech data from a loaded speech section.

    Args:
        page: The ``requests`` response for the speech section, or ``None``.
            A response with an error status is falsy and is treated like
            ``None``.

    Returns:
        ``(titles, infos)``: ``titles`` holds every second text of the
        elements with class ``a-link__label`` that pass the hidden filter,
        ``infos`` the texts of elements with class
        ``m-biography__speechtitle``. ``(None, None)`` when there is no
        usable page.
    """
    if not page:
        return (None, None)
    soup = BeautifulSoup(page.content, HTML_PARSER)
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    # NOTE(review): title.get("class") is a list of class names, so this only
    # excludes elements having a class that is exactly "--hidden" -- confirm
    # whether a substring match was intended before touching the [::2] slice.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
    ][::2]
    return (titles, infos)


def parse_vote(page):
    """Extract the vote table from a loaded vote section.

    Args:
        page: The ``requests`` response for the vote section, or ``None``.
            A response with an error status is falsy and is treated like
            ``None``.

    Returns:
        A list with one entry per ``<tr>`` after the first, each entry being
        the list of stripped ``<td>`` texts of that row; ``None`` when there
        is no usable page.
    """
    if not page:
        return None
    soup = BeautifulSoup(page.content, HTML_PARSER)
    # The first row is skipped (presumably the table header).
    return [
        [col.text.strip() for col in row.find_all("td")]
        for row in soup.find_all("tr")[1:]
    ]


def get_ajax(elem):
    """Request the content a lazily loaded element points to.

    The element's ``x-data`` attribute is expected to look like
    ``someCall({...json...})``. The JSON argument must contain an
    ``endpoint`` path and may contain a ``filters`` mapping, which is sent
    as the query string.

    Args:
        elem: The element carrying ``x-data``, or ``None``.

    Returns:
        The ``requests`` response, or ``None`` when ``elem`` is ``None`` or
        has no parenthesised ``x-data`` payload.

    Raises:
        json.JSONDecodeError: if the payload is not valid JSON.
        KeyError: if the payload has no ``endpoint``.
        requests.RequestException: if the request itself fails.
    """
    if elem is None:
        return None
    expression = elem.get("x-data")
    if not expression:
        return None

    # Take the text between the first "(" and the last ")". str.lstrip and
    # str.rstrip remove *sets of characters*, not a prefix/suffix, so they
    # are not a reliable way to unwrap the call.
    _, _, remainder = expression.partition("(")
    payload, closing, _ = remainder.rpartition(")")
    if not closing:
        return None

    data = json.loads(payload)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data.get("filters") or {}
    # Let requests build the query string so every reserved character is
    # escaped, not just spaces and "#", and non-string values are accepted.
    return requests.get(url, params=filters, timeout=REQUEST_TIMEOUT_SECONDS)


if __name__ == "__main__":
    main()