"""Bundescrawler: scrapes the biography pages of the members of the German
Bundestag and dumps the collected data as JSON."""

import argparse
import json
import re
from os import makedirs
from os.path import commonprefix
from time import sleep

import requests
from bs4 import BeautifulSoup

BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """All information scraped from a single representative's page."""

    def __init__(
        self,
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.party = party
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        return f"""
        name: {self.name}
        party: {self.party}
        cv: {self.cv}
        speeches: {self.speeches}
        votes: {self.votes}
        functions: {self.functions}
        additional_functions: {self.additional_functions}
        mandate: {self.mandate}
        disclosures: {self.disclosures}
        """

    def to_dict(self):
        return {
            "name": self.name,
            "party": self.party,
            "cv": self.cv,
            "speeches": self.speeches,
            "votes": self.votes,
            "functions": self.functions,
            "additional_functions": self.additional_functions,
            "mandate": self.mandate,
            "disclosures": self.disclosures,
        }


def main():
    parser = argparse.ArgumentParser(
        prog="Bundescrawler",
        description=(
            "Crawls the pages of German representatives and saves the "
            "information in a git repository"
        ),
    )
    parser.add_argument("-o", "--out", required=True, help="output directory")
    parser.add_argument(
        "--debug", action="store_true", help="only crawl the first five entries"
    )
    args = parser.parse_args()
    makedirs(args.out, exist_ok=True)
    links, names = get_links_and_names()
    if args.debug:
        links = links[:5]
        names = names[:5]
    bios = [get_bio(link, name) for link, name in zip(links, names)]
    save_info(bios, args.out)


def save_info(bios, out):
    with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
        # ensure_ascii=False keeps umlauts readable in the output file.
        json.dump(
            [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
        )


def get_links_and_names():
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.find_all("a")]
    # Each anchor's text has the form "Lastname, Firstname\n\n\nParty".
    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
    return links, names


def get_bio(url, name):
    # `name` arrives as ["Lastname, Firstname", "Party"].
    name, party = name
    name = name.split(", ")  # -> [last_name, first_name]
    print(f"Getting {url} for {name[1]} {name[0]}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()

    # Speeches and votes are loaded via AJAX; the placeholder divs carry the
    # endpoint description in their x-data attribute.
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        if "abstimmung" in (div.get("x-data") or ""):
            vote_div = div
        else:
            speech_div = div
    speeches = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))

    # The first membership block holds the regular functions, an optional
    # second block the additional ones.
    function_divs = soup.find_all(class_="m-biography__membership")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )

    mandate = (
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()

    bio = Biography(
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    print(bio)
    sleep(1)  # be polite to the server between page fetches
    return bio


def get_functions(elem):
    """Group the children of a membership block into (heading, entries) pairs."""
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            # A new heading closes the previous group.
            if current_body:
                out.append((current_heading, current_body))
            current_heading = child.text.strip()
            current_body = []
            continue
        if not child.name:
            # Skip bare text nodes (whitespace between tags).
            continue
        current_body.append(child.text.strip())
    if current_body:
        out.append((current_heading, current_body))
    return out


def parse_speech(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
    # Each speech entry carries its visible label twice, so keep every other one.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in (title.get("class") or [])
    ][::2]
    return list(zip(titles, infos))


def parse_vote(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    rows = soup.find_all("tr")[1:]  # skip the header row
    parsed = []
    for row in rows:
        cols = row.find_all("td")
        parsed.append([col.text.strip() for col in cols])
    return parsed


def get_ajax(elem):
    """Fetch the AJAX endpoint described in an element's x-data attribute."""
    if not elem:
        return None
    # x-data has the form: dynamicTemplateOutput({...json...})
    inner = (
        elem.get("x-data")
        .removeprefix("dynamicTemplateOutput(")  # needs Python 3.9+
        .removesuffix(")")
    )
    data = json.loads(inner)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    # Minimal query-string encoding for the two characters that occur in
    # the filter values.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(url)


def common_suffix(strings):
    """Longest common suffix: reverse, take the common prefix, reverse back."""
    return commonprefix([s[::-1] for s in strings])[::-1]


if __name__ == "__main__":
    main()
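
# Example invocation (a sketch; the filename bundescrawler.py is an
# assumption, the script itself does not fix it):
#
#   python bundescrawler.py -o data --debug
#
# --debug limits the crawl to the first five representatives; the collected
# biographies end up in data/raw.json.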