"""Bundescrawler: crawls the biography pages of German Bundestag
representatives and commits the scraped data to a git repository."""

import argparse
import json
import re
from datetime import datetime
from os.path import commonprefix
from time import sleep

import requests
from bs4 import BeautifulSoup
from git import Repo

BUNDESTAG_URL = (
    "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/"
    "1040594-1040594?limit=9999&view=BTBiographyList"
)
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """All data scraped from a single representative's biography page."""

    def __init__(
        self,
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.party = party
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        return f"""
        name: {self.name}
        party: {self.party}
        cv: {self.cv}
        speeches: {self.speeches}
        votes: {self.votes}
        functions: {self.functions}
        additional_functions: {self.additional_functions}
        mandate: {self.mandate}
        disclosures: {self.disclosures}
        """

    def __str__(self):
        if self.speeches:
            speeches_str = "".join(
                f"\n- {speech[1]}: {speech[0]}" for speech in self.speeches
            )
        else:
            speeches_str = ""

        if self.votes:
            votes_str = "".join(
                f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in self.votes
            )
        else:
            votes_str = ""

        if self.functions:
            # self.functions is a list of (heading, [entries]) tuples
            # produced by get_functions().
            func_str = "".join(
                f"\n- {heading}: {', '.join(body)}"
                for heading, body in self.functions
            )
        else:
            func_str = ""

        return f"""
        Name: {self.name[1]} {self.name[0]}
        Partei: {self.party}
        Biographie: {self.cv}
        Reden: {speeches_str}
        Abstimmungen: {votes_str}
        Ämter im Bundestag: {func_str}
        additional_functions: {self.additional_functions}
        mandate: {self.mandate}
        disclosures: {self.disclosures}
        """

    def to_dict(self):
        return {
            "name": self.name,
            "party": self.party,
            "cv": self.cv,
            "speeches": self.speeches,
            "votes": self.votes,
            "functions": self.functions,
            "additional_functions": self.additional_functions,
            "mandate": self.mandate,
            "disclosures": self.disclosures,
        }


def main():
    parser = argparse.ArgumentParser(
        prog="Bundescrawler",
        description=(
            "Crawls the pages of German representatives and saves the "
            "information in a git repository"
        ),
    )
    parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    if not args.out:
        raise ValueError("must supply out directory")

    repo = Repo(args.out)
    links, names = get_links_and_names()
    if args.debug:
        # In debug mode only crawl the first five representatives.
        links = links[:5]
        names = names[:5]
    bios = [get_bio(link, name) for link, name in zip(links, names)]
    save_raw(bios, args.out)

    # Commit the refreshed data with a timestamp and push it upstream.
    repo.git.add("*")
    repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    origin = repo.remote(name="origin")
    origin.push()


def save_individuals(bios, out):
    for rep in bios:
        first_letter = rep.name[0][0].upper()
        name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
        # Open for writing (the original opened read-only, which would fail
        # on json.dump) and store the human-readable rendering directly
        # instead of JSON-escaping it.
        with open(f"{out}/Abgeordnete/{first_letter}/{name_str}", "w") as rep_file:
            rep_file.write(str(rep))


def save_raw(bios, out):
    with open(f"{out}/raw.json", "w") as raw_file:
        json.dump(
            [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
        )


def get_links_and_names():
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.find_all("a")]
    # Each link text has the form "Lastname, Firstname\n\n\nParty".
    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
    return (links, names)


def get_bio(url, name):
    name, party = name
    name = name.split(", ")  # "Lastname, Firstname" -> ["Lastname", "Firstname"]
    print(f"Getting {url} for {name[1]} {name[0]}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()

    # Speeches and votes are loaded via AJAX; the endpoints are encoded in
    # the divs' x-data attributes.
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        if "abstimmung" in div.get("x-data", ""):
            vote_div = div
        else:
            speech_div = div
    speeches = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))

    # The first memberships block lists functions in the Bundestag; an
    # optional second block lists additional functions.
    function_divs = soup.find_all(class_="m-biography__memberships")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )

    mandate = (
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()

    bio = Biography(
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    sleep(1)  # rate-limit: at most one request per second
    return bio


def get_functions(elem):
    """Parse a memberships div into a list of (heading, [entries]) tuples."""
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            # A new heading starts a new section; flush the previous one.
            if current_body:
                out.append((current_heading, current_body))
            current_heading = child.text.strip()
            current_body = []
            continue
        if not child.name:
            # Skip bare strings between tags.
            continue
        current_body.extend(
            grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
            for grandchild in child.children
            if grandchild.text.strip() != ""
        )
    out.append((current_heading, current_body))
    return out


def parse_speech(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
    # Visible link labels appear twice per speech, so keep every second one.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in (title.get("class") or [])
    ][::2]
    return list(zip(titles, infos))


def parse_vote(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    rows = soup.find_all("tr")[1:]  # skip the table header row
    parsed = []
    for row in rows:
        cols = row.find_all("td")
        parsed.append([col.text.strip() for col in cols])
    return parsed


def get_ajax(elem):
    """Fetch the AJAX endpoint described by an element's x-data attribute."""
    if not elem:
        return None
    # x-data looks like dynamicTemplateOutput({...}); extract the JSON
    # payload. removeprefix/removesuffix (Python 3.9+) replace the original
    # lstrip/rstrip, which strip character *sets* and can eat the payload.
    inner = (
        elem.get("x-data")
        .removeprefix("dynamicTemplateOutput(")
        .removesuffix(")")
    )
    data = json.loads(inner)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    # URL-encode spaces and '#' in the filter values.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(url)


def common_suffix(strings):
    """Longest common suffix, computed as commonprefix of the reversed strings."""
    return commonprefix([s[::-1] for s in strings])[::-1]


if __name__ == "__main__":
    main()