# Bundescrawler/crawler.py

import requests
import re
import json
from bs4 import BeautifulSoup
from time import sleep
from os.path import commonprefix
from os import makedirs
import argparse

# Scheme and host of the Bundestag website; get_ajax() prepends it to the
# endpoint paths it reads from the biography pages.
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
# Address fetched by get_links_and_names() to obtain the anchors of all
# biography pages.  NOTE(review): limit=9999 presumably asks for the whole
# list in a single response - confirm against the endpoint.
BUNDESTAG_URL = (
    BUNDESTAG_BASE_URL
    + "/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594"
    + "?limit=9999&view=BTBiographyList"
)


class Biography:
    """Plain record of everything collected for one representative.

    The constructor stores its nine arguments unchanged on attributes of
    the same name; nothing is validated or converted.
    """

    def __init__(
        self,
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        # Identity.
        self.name = name
        self.party = party
        # Free-text curriculum vitae.
        self.cv = cv
        # Parliamentary activity.
        self.speeches = speeches
        self.votes = votes
        # Memberships and offices.
        self.functions = functions
        self.additional_functions = additional_functions
        # Mandate details and published disclosures.
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        """Return a multi-line ``field: value`` listing of the record."""
        listing = "".join(
            f"{label}: {value}\n" for label, value in self.to_dict().items()
        )
        # The text starts with a newline and ends with twelve spaces of
        # padding after the last line.
        return "\n" + listing + " " * 12

    def to_dict(self):
        """Return the fields as a dict keyed by attribute name, in declaration order."""
        field_names = (
            "name",
            "party",
            "cv",
            "speeches",
            "votes",
            "functions",
            "additional_functions",
            "mandate",
            "disclosures",
        )
        return {field: getattr(self, field) for field in field_names}


def main():
    """Parse the command line, crawl the representatives and save the result.

    Command-line options:
        -o / --out: output directory (required). It is created when it does
            not exist yet; an existing directory is reused.
        --debug: only process the first five entries of the list.

    A missing or empty ``--out`` ends the program through argparse's usage
    error (exit status 2).

    Raises:
        FileExistsError: if ``--out`` names an existing path that is not a
            directory.
    """
    parser = argparse.ArgumentParser(
        prog="Bundescrawler",
        description="Crawls the pages of german representatives and saves the information in a git repository",
    )
    parser.add_argument(
        "-o",
        "--out",
        required=True,
        help="directory that receives raw.json (created if missing)",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="only crawl the first five representatives",
    )
    args = parser.parse_args()
    if not args.out:
        # ``--out ""`` satisfies ``required`` but is not a usable directory.
        parser.error("must supply out directory")

    # exist_ok=True accepts an already existing directory but still raises
    # when the path is, for example, a regular file.  Failing here avoids
    # discovering the problem only after the whole crawl has finished.
    makedirs(args.out, exist_ok=True)

    links, names = get_links_and_names()
    if args.debug:
        links = links[:5]
        names = names[:5]
    bios = [get_bio(link, name) for link, name in zip(links, names)]

    save_info(bios, args.out)


def save_info(bios, out):
    """Write all biographies as one JSON array to ``<out>/raw.json``.

    Args:
        bios: iterable of objects offering ``to_dict()``.
        out: directory that receives the file; an existing ``raw.json`` is
            overwritten.
    """
    target = "{}/raw.json".format(out)
    with open(target, mode="w") as handle:
        records = []
        for bio in bios:
            records.append(bio.to_dict())
        json.dump(records, handle, indent=2)


def get_links_and_names():
    """Fetch the representative list and return its links and names.

    Returns:
        A ``(links, names)`` tuple of equally long lists, one entry per
        anchor in the response. ``links[i]`` is the anchor's ``href``
        (``None`` when the attribute is missing) and ``names[i]`` is the
        anchor's stripped text split on triple newlines.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    # Without a timeout requests may wait forever on a stalled connection.
    response = requests.get(BUNDESTAG_URL, timeout=30)
    # Fail loudly instead of parsing an error page into an empty crawl.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a")
    links = [anchor.get("href") for anchor in anchors]
    names = [anchor.text.strip().split("\n\n\n") for anchor in anchors]

    return (links, names)


def _stripped_text(node):
    """Return the stripped text of a parsed element, or None if it is absent."""
    return node.text.strip() if node is not None else None


def get_bio(url, name):
    """Download one biography page and collect its data in a ``Biography``.

    Args:
        url: address of the biography page, handed to ``requests.get`` as is.
        name: two-element sequence ``(full_name, party)``. ``full_name`` is
            split on ``", "`` and stored as that list of parts.

    Returns:
        The populated ``Biography``. Page sections that cannot be found are
        stored as ``None`` rather than aborting the crawl, in line with how
        missing membership sections are treated.

    Raises:
        requests.HTTPError: if the page request answers with an error status.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    name, party = name
    name = name.split(", ")
    # The progress line shows the second part before the first; fall back
    # to the single part when the name contains no ", ".
    display_name = f"{name[1]} {name[0]}" if len(name) > 1 else name[0]
    print(f"Getting {url} for {display_name}")

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Explicit parser: keeps results independent of optional parser libraries.
    soup = BeautifulSoup(response.content, "html.parser")

    cv = _stripped_text(soup.find(class_="m-biography__biography"))

    # The lazily loaded sections are told apart by their x-data attribute:
    # the one mentioning "abstimmung" holds the votes, the other the speeches.
    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        x_data = div.get("x-data")
        if not x_data:
            # Nothing to query without the attribute.
            continue
        if "abstimmung" in x_data:
            vote_div = div
        else:
            speech_div = div
    speeches = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))

    function_divs = soup.find_all(class_="m-biography__membership")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )

    mandate_heading = soup.find(class_="m-biography__subHeading --mandate")
    mandate = (
        # The heading text is intentionally left unstripped, as before.
        mandate_heading.text if mandate_heading is not None else None,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = _stripped_text(soup.find(class_="m-biography__infoDisclaimer"))

    bio = Biography(
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )

    print(bio)
    # Pause between representatives so the server is not hammered.
    sleep(1)

    return bio


def get_functions(elem):
    """Group the children of ``elem`` under their preceding ``h3`` heading.

    Args:
        elem: object whose ``children`` yield nodes with ``name`` and
            ``text`` attributes.

    Returns:
        A list of ``(heading, entries)`` tuples. ``heading`` is the stripped
        ``h3`` text (``None`` for entries before the first heading) and
        ``entries`` the stripped texts of the named nodes that follow it.
        A group with no entries is dropped when another heading follows it;
        the group that is open at the end is always included.
    """
    groups = []
    heading, entries = None, []
    for node in elem.children:
        tag = node.name
        if not tag:
            # Nodes without a tag name carry no entry of their own.
            continue
        if tag != "h3":
            entries.append(node.text.strip())
            continue
        # A new heading closes the running group, unless that group is empty.
        if entries:
            groups.append((heading, entries))
        heading, entries = node.text.strip(), []
    groups.append((heading, entries))
    return groups


def parse_speech(page):
    """Extract ``(title, info)`` pairs from a speeches response.

    Args:
        page: response whose ``content`` holds the speeches markup, or a
            falsy value when nothing could be loaded.

    Returns:
        A list of ``(title, info)`` tuples, or ``(None, None)`` when ``page``
        is falsy.
    """
    if not page:
        # NOTE(review): this placeholder has a different shape than the list
        # returned below (and than parse_vote's None); kept unchanged so the
        # existing output does not change.
        return (None, None)
    # Explicit parser: keeps results independent of optional parser libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
    # Only every second non-hidden label is used as a title.
    # NOTE(review): presumably each speech renders two visible labels -
    # confirm against the live markup.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
    ][::2]
    return list(zip(titles, infos))


def parse_vote(page):
    """Extract the table rows of a votes response as lists of cell texts.

    Args:
        page: response whose ``content`` holds the votes markup, or a falsy
            value when nothing could be loaded.

    Returns:
        One list of stripped ``td`` texts per ``tr`` after the first one, or
        ``None`` when ``page`` is falsy.
    """
    if not page:
        return None
    # Explicit parser: keeps results independent of optional parser libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    # The first row is skipped (presumably the table header - TODO confirm).
    rows = soup.find_all("tr")[1:]
    return [[col.text.strip() for col in row.find_all("td")] for row in rows]


def get_ajax(elem):
    """Request the content that an element's ``x-data`` attribute points to.

    The attribute is expected to look like
    ``dynamicTemplateOutput({...})`` where the braces hold a JSON object
    with an ``endpoint`` path and a ``filters`` mapping. The filters are
    sent as the query string of a GET request to that endpoint on
    ``BUNDESTAG_BASE_URL``.

    Args:
        elem: parsed element carrying the ``x-data`` attribute, or a falsy
            value.

    Returns:
        The ``requests`` response, or ``None`` when ``elem`` is falsy or has
        no ``x-data`` attribute.

    Raises:
        json.JSONDecodeError: if the attribute does not contain valid JSON.
        KeyError: if the JSON lacks ``endpoint`` or ``filters``.
    """
    from urllib.parse import urlencode

    if not elem:
        return None
    x_data = elem.get("x-data")
    if not x_data:
        return None

    # Remove the wrapper call explicitly.  str.lstrip()/rstrip() strip a
    # *set of characters*, not a prefix or suffix, and would eat into the
    # payload whenever it starts or ends with one of those characters.
    prefix = "dynamicTemplateOutput("
    inner = x_data.strip()
    if inner.startswith(prefix):
        inner = inner[len(prefix):]
    if inner.endswith(")"):
        inner = inner[:-1]
    data = json.loads(inner)

    url = BUNDESTAG_BASE_URL + data["endpoint"]
    # urlencode escapes every reserved character (space -> "+", "#" ->
    # "%23", but also "&", "=" and non-ASCII text) and accepts non-string
    # values, unlike the two manual replacements it supersedes.
    query = urlencode(data["filters"], doseq=True)
    return requests.get(f"{url}?{query}", timeout=30)


def common_suffix(strings):
    """Return the longest ending shared by every item of ``strings``.

    Works by reversing each item, taking the common prefix of the reversed
    items and reversing that prefix back.
    """
    backwards = slice(None, None, -1)
    mirrored = [text[backwards] for text in strings]
    shared = commonprefix(mirrored)
    return shared[backwards]


# Start the crawler only when this file is executed as a script; importing
# the module does not trigger a crawl.
if __name__ == "__main__":
    main()