# Bundescrawler/crawler.py

import requests
import re
import json
from bs4 import BeautifulSoup
from time import sleep

# Listing endpoint downloaded by get_links_and_names(), which scans the
# response for <a> tags. NOTE(review): limit=9999 presumably requests every
# entry in a single response — confirm against the site.
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
# Prefix that get_ajax() puts in front of the "endpoint" path taken from an
# element's x-data attribute.
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """Plain record holding everything scraped for one biography page.

    The constructor stores each argument unchanged under the attribute of
    the same name:

    * ``name`` -- the member's name.
    * ``cv`` -- the biography text.
    * ``speeches`` -- the parsed speech data.
    * ``votes`` -- the parsed vote rows.
    * ``functions`` / ``additional_functions`` -- parsed membership
      sections, or ``None`` when the page has none.
    * ``mandate`` -- mandate information.
    * ``disclosures`` -- the disclosure text.
    """

    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        """Return ``Biography(field=value, ...)`` with fields in constructor order."""
        # vars() preserves the assignment order used in __init__.
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"{type(self).__name__}({fields})"


def main():
    """Crawl every listed biography page and return the parsed results.

    Returns:
        A list with one ``Biography`` per (link, name) pair reported by
        ``get_links_and_names()``, in listing order.
    """
    links, names = get_links_and_names()
    # Return the results instead of dropping them in an unused local so that
    # callers (or an interactive session) can actually use the crawl output.
    return [get_bio(link, name) for link, name in zip(links, names)]


def get_links_and_names(timeout=60):
    """Download the listing and collect each entry's link and name.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(links, names)`` tuple of equally long lists. ``links`` holds the
        ``href`` of each anchor; ``names`` holds the part of the anchor's
        ``title`` that precedes the first run of three newlines.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    response = requests.get(BUNDESTAG_URL, timeout=timeout)
    # Fail loudly on an error page instead of silently returning empty lists.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    links = []
    names = []
    # Single pass keeps links and names aligned even when an anchor is
    # skipped because it lacks one of the two attributes.
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        title = anchor.get("title")
        if not href or title is None:
            continue
        links.append(href)
        names.append(title.strip().split("\n\n\n")[0])

    return (links, names)


def _text_of(node, strip=True):
    """Return the text of a parsed node, or None when the node is missing."""
    if node is None:
        return None
    return node.text.strip() if strip else node.text


def get_bio(url, name, timeout=30):
    """Download one biography page and parse it into a ``Biography``.

    Args:
        url: Address of the biography page.
        name: Display name from the listing; it is split on ``", "`` and the
            resulting list is what gets stored on the ``Biography``.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A ``Biography``. Parts of the page that cannot be found are stored as
        ``None`` instead of aborting the whole crawl.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status for the biography page itself.
    """
    print(name)
    name = name.split(", ")
    # Expected form is "Last, First"; fall back to the raw value when the
    # title has no comma instead of raising IndexError.
    display_name = f"{name[1]} {name[0]}" if len(name) > 1 else name[0]
    print(f"Getting {url} for {display_name}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Explicit parser: avoids bs4 guessing based on installed libraries.
    soup = BeautifulSoup(response.content, "html.parser")
    cv = _text_of(soup.find(class_="m-biography__biography"))

    # Lazily loaded sections: the one whose x-data mentions "abstimmung" is
    # the vote list; any other one is treated as the speech list.
    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        if "abstimmung" in (div.get("x-data") or ""):
            vote_div = div
        else:
            speech_div = div

    # NOTE(review): parse_speech() returns (titles, infos) but the result is
    # unpacked in the opposite order here and then stored reversed again, so
    # the stored pair is (m-biography__speechtitle texts, a-link__label
    # texts). The order is kept unchanged for existing consumers — confirm
    # which naming is intended.
    speech_infos, speech_titles = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))

    function_divs = soup.find_all(class_="m-biography__membership")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )

    mandate = (
        _text_of(soup.find(class_="m-biography__subHeading --mandate"), strip=False),
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = _text_of(soup.find(class_="m-biography__infoDisclaimer"))

    print(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    # Pause between pages so consecutive calls do not hammer the server.
    sleep(10)

    return Biography(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )


def get_functions(elem):
    """Group the tag children of *elem* into ``(heading, entries)`` sections.

    Every ``h3`` child starts a new section whose heading is the stripped
    ``h3`` text; the stripped text of each following tag child is added to
    that section's entries. Children without a tag name (such as bare text
    nodes) are ignored. Tag children that appear before the first ``h3`` are
    collected under the heading ``None``.

    Args:
        elem: Object exposing ``children``; each child needs ``name`` and
            ``text`` attributes.

    Returns:
        A list of ``(heading, entries)`` tuples in document order. The list
        is empty when *elem* has no heading and no tag children.
    """
    sections = []
    heading = None
    entries = []
    for child in elem.children:
        if not child.name:
            continue
        if child.name == "h3":
            # Close the running section, but only if it collected anything.
            if entries:
                sections.append((heading, entries))
            heading = child.text.strip()
            entries = []
        else:
            entries.append(child.text.strip())
    # Do not emit a placeholder (None, []) section for an empty element.
    if heading is not None or entries:
        sections.append((heading, entries))
    return sections


def parse_speech(page):
    """Extract speech titles and infos from a fetched speech-list response.

    Args:
        page: Response object exposing ``content``, or a falsy value when
            nothing could be fetched (``None``; note that
            ``requests.Response`` objects are also falsy for 4xx/5xx status
            codes).

    Returns:
        ``(titles, infos)``. ``titles`` holds the text of every second
        ``a-link__label`` element that does not carry the ``--hidden``
        class; ``infos`` holds the text of every
        ``m-biography__speechtitle`` element. ``(None, None)`` is returned
        for a falsy *page*.
    """
    if not page:
        return (None, None)
    # Explicit parser: avoids bs4 guessing based on installed libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [node.text for node in soup.find_all(class_="m-biography__speechtitle")]
    visible_labels = [
        label.text
        for label in soup.find_all(class_="a-link__label")
        if "--hidden" not in (label.get("class") or [])
    ]
    # NOTE(review): only every second visible label is kept — presumably each
    # speech renders two visible link labels; confirm against the markup.
    titles = visible_labels[::2]
    return (titles, infos)


def parse_vote(page):
    """Extract the table rows from a fetched vote-list response.

    Args:
        page: Response object exposing ``content``, or a falsy value when
            nothing could be fetched (``None``; note that
            ``requests.Response`` objects are also falsy for 4xx/5xx status
            codes).

    Returns:
        A list with one entry per ``tr`` after the first; each entry is the
        list of stripped ``td`` texts of that row. ``None`` is returned for
        a falsy *page*.
    """
    if not page:
        return None
    # Explicit parser: avoids bs4 guessing based on installed libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    # The first <tr> is skipped (presumably the header row — TODO confirm).
    return [
        [cell.text.strip() for cell in row.find_all("td")]
        for row in soup.find_all("tr")[1:]
    ]


def _unwrap_x_data(raw):
    """Return the JSON argument inside a ``dynamicTemplateOutput(...)`` call."""
    prefix = "dynamicTemplateOutput("
    raw = raw.strip()
    # str.lstrip/rstrip remove *character sets*, not a prefix/suffix, so the
    # wrapper is cut off explicitly.
    if raw.startswith(prefix):
        raw = raw[len(prefix):]
    if raw.endswith(")"):
        raw = raw[:-1]
    return raw


def get_ajax(elem, timeout=30):
    """Fetch the content that a lazily loaded page element points to.

    The element's ``x-data`` attribute is expected to look like
    ``dynamicTemplateOutput({...})`` where the JSON object has an
    ``endpoint`` path and a ``filters`` mapping of query parameters.

    Args:
        elem: Element exposing ``get("x-data")``, or a falsy value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response for the endpoint, or ``None`` when *elem*
        is falsy or has no ``x-data`` attribute. The HTTP status is not
        checked here.

    Raises:
        json.JSONDecodeError: If the ``x-data`` payload is not valid JSON.
        KeyError: If the payload has no ``endpoint`` entry.
        requests.RequestException: On connection problems or timeouts.
    """
    if not elem:
        return None
    raw = elem.get("x-data")
    if not raw:
        return None
    data = json.loads(_unwrap_x_data(raw))
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    # Let requests build the query string: it percent-encodes every reserved
    # character (not just " " and "#") and copes with non-string values.
    return requests.get(url, params=data.get("filters"), timeout=timeout)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()