# Bundescrawler/crawler.py

import requests
import json
from bs4 import BeautifulSoup
from time import sleep

# AJAX endpoint requested by get_links_and_names() for the list of member
# biographies. NOTE(review): limit=9999 is presumably meant to return every
# entry in a single response -- confirm against the site.
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
# Site root; get_ajax() prefixes it to the "endpoint" value it reads.
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """Plain record holding the data collected for one member.

    Every constructor argument is stored unchanged on the attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        # Previously accepted but silently dropped; keep it like every other
        # field so the value passed in is actually retrievable.
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures


def main():
    """Fetch the member list and scrape every linked biography in order.

    The names returned alongside the links and the collected per-biography
    results are not used any further inside this function.
    """
    links, names = get_links_and_names()
    bios = []
    for link in links:
        bios.append(get_bio(link))


def get_links_and_names():
    """Fetch the member list and return each entry's link and name parts.

    Returns:
        A ``(links, names)`` tuple of two equally long lists. ``links`` holds
        the ``href`` of each usable anchor in the response; ``names`` holds,
        for the same anchor, its stripped ``title`` split on three
        consecutive newlines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(BUNDESTAG_URL, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    links = []
    names = []
    # Single pass keeps both lists index-aligned.
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        title = anchor.get("title")
        # An anchor lacking either attribute can be neither fetched nor
        # named; skipping it avoids calling .strip() on None.
        if href is None or title is None:
            continue
        links.append(href)
        names.append(title.strip().split("\n\n\n"))

    return (links, names)


def get_bio(url):
    """Scrape one biography page together with its speech and vote listings.

    Args:
        url: Address of the biography page. It is resolved against
            BUNDESTAG_BASE_URL, so both absolute URLs and site-relative
            hrefs are accepted.

    Returns:
        A dict with the keys ``"cv"`` (stripped CV text, or None when the
        page has no CV element), ``"speech_titles"``, ``"speech_infos"``
        (both None when no speech listing could be fetched) and ``"votes"``
        (None when no vote listing could be fetched).

    Raises:
        requests.RequestException: If fetching the biography page fails,
            times out, or returns an HTTP error status.
    """
    from urllib.parse import urljoin

    print(f"Getting {url}")
    # urljoin leaves absolute URLs untouched and completes relative ones.
    response = requests.get(urljoin(BUNDESTAG_BASE_URL, url), timeout=30)
    response.raise_for_status()
    # Explicit parser: avoids bs4 guessing one from the installed libraries.
    soup = BeautifulSoup(response.content, "html.parser")

    cv_node = soup.find(class_="m-biography__biography")
    cv = cv_node.text.strip() if cv_node is not None else None

    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        x_data = div.get("x-data")
        # A container without x-data carries no endpoint to load.
        if not x_data:
            continue
        if "abstimmung" in x_data:
            vote_div = div
        else:
            speech_div = div

    # parse_speech() returns (titles, infos) -- unpack in that order.
    speech_titles, speech_infos = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))
    print(cv, speech_infos, speech_titles, votes)
    # Pause between members to keep the request rate low.
    sleep(10)
    return {
        "cv": cv,
        "speech_titles": speech_titles,
        "speech_infos": speech_infos,
        "votes": votes,
    }


def parse_speech(page):
    """Extract speech titles and info lines from a fetched speech listing.

    Args:
        page: Object exposing the HTML in ``.content`` (get_ajax() returns a
            ``requests`` response), or a falsy value when nothing was
            fetched.

    Returns:
        ``(titles, infos)`` -- in that order -- where ``infos`` is the text
        of every ``m-biography__speechtitle`` element and ``titles`` is the
        text of every second retained ``a-link__label`` element.
        ``(None, None)`` when ``page`` is falsy.
    """
    if not page:
        return (None, None)
    # Explicit parser: avoids bs4 guessing one from the installed libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [node.text for node in soup.find_all(class_="m-biography__speechtitle")]
    labels = [
        label.text
        for label in soup.find_all(class_="a-link__label")
        # NOTE(review): bs4 exposes "class" as a list of tokens, so this
        # only drops elements having a class exactly equal to "--hidden",
        # not ones whose class merely contains it -- confirm intent.
        if "--hidden" not in label.get("class")
    ]
    # Keep every second label. NOTE(review): presumably each speech yields
    # two labels in the markup -- confirm against the live page.
    titles = labels[::2]
    return (titles, infos)


def parse_vote(page):
    """Turn a fetched vote table into a list of rows of cell texts.

    Args:
        page: Object exposing the HTML in ``.content`` (get_ajax() returns a
            ``requests`` response), or a falsy value when nothing was
            fetched.

    Returns:
        One list per ``<tr>`` after the first, each holding the stripped
        text of that row's ``<td>`` cells; None when ``page`` is falsy.
    """
    if not page:
        return None
    # Explicit parser: avoids bs4 guessing one from the installed libraries.
    soup = BeautifulSoup(page.content, "html.parser")
    # The first row is skipped (presumably the table header).
    body_rows = soup.find_all("tr")[1:]
    return [
        [cell.text.strip() for cell in row.find_all("td")]
        for row in body_rows
    ]


def get_ajax(elem):
    """Fetch the content that an AJAX placeholder element points to.

    The element's ``x-data`` attribute is expected to look like
    ``dynamicTemplateOutput({...})`` where the braces hold a JSON object
    with an ``"endpoint"`` path and, optionally, a ``"filters"`` mapping
    that is sent as the query string.

    Args:
        elem: Parsed element carrying the ``x-data`` attribute, or None.

    Returns:
        The ``requests`` response for the endpoint, or None when ``elem`` is
        missing or has no ``x-data`` attribute. HTTP error statuses are not
        raised here; the response is returned as-is for the caller to check.

    Raises:
        json.JSONDecodeError: If the ``x-data`` payload is not valid JSON.
        KeyError: If the payload has no ``"endpoint"`` entry.
    """
    if not elem:
        return None
    x_data = elem.get("x-data")
    if not x_data:
        return None

    # Remove the wrapper as an exact prefix/suffix. str.lstrip/rstrip take a
    # *set of characters*, which could eat into the payload itself.
    prefix = "dynamicTemplateOutput("
    payload = x_data.strip()
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    if payload.endswith(")"):
        payload = payload[:-1]

    data = json.loads(payload)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    # Let requests build the query string: it escapes every reserved
    # character (space -> "+", "#" -> "%23", "&", "=", ...) and accepts
    # non-string values, unlike hand-rolled replace() calls.
    return requests.get(url, params=data.get("filters") or None, timeout=30)


# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()