# Bundescrawler/crawler.py

import requests
import re
import json
from bs4 import BeautifulSoup
from time import sleep
from os.path import commonprefix
from git import Repo
import argparse
from datetime import datetime

# AJAX listing requested by get_links_and_names(); every anchor in the
# response contributes one link and one name entry.
# NOTE(review): limit=9999 presumably asks for the complete list in a single
# response — confirm against the endpoint.
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
# Prefix that get_ajax() puts in front of the "endpoint" value of an x-data
# payload to build the request URL.
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    """Crawled profile data of a single representative.

    The class is a plain record: every constructor argument is stored
    unchanged under the attribute of the same name.
    """

    def __init__(
        self,
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        """Store the crawled values.

        Args:
            name: Sequence whose first item is the last name and whose
                second item is the first name (``__str__`` prints
                ``name[1]`` followed by ``name[0]``).
            party: Party label.
            cv: Biography text.
            speeches: Iterable of ``(title, info)`` pairs, or a falsy value.
            votes: Iterable of rows with at least three columns, or a
                falsy value.
            functions: Value produced by ``get_functions`` or ``None``.
            additional_functions: Value produced by ``get_functions`` or
                ``None``.
            mandate: Mandate information, stored as given.
            disclosures: Disclosure text.
        """
        self.name = name
        self.party = party
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        """Return a multi-line dump of every attribute, one per line."""
        txt = f"""
name: {self.name}
party: {self.party}
cv: {self.cv}
speeches: {self.speeches}
votes: {self.votes}
functions: {self.functions}
additional_functions: {self.additional_functions}
mandate: {self.mandate}
disclosures: {self.disclosures}
            """
        return txt

    def __str__(self):
        """Return the human-readable report for this representative.

        Speeches and votes are rendered as ``- ...`` bullet lines; the
        remaining attributes are interpolated as they are.
        """
        # Falsy entries are skipped so that a placeholder such as
        # ``(None, None)`` renders as "no speeches" instead of raising.
        speeches_str = "".join(
            f"\n- {speech[1]}: {speech[0]}"
            for speech in (self.speeches or ())
            if speech
        )
        votes_str = "".join(
            f"\n- {vote[0]}, {vote[1]}: {vote[2]}"
            for vote in (self.votes or ())
            if vote
        )
        txt = f"""
Name: {self.name[1]} {self.name[0]}

Partei: {self.party}

Biographie: {self.cv}

Reden: {speeches_str}

Abstimmungen: {votes_str}

Ämter im Bundestag: {self.functions}
additional_functions: {self.additional_functions}
mandate: {self.mandate}
disclosures: {self.disclosures}
            """
        return txt

    def to_dict(self):
        """Return all attributes as a dict keyed by attribute name."""
        return {
            "name": self.name,
            "party": self.party,
            "cv": self.cv,
            "speeches": self.speeches,
            "votes": self.votes,
            "functions": self.functions,
            "additional_functions": self.additional_functions,
            "mandate": self.mandate,
            "disclosures": self.disclosures,
        }


def main():
    """Crawl all representatives, write ``raw.json`` and push the result.

    Command line options:
        -o / --out: path of an existing git repository that receives the
            output (mandatory; a ``ValueError`` is raised without it).
        --debug: only crawl the first five entries of the listing.

    The repository is opened before any crawling starts. Afterwards all
    files are staged, committed with the current local timestamp as the
    message and pushed to the remote named ``origin``.
    """
    cli = argparse.ArgumentParser(
        description="Crawls the pages of german representatives and saves the information in a git repository",
        prog="Bundescrawler",
    )
    cli.add_argument("-o", "--out")
    cli.add_argument("--debug", action="store_true")
    options = cli.parse_args()

    out_dir = options.out
    if not out_dir:
        raise ValueError("must supply out directory")

    repository = Repo(out_dir)

    links, names = get_links_and_names()
    if options.debug:
        links, names = links[:5], names[:5]

    bios = []
    for link, name in zip(links, names):
        bios.append(get_bio(link, name))

    save_raw(bios, out_dir)

    repository.git.add("*")
    commit_message = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    repository.index.commit(commit_message)
    repository.remote(name="origin").push()


def save_individuals(bios, out):
    """Write one file per representative below ``<out>/Abgeordnete/``.

    Each file is placed in a sub-directory named after the upper-cased
    first character of ``rep.name[0]`` and is called
    ``<name[1]>_<name[0]>`` with spaces replaced by underscores. Missing
    directories are created.

    Args:
        bios: Iterable of objects exposing ``name`` (a two-item sequence)
            and a ``str()`` representation.
        out: Base output directory.
    """
    # Local import: the module itself only imports os.path.commonprefix.
    import os

    for rep in bios:
        first_letter = rep.name[0][0].upper()
        name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
        target_dir = os.path.join(out, "Abgeordnete", first_letter)
        os.makedirs(target_dir, exist_ok=True)
        # The file has to be opened for writing; UTF-8 is set explicitly
        # because ensure_ascii=False emits non-ASCII characters verbatim.
        target = os.path.join(target_dir, name_str)
        with open(target, "w", encoding="utf-8") as rep_file:
            # NOTE(review): this stores str(rep) as a single JSON string —
            # confirm that a JSON-quoted report is the intended format.
            json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)


def save_raw(bios, out):
    """Dump all biographies as one JSON array to ``<out>/raw.json``.

    Args:
        bios: Iterable of objects providing ``to_dict()``.
        out: Existing directory that receives ``raw.json``.
    """
    # ensure_ascii=False writes umlauts verbatim, so the encoding must not
    # depend on the platform default.
    with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
        json.dump(
            [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
        )


def get_links_and_names():
    """Fetch the representative listing and return its links and names.

    Returns:
        A ``(links, names)`` tuple of two equally long lists. ``links``
        holds the ``href`` of every anchor in the response (``None`` when
        an anchor has none); ``names`` holds the stripped anchor text split
        on three consecutive newlines.

    Raises:
        requests.RequestException: If the request times out or the server
            answers with an error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(BUNDESTAG_URL, timeout=30)
    # Fail loudly instead of returning empty lists parsed from an error page.
    response.raise_for_status()
    # Name the parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a")
    links = [a.get("href") for a in anchors]
    names = [a.text.strip().split("\n\n\n") for a in anchors]

    return (links, names)


def get_bio(url, name):
    """Download one biography page and turn it into a ``Biography``.

    Args:
        url: Address of the representative's biography page.
        name: Two-item sequence ``("Last, First", party)``.

    Returns:
        The populated ``Biography``. Sections that are absent from the page
        are stored as ``None``.

    Raises:
        requests.RequestException: If the page request times out or the
            server answers with an error status.
    """
    name, party = name
    name = name.split(", ")
    print(f"Getting {url} for {name[1]} {name[0]}")
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(response.content, "html.parser")
    cv = _stripped_text_or_none(soup.find(class_="m-biography__biography"))

    # Lazily loaded sections: the one whose x-data mentions "abstimmung"
    # holds the votes, any other one is treated as the speeches.
    speech_div = None
    vote_div = None
    for div in soup.find_all(class_="m-ajaxLoadedContent"):
        x_data = div.get("x-data")
        if not x_data:
            # Nothing to load from a container without an x-data payload.
            continue
        if "abstimmung" in x_data:
            vote_div = div
        else:
            speech_div = div
    speeches = parse_speech(get_ajax(speech_div))
    votes = parse_vote(get_ajax(vote_div))

    # First membership block -> functions, second -> additional functions.
    function_divs = soup.find_all(class_="m-biography__memberships")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )

    mandate_heading = soup.find(class_="m-biography__subHeading --mandate")
    mandate = (
        mandate_heading.text if mandate_heading is not None else None,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = _stripped_text_or_none(
        soup.find(class_="m-biography__infoDisclaimer")
    )

    bio = Biography(
        name,
        party,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )

    print(bio)
    # Pause between representatives to keep the request rate low.
    sleep(1)

    return bio


def _stripped_text_or_none(tag):
    """Return the stripped text of ``tag``, or ``None`` when it is missing."""
    if tag is None:
        return None
    return tag.text.strip()


def get_functions(elem):
    """Group the entries of a memberships element by their ``h3`` heading.

    Direct children without a tag name are ignored. Every ``h3`` child
    starts a new section; for any other child the stripped, non-empty texts
    of its own children are added to the current section, with the
    ``"\\n\\n\\n(Interner Link)"`` marker removed.

    Returns:
        A list of ``(heading, entries)`` tuples. A section that is still
        empty when the next heading appears is dropped, while the last
        section is always included (its heading is ``None`` if no ``h3``
        was seen).
    """
    sections = []
    heading, entries = None, []
    for node in elem.children:
        tag = node.name
        if not tag:
            continue
        if tag != "h3":
            for item in node.children:
                label = item.text.strip()
                if label != "":
                    entries.append(label.replace("\n\n\n(Interner Link)", ""))
            continue
        # A new heading closes the section collected so far.
        if entries:
            sections.append((heading, entries))
        heading, entries = node.text.strip(), []
    sections.append((heading, entries))
    return sections


def parse_speech(page):
    """Extract the speeches from a loaded speech section.

    Args:
        page: Response of the speech request, or a falsy value when no
            section could be loaded.

    Returns:
        A list of ``(title, info)`` tuples, or ``None`` when ``page`` is
        falsy.
    """
    if not page:
        # A single None (like parse_vote) instead of a (None, None) tuple:
        # the tuple is truthy and is then iterated as if it held speeches.
        return None
    # Name the parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
    # NOTE(review): only every second visible label is kept — presumably each
    # speech contributes two of them; confirm against the markup.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
    ][::2]
    return list(zip(titles, infos))


def parse_vote(page):
    """Extract the vote table from a loaded vote section.

    Args:
        page: Response of the vote request, or a falsy value when no
            section could be loaded.

    Returns:
        One list of stripped cell texts per table row after the first
        row, or ``None`` when ``page`` is falsy.
    """
    if not page:
        return None
    # Name the parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(page.content, "html.parser")
    # The first <tr> is skipped (treated as the header row).
    return [
        [col.text.strip() for col in row.find_all("td")]
        for row in soup.find_all("tr")[1:]
    ]


def get_ajax(elem):
    """Request the content a lazily loaded element points to.

    The element's ``x-data`` attribute is expected to look like
    ``dynamicTemplateOutput({...})`` where the JSON object provides an
    ``endpoint`` path and a ``filters`` mapping of string values.

    Args:
        elem: Element exposing ``get("x-data")``, or a falsy value.

    Returns:
        The response of the GET request, or ``None`` when ``elem`` is falsy
        or carries no ``x-data`` attribute.
    """
    if not elem:
        return None
    x_data = elem.get("x-data")
    if not x_data:
        return None
    data = json.loads(_unwrap_template_call(x_data))
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    # Only blanks and '#' are escaped here, which keeps any escapes already
    # present in the filter values untouched.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    # Without a timeout a stalled connection would block the crawl forever.
    return requests.get(url, timeout=30)


def _unwrap_template_call(x_data):
    """Return the argument text of a ``dynamicTemplateOutput(...)`` call."""
    prefix = "dynamicTemplateOutput("
    inner = x_data.strip()
    # str.lstrip(prefix) would strip a *set of characters*, not the prefix,
    # so the wrapper is removed by slicing instead.
    if inner.startswith(prefix):
        inner = inner[len(prefix):]
    return inner.rstrip(")")


def common_suffix(strings):
    """Return the longest ending shared by all items of ``strings``.

    Every item is reversed, the common prefix of the reversed items is
    computed, and that prefix is reversed back.
    """
    mirrored = []
    for text in strings:
        mirrored.append(text[::-1])
    shared = commonprefix(mirrored)
    return shared[::-1]


# Start the crawler only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()