commit 19cdfb486d
parent cb3186e00e
Author: Marco Lents
Date:   2025-11-14 11:28:39 +01:00

    save raw data as json


@@ -3,6 +3,9 @@ import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
+from os.path import commonprefix
+from os import makedirs
+import argparse
 
 BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
 BUNDESTAG_BASE_URL = "https://www.bundestag.de"
@@ -12,6 +15,7 @@ class Biography:
     def __init__(
         self,
         name,
+        party,
         cv,
         speeches,
         votes,
@@ -21,6 +25,7 @@ class Biography:
         disclosures,
     ):
         self.name = name
+        self.party = party
         self.cv = cv
         self.speeches = speeches
         self.votes = votes
@@ -29,22 +34,75 @@ class Biography:
         self.mandate = mandate
         self.disclosures = disclosures
 
     def __repr__(self):
         txt = f"""
         name: {self.name}
+        party: {self.party}
         cv: {self.cv}
         speeches: {self.speeches}
         votes: {self.votes}
         functions: {self.functions}
         additional_functions: {self.additional_functions}
         mandate: {self.mandate}
         disclosures: {self.disclosures}
         """
         return txt
 
+    def to_dict(self):
+        return {
+            "name": self.name,
+            "party": self.party,
+            "cv": self.cv,
+            "speeches": self.speeches,
+            "votes": self.votes,
+            "functions": self.functions,
+            "additional_functions": self.additional_functions,
+            "mandate": self.mandate,
+            "disclosures": self.disclosures,
+        }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="Bundescrawler",
+        description="Crawls the pages of German representatives and saves the information in a git repository",
+    )
+    parser.add_argument("-o", "--out")
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+    if not args.out:
+        raise ValueError("must supply out directory")
+    try:
+        makedirs(args.out)
+    except FileExistsError:
+        print("Path already exists")
+    links, names = get_links_and_names()
+    if args.debug:
+        links = links[:5]
+        names = names[:5]
+    bios = [get_bio(link, name) for link, name in zip(links, names)]
+    save_info(bios, args.out)
+
+
+def save_info(bios, out):
+    with open(f"{out}/raw.json", "w") as raw_file:
+        json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
+
+
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
+    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
     return (links, names)
 
 
 def get_bio(url, name):
     print(name)
+    name, party = name
+    print(name)
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
@@ -60,7 +118,7 @@ def get_bio(url, name):
         else:
             speech_div = div
     speech = get_ajax(speech_div)
-    speech_infos, speech_titles = parse_speech(speech)
+    speeches = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
     function_divs = soup.find_all(class_="m-biography__membership")
@@ -78,28 +136,22 @@ def get_bio(url, name):
     )
     disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
-    print(
+    bio = Biography(
         name,
+        party,
         cv,
-        (speech_titles, speech_infos),
+        speeches,
         votes,
         functions,
         additional_functions,
         mandate,
         disclosures,
     )
-    sleep(10)
-    return Biography(
-        name,
-        cv,
-        (speech_titles, speech_infos),
-        votes,
-        functions,
-        additional_functions,
-        mandate,
-        disclosures,
-    )
+    print(bio)
+    sleep(1)
+    return bio
 
 
 def get_functions(elem):
@@ -124,13 +176,13 @@ def parse_speech(page):
     if not page:
         return (None, None)
     soup = BeautifulSoup(page.content)
-    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
+    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
         title.text
         for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
     ][::2]
-    return (titles, infos)
+    return list(zip(titles, infos))
 
 
 def parse_vote(page):
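A side note on the parse_speech change: list(zip(titles, infos)) keeps each speech title next to its info line in a single list that json.dump can serialize directly (tuples become JSON arrays), instead of two parallel lists that had to be carried around together. A minimal sketch with invented values:

    # Invented sample values; the real strings come from the speeches AJAX page.
    titles = ["Rede zum Haushalt", "Rede zur Digitalpolitik"]
    infos = ["12. März 2025", "3. April 2025"]
    speeches = list(zip(titles, infos))
    # -> [("Rede zum Haushalt", "12. März 2025"), ("Rede zur Digitalpolitik", "3. April 2025")]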
@@ -160,5 +212,9 @@ def get_ajax(elem):
     return requests.get(url)
 
 
+def common_suffix(strings):
+    return commonprefix([s[::-1] for s in strings])[::-1]
+
+
 if __name__ == "__main__":
     main()
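With this commit a crawl run ends with save_info writing the raw results to <out>/raw.json as a list of Biography.to_dict() dictionaries. A minimal sketch of reading that file back, assuming the crawler was previously run with -o out (the directory name and printed fields are illustrative):

    import json

    # Path follows the f"{out}/raw.json" pattern used by save_info().
    with open("out/raw.json") as raw_file:
        bios = json.load(raw_file)

    for bio in bios:
        # Keys mirror Biography.to_dict(): name, party, cv, speeches, votes,
        # functions, additional_functions, mandate, disclosures.
        print(bio["name"], bio["party"])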