full crawler functionality

2025-11-14 10:10:24 +01:00 · 2025-11-14 10:10:24 +01:00 · cb3186e00e
commit cb3186e00e
parent 2583829836
1 changed file with 62 additions and 5 deletions
--- a/crawler.py
+++ b/crawler.py
@ -1,4 +1,5 @@
 import requests
 import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
@ -21,6 +22,7 @@ class Biography:
    ):
        self.name = name
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
@ -30,20 +32,22 @@ class Biography:
 def main():
    links, names = get_links_and_names()
-    bios = [get_bio(bio) for bio in links]
+    bios = [get_bio(link, name) for link, name in zip(links, names)]
 def get_links_and_names():
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content)
    links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n") for a in soup.find_all("a")]
+    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
    return (links, names)
-def get_bio(url):
+def get_bio(url, name):
-    print(f"Getting {url}")
+    print(name)
    name = name.split(", ")
    print(f"Getting {url} for {name[1]} {name[0]}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content)
    cv = soup.find(class_="m-biography__biography").text.strip()
@ -59,9 +63,62 @@ def get_bio(url):
    speech_infos, speech_titles = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
-    print(cv, speech_infos, speech_titles, votes)
+    function_divs = soup.find_all(class_="m-biography__membership")
    if len(function_divs) > 0:
        functions = get_functions(function_divs[0])
    else:
        functions = None
    if len(function_divs) > 1:
        additional_functions = get_functions(function_divs[1])
    else:
        additional_functions = None
    mandate = (
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
    print(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    sleep(10)
    return Biography(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
 def get_functions(elem):
    """Group the tag children of ``elem`` into sections led by ``<h3>`` headings.

    Walks ``elem.children`` in order. Each child named ``"h3"`` starts a new
    section whose heading is that child's stripped text; every other named
    child contributes its stripped text to the current section. Children
    without a name (presumably bare text/whitespace nodes) are ignored.

    Returns a list of ``(heading, entries)`` tuples. ``heading`` is ``None``
    for entries that appear before the first ``<h3>``.

    NOTE: a section is only emitted at the next ``<h3>`` if it has at least
    one entry, so a heading directly followed by another heading is dropped.
    The last section is always emitted, even when it has no entries (an
    element without children yields ``[(None, [])]``).
    """
    sections = []
    heading, entries = None, []
    for node in elem.children:
        tag = node.name
        if not tag:
            # Nameless child: nothing to record.
            continue
        if tag != "h3":
            entries.append(node.text.strip())
            continue
        # A new heading closes the running section, but only a non-empty one.
        if entries:
            sections.append((heading, entries))
        heading, entries = node.text.strip(), []
    # The trailing section is appended unconditionally.
    sections.append((heading, entries))
    return sections
 def parse_speech(page):
    if not page: