From cb3186e00e71a65ad51db4ea69fcf58f943475b3 Mon Sep 17 00:00:00 2001 From: Marco Lents Date: Fri, 14 Nov 2025 10:10:24 +0100 Subject: [PATCH] full crawler functionality --- crawler.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 5 deletions(-) diff --git a/crawler.py b/crawler.py index dbe46b4..59a5600 100644 --- a/crawler.py +++ b/crawler.py @@ -1,4 +1,5 @@ import requests +import re import json from bs4 import BeautifulSoup from time import sleep @@ -21,6 +22,7 @@ class Biography: ): self.name = name self.cv = cv + self.speeches = speeches self.votes = votes self.functions = functions self.additional_functions = additional_functions @@ -30,20 +32,22 @@ class Biography: def main(): links, names = get_links_and_names() - bios = [get_bio(bio) for bio in links] + bios = [get_bio(link, name) for link, name in zip(links, names)] def get_links_and_names(): response = requests.get(BUNDESTAG_URL) soup = BeautifulSoup(response.content) links = [a.get("href") for a in soup.find_all("a")] - names = [a.get("title").strip().split("\n\n\n") for a in soup.find_all("a")] + names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")] return (links, names) -def get_bio(url): - print(f"Getting {url}") +def get_bio(url, name): + print(name) + name = name.split(", ") + print(f"Getting {url} for {name[1]} {name[0]}") response = requests.get(url) soup = BeautifulSoup(response.content) cv = soup.find(class_="m-biography__biography").text.strip() @@ -59,9 +63,62 @@ def get_bio(url): speech_infos, speech_titles = parse_speech(speech) vote = get_ajax(vote_div) votes = parse_vote(vote) - print(cv, speech_infos, speech_titles, votes) + function_divs = soup.find_all(class_="m-biography__membership") + if len(function_divs) > 0: + functions = get_functions(function_divs[0]) + else: + functions = None + if len(function_divs) > 1: + additional_functions = get_functions(function_divs[1]) + else: + additional_functions = None + mandate = ( + soup.find(class_="m-biography__subHeading --mandate").text, + soup.find(string=re.compile(r"^Wahlkreis \d*:")), + ) + disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip() + + print( + name, + cv, + (speech_titles, speech_infos), + votes, + functions, + additional_functions, + mandate, + disclosures, + ) sleep(10) + return Biography( + name, + cv, + (speech_titles, speech_infos), + votes, + functions, + additional_functions, + mandate, + disclosures, + ) + + +def get_functions(elem): + out = [] + current_heading = None + current_body = [] + for child in elem.children: + if child.name == "h3": + if current_body != []: + out.append((current_heading, current_body)) + current_heading = child.text.strip() + current_body = [] + continue + if not child.name: + continue + current_body.append(child.text.strip()) + out.append((current_heading, current_body)) + return out + def parse_speech(page): if not page: