From cb3186e00e71a65ad51db4ea69fcf58f943475b3 Mon Sep 17 00:00:00 2001
From: Marco Lents <lentsmarco@gmail.com>
Date: Fri, 14 Nov 2025 10:10:24 +0100
Subject: [PATCH] full crawler functionality

---
 crawler.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 5 deletions(-)

diff --git a/crawler.py b/crawler.py
index dbe46b4..59a5600 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,4 +1,5 @@
 import requests
+import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
@@ -21,6 +22,7 @@ class Biography:
     ):
         self.name = name
         self.cv = cv
+        self.speeches = speeches
         self.votes = votes
         self.functions = functions
         self.additional_functions = additional_functions
@@ -30,20 +32,22 @@ class Biography:
 
 def main():
     links, names = get_links_and_names()
-    bios = [get_bio(bio) for bio in links]
+    bios = [get_bio(link, name) for link, name in zip(links, names)]
 
 
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n") for a in soup.find_all("a")]
+    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
 
     return (links, names)
 
 
-def get_bio(url):
-    print(f"Getting {url}")
+def get_bio(url, name):
+    print(name)
+    name = name.split(", ")
+    print(f"Getting {url} for {name[1]} {name[0]}")
     response = requests.get(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
@@ -59,9 +63,62 @@ def get_bio(url):
     speech_infos, speech_titles = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
-    print(cv, speech_infos, speech_titles, votes)
+    function_divs = soup.find_all(class_="m-biography__membership")
+    if len(function_divs) > 0:
+        functions = get_functions(function_divs[0])
+    else:
+        functions = None
+    if len(function_divs) > 1:
+        additional_functions = get_functions(function_divs[1])
+    else:
+        additional_functions = None
+    mandate = (
+        soup.find(class_="m-biography__subHeading --mandate").text,
+        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
+    )
+    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+
+    print(
+        name,
+        cv,
+        (speech_titles, speech_infos),
+        votes,
+        functions,
+        additional_functions,
+        mandate,
+        disclosures,
+    )
     sleep(10)
 
+    return Biography(
+        name,
+        cv,
+        (speech_titles, speech_infos),
+        votes,
+        functions,
+        additional_functions,
+        mandate,
+        disclosures,
+    )
+
+
+def get_functions(elem):
+    """Split elem's children into (h3 heading, [stripped child texts]) tuples."""
+    sections = []
+    heading, texts = None, []
+    for node in elem.children:
+        if not node.name:  # children without a tag name are ignored
+            continue
+        if node.name != "h3":
+            texts.append(node.text.strip())
+            continue
+        if texts:  # a section is flushed at the next h3 only if it has texts
+            sections.append((heading, texts))
+        heading, texts = node.text.strip(), []
+    # The trailing section is emitted unconditionally, even when it is empty.
+    sections.append((heading, texts))
+    return sections
+
 
 def parse_speech(page):
     if not page: