From 93310a803025e6248178f19d5ee923165870645c Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 14:43:31 +0100
Subject: [PATCH 1/4] fix handling of empty speeches

---
 crawler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index 969c837..c47432e 100644
--- a/crawler.py
+++ b/crawler.py
@@ -133,9 +133,9 @@ def main():
     if args.debug:
         links = links[:5]
         names = names[:5]
-        sleep_for = 1
+        sleep_for = 0
     else:
-        sleep_for = 5
+        sleep_for = 10
 
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
 
@@ -252,7 +252,7 @@ def get_functions(elem):
 
 def parse_speech(page):
     if not page:
-        return (None, None)
+        return None
     soup = BeautifulSoup(page.content)
     infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [

From 3f0130c75b20f93c48a2b79187886a36fbc77b6f Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:30:07 +0100
Subject: [PATCH 2/4] fix disclosure handling

---
 crawler.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index c47432e..46aa103 100644
--- a/crawler.py
+++ b/crawler.py
@@ -88,8 +88,7 @@ Biographie:
 {self.cv}
 
 # Mandat
 {self.mandate[0]}, {self.mandate[1]}
-# Veröffentlichungspflichtige Angaben
-{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
+# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
 """
         return txt
@@ -125,6 +124,7 @@ def main():
     )
     parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
+    parser.add_argument("--no-git", action="store_true")
     args = parser.parse_args()
     if not args.out:
         raise ValueError("must supply out directory")
@@ -142,6 +142,9 @@ def main():
     save_raw(bios, args.out)
     save_individuals(bios, args.out)
 
+    if args.no_git:
+        return
+
     repo.git.add("*")
     repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     origin = repo.remote(name="origin")
@@ -209,7 +212,7 @@ def get_bio(url, name, sleep_for):
         soup.find(class_="m-biography__subHeading --mandate").text,
         soup.find(string=re.compile(r"^Wahlkreis \d*:")),
     )
-    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
 
     bio = Biography(
         name,
@@ -228,6 +231,30 @@ def get_bio(url, name, sleep_for):
     return bio
 
 
+def get_disclosures(elem):
+    if not elem:
+        return None
+    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
+    out = []
+    for div in divs:
+        current_heading = ""
+        current_body = []
+        for child in div.children:
+            if child.name == "h3":
+                if current_body != []:
+                    out.append((current_heading, current_body))
+                current_heading = child.text.strip()
+                current_body = []
+                continue
+            if not child.name:
+                continue
+            if child.text.strip() == "":
+                continue
+            current_body.append(child.text.strip())
+        out.append((current_heading, current_body))
+    return out
+
+
 def get_functions(elem):
     out = []
     current_heading = None

From be2371e0f5c0cc2fe54ac8dc69f985ec7b16ae23 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:35:11 +0100
Subject: [PATCH 3/4] sort functions alphabetically and handle rate limit more gracefully

---
 crawler.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index 46aa103..c31051e 100644
--- a/crawler.py
+++ b/crawler.py
@@ -112,7 +112,7 @@ def funcs_to_str(funcs):
     out = ""
     for func in funcs:
         out += f"\n- {func[0]}"
-        for loc in func[1]:
+        for loc in sorted(func[1]):
             out += f"\n - {loc}"
     return out
 
@@ -184,7 +184,13 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    response = requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.exceptions.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -314,7 +320,14 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    return requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.exceptions.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return response
 
 
 def common_suffix(strings):

From d47627643f9637469d84a415d4e16ccba6e69a99 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:38:56 +0100
Subject: [PATCH 4/4] sort functions alphabetically in raw output as well

---
 crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawler.py b/crawler.py
index c31051e..7b5ea7d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -268,7 +268,7 @@ def get_functions(elem):
     for child in elem.children:
         if child.name == "h3":
             if current_body != []:
-                out.append((current_heading, current_body))
+                out.append((current_heading, sorted(current_body)))
             current_heading = child.text.strip()
             current_body = []
             continue
@@ -279,7 +279,7 @@ def get_functions(elem):
             for grandchild in child.children
             if grandchild.text.strip() != ""
         )
-    out.append((current_heading, current_body))
+    out.append((current_heading, sorted(current_body)))
     return out
 