sort functions alphabetically also in raw

sort functions alphabetically and handle rate limit more gracefully
fix disclosure handling
2025-11-15 15:38:56 +01:00 · 2025-11-15 15:35:11 +01:00 · 2025-11-15 15:30:07 +01:00 · 2025-11-15 14:43:31 +01:00
1 changed file with 49 additions and 11 deletions
--- a/crawler.py
+++ b/crawler.py
@ -88,8 +88,7 @@ Biographie: {self.cv}
 # Mandat 
 {self.mandate[0]},  {self.mandate[1]}

-# Veröffentlichungspflichtige Angaben
-{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
+# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
            """
        return txt

@ -113,7 +112,7 @@ def funcs_to_str(funcs):
    out = ""
    for func in funcs:
        out += f"\n- {func[0]}"
-        for loc in func[1]:
+        for loc in sorted(func[1]):
            out += f"\n  - {loc}"
    return out

@ -125,6 +124,7 @@ def main():
    )
    parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
+    parser.add_argument("--no-git", action="store_true")
    args = parser.parse_args()
    if not args.out:
        raise ValueError("must supply out directory")
@ -133,15 +133,18 @@ def main():
    if args.debug:
        links = links[:5]
        names = names[:5]
-        sleep_for = 1
+        sleep_for = 0
    else:
-        sleep_for = 5
+        sleep_for = 10

    bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]

    save_raw(bios, args.out)
    save_individuals(bios, args.out)

+    if args.no_git:
+        return
+
    repo.git.add("*")
    repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    origin = repo.remote(name="origin")
@ -181,7 +184,12 @@ def get_bio(url, name, sleep_for):
    name, party = name
    name = name.split(", ")
    print(f"Getting {url} for {name[1]} {name[0]}")
-    response = requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
    soup = BeautifulSoup(response.content)
    cv = soup.find(class_="m-biography__biography").text.strip()
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@ -209,7 +217,7 @@ def get_bio(url, name, sleep_for):
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
-    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))

    bio = Biography(
        name,
@ -228,6 +236,30 @@ def get_bio(url, name, sleep_for):
    return bio


+def get_disclosures(elem):
+    if not elem:
+        return None
+    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
+    out = []
+    for div in divs:
+        current_heading = ""
+        current_body = []
+        for child in div.children:
+            if child.name == "h3":
+                if current_body != []:
+                    out.append((current_heading, current_body))
+                current_heading = child.text.strip()
+                current_body = []
+                continue
+            if not child.name:
+                continue
+            if child.text.strip() == "":
+                continue
+            current_body.append(child.text.strip())
+        out.append((current_heading, current_body))
+    return out
+
+
 def get_functions(elem):
    out = []
    current_heading = None
@ -235,7 +267,7 @@ def get_functions(elem):
    for child in elem.children:
        if child.name == "h3":
            if current_body != []:
-                out.append((current_heading, current_body))
+                out.append((current_heading, sorted(current_body)))
            current_heading = child.text.strip()
            current_body = []
            continue
@ -246,13 +278,13 @@ def get_functions(elem):
            for grandchild in child.children
            if grandchild.text.strip() != ""
        )
-    out.append((current_heading, current_body))
+    out.append((current_heading, sorted(current_body)))
    return out


 def parse_speech(page):
    if not page:
-        return (None, None)
+        return None
    soup = BeautifulSoup(page.content)
    infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
    titles = [
@ -287,7 +319,13 @@ def get_ajax(elem):
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    return requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return response


 def common_suffix(strings):
Author	SHA1	Message	Date
Marco Lents	d47627643f	sort functions alphabetically also in raw	2025-11-15 15:38:56 +01:00
Marco Lents	be2371e0f5	sort functions alphabetically and handle rate limit more gracefully	2025-11-15 15:35:11 +01:00
Marco Lents	3f0130c75b	fix disclosure handling	2025-11-15 15:30:07 +01:00
Marco Lents	93310a8030	fix handling of empty speeches	2025-11-15 14:43:31 +01:00