From 93310a803025e6248178f19d5ee923165870645c Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 14:43:31 +0100
Subject: [PATCH 1/4] fix handling of empty speeches

---
 crawler.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index 969c837..c47432e 100644
--- a/crawler.py
+++ b/crawler.py
@@ -133,9 +133,9 @@ def main():
     if args.debug:
         links = links[:5]
         names = names[:5]
-        sleep_for = 1
+        sleep_for = 0
     else:
-        sleep_for = 5
+        sleep_for = 10
 
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
 
@@ -252,7 +252,7 @@ def get_functions(elem):
 
 def parse_speech(page):
     if not page:
-        return (None, None)
+        return None
     soup = BeautifulSoup(page.content)
     infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [

From 3f0130c75b20f93c48a2b79187886a36fbc77b6f Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:30:07 +0100
Subject: [PATCH 2/4] fix disclosure handling

---
 crawler.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index c47432e..46aa103 100644
--- a/crawler.py
+++ b/crawler.py
@@ -88,8 +88,7 @@ Biographie:
 {self.cv}
 
 # Mandat
 {self.mandate[0]}, {self.mandate[1]}
-# Veröffentlichungspflichtige Angaben
-{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
+# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
 """
         return txt
@@ -125,6 +124,7 @@ def main():
     )
     parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
+    parser.add_argument("--no-git", action="store_true")
     args = parser.parse_args()
     if not args.out:
         raise ValueError("must supply out directory")
@@ -142,6 +142,9 @@ def main():
     save_raw(bios, args.out)
     save_individuals(bios, args.out)
 
+    if args.no_git:
+        return
+
     repo.git.add("*")
     repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     origin = repo.remote(name="origin")
@@ -209,7 +212,7 @@ def get_bio(url, name, sleep_for):
         soup.find(class_="m-biography__subHeading --mandate").text,
         soup.find(string=re.compile(r"^Wahlkreis \d*:")),
     )
-    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
 
     bio = Biography(
         name,
@@ -228,6 +231,30 @@ def get_bio(url, name, sleep_for):
     return bio
 
 
+def get_disclosures(elem):
+    if not elem:
+        return None
+    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
+    out = []
+    for div in divs:
+        current_heading = ""
+        current_body = []
+        for child in div.children:
+            if child.name == "h3":
+                if current_body != []:
+                    out.append((current_heading, current_body))
+                current_heading = child.text.strip()
+                current_body = []
+                continue
+            if not child.name:
+                continue
+            if child.text.strip() == "":
+                continue
+            current_body.append(child.text.strip())
+        out.append((current_heading, current_body))
+    return out
+
+
 def get_functions(elem):
     out = []
     current_heading = None

From be2371e0f5c0cc2fe54ac8dc69f985ec7b16ae23 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:35:11 +0100
Subject: [PATCH 3/4] sort functions alphabetically and handle rate limit more gracefully

---
 crawler.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/crawler.py b/crawler.py
index 46aa103..c31051e 100644
--- a/crawler.py
+++ b/crawler.py
@@ -112,7 +112,7 @@ def funcs_to_str(funcs):
     out = ""
     for func in funcs:
         out += f"\n- {func[0]}"
-        for loc in func[1]:
+        for loc in sorted(func[1]):
             out += f"\n - {loc}"
     return out
 
@@ -184,7 +184,13 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    response = requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.exceptions.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -314,7 +320,14 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    return requests.get(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.exceptions.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return response
 
 
 def common_suffix(strings):

From d47627643f9637469d84a415d4e16ccba6e69a99 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sat, 15 Nov 2025 15:38:56 +0100
Subject: [PATCH 4/4] sort functions alphabetically in raw output as well

---
 crawler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crawler.py b/crawler.py
index c31051e..7b5ea7d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -268,7 +268,7 @@ def get_functions(elem):
     for child in elem.children:
         if child.name == "h3":
             if current_body != []:
-                out.append((current_heading, current_body))
+                out.append((current_heading, sorted(current_body)))
             current_heading = child.text.strip()
             current_body = []
             continue
@@ -279,7 +279,7 @@ def get_functions(elem):
             for grandchild in child.children
             if grandchild.text.strip() != ""
         )
-    out.append((current_heading, current_body))
+    out.append((current_heading, sorted(current_body)))
     return out
 