From 6ef8fcc99343aebe4f54fcf5ec122ef6de489d38 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sun, 16 Nov 2025 10:12:39 +0100
Subject: [PATCH 1/2] extract graceful url handling into separate function

---
 crawler.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/crawler.py b/crawler.py
index 7b5ea7d..9a34a24 100644
--- a/crawler.py
+++ b/crawler.py
@@ -184,12 +184,7 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -236,6 +231,16 @@ def get_bio(url, name, sleep_for):
     return bio
 
 
+def request_handle_rate_limit(url):
+    for _ in range(5):
+        try:
+            return requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return requests.get(url)
+
+
 def get_disclosures(elem):
     if not elem:
         return None
@@ -319,12 +324,7 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     return response
 
 

From 5501f07cf72370d5cab8c4ab3300b74d3e212361 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Sun, 16 Nov 2025 10:34:41 +0100
Subject: [PATCH 2/2] add disclosure files

---
 crawler.py | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/crawler.py b/crawler.py
index 9a34a24..ff05b46 100644
--- a/crawler.py
+++ b/crawler.py
@@ -139,8 +139,10 @@ def main():
 
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
 
-    save_raw(bios, args.out)
+    if not args.debug:
+        save_raw(bios, args.out)
     save_individuals(bios, args.out)
+    save_disclosures(bios, args.out)
 
     if args.no_git:
         return
@@ -171,6 +173,47 @@ def save_raw(bios, out):
     )
 
 
+def save_disclosures(bios, out):
+    dir = f"{out}/Voep_Angaben"
+    try:
+        makedirs(dir)
+    except FileExistsError:
+        pass
+    bios_with_discl = [bio for bio in bios if bio.disclosures]
+    alpha_str = ""
+    for bio in sorted(bios_with_discl, key=lambda b: b.name):
+        alpha_str += f"# {bio.name[1]} {bio.name[0]} ({bio.party})"
+        alpha_str += funcs_to_str(bio.disclosures)
+        alpha_str += "\n"
+
+    with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
+        alpha_file.write(alpha_str.strip())
+
+    party_str = ""
+    for party, bio_list in group_by_party(bios_with_discl):
+        party_str += f"# {party}\n"
+        for bio in bio_list:
+            party_str += f"## {bio.name[1]} {bio.name[0]}"
+            party_str += funcs_to_str(bio.disclosures)
+            party_str += "\n"
+
+    with open(f"{dir}/Nach_Partei.md", "w") as party_file:
+        party_file.write(party_str.strip())
+
+
+def group_by_party(bios):
+    grouped = {}
+    for bio in bios:
+        if bio.party in grouped.keys():
+            grouped[bio.party].append(bio)
+        else:
+            grouped[bio.party] = [bio]
+
+    as_list = [(key, val) for key, val in grouped.items()]
+    as_list.sort(key=lambda party: party[0])
+    return as_list
+
+
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
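
Note on PATCH 1/2: a minimal standalone sketch of the retry helper, for
reference. The retries/wait parameters and the explicit HTTP 429 check are
additions of mine, not part of the patch: requests.get() only raises on
connection-level failures, so a server that answers a rate-limited request
with status 429 would never reach the except branch of the patched helper.

    import requests
    from time import sleep

    def request_handle_rate_limit(url, retries=5, wait=300):
        """Fetch url, backing off and retrying when it looks rate-limited."""
        for _ in range(retries):
            try:
                response = requests.get(url)
            except requests.exceptions.RequestException:
                # Connection-level failure: back off before retrying.
                print(f"Request failed! waiting {wait // 60}min")
                sleep(wait)
                continue
            if response.status_code == 429:
                # Assumption: also treat an explicit 429 as a rate limit,
                # since requests does not raise for HTTP error statuses.
                print(f"Rate limit! waiting {wait // 60}min")
                sleep(wait)
                continue
            return response
        # Final attempt without a safety net, mirroring the patch.
        return requests.get(url)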
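
Note on PATCH 2/2: a small usage sketch of group_by_party. The Bio
namedtuple and the sample entries are hypothetical stand-ins for the
crawler's real bio objects (name is the [last, first] pair the crawler
produces by splitting "Lastname, Firstname"); dict.setdefault is an
equivalent, more compact way to build the grouping the patch builds with
an if/else.

    from collections import namedtuple

    Bio = namedtuple("Bio", ["name", "party", "disclosures"])

    def group_by_party(bios):
        grouped = {}
        for bio in bios:
            # Same grouping as the patch, via setdefault instead of if/else.
            grouped.setdefault(bio.party, []).append(bio)
        # Sorting the (party, members) pairs orders them by party name.
        return sorted(grouped.items())

    bios = [
        Bio(("Mustermann", "Max"), "SPD", ["Funktion A"]),
        Bio(("Beispiel", "Erika"), "CDU/CSU", ["Funktion B"]),
    ]
    for party, members in group_by_party(bios):
        print(party, [f"{b.name[1]} {b.name[0]}" for b in members])
    # CDU/CSU ['Erika Beispiel']
    # SPD ['Max Mustermann']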