From 14670538f65d82fea4a312d32c21ad32537d22b0 Mon Sep 17 00:00:00 2001 From: Marco Lents Date: Mon, 13 Apr 2026 22:27:03 +0200 Subject: [PATCH] clean up: explicit utf-8 encoding, proper exception handling, remove dead code - Add encoding="utf-8" to all file writes - Catch requests.RequestException instead of bare except - Use raise_for_status() to also retry on HTTP errors - Use removeprefix/removesuffix instead of lstrip/rstrip - Use makedirs(exist_ok=True) - Remove unused common_suffix function and commonprefix import Co-Authored-By: Claude Opus 4.6 (1M context) --- crawler.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/crawler.py b/crawler.py index 3004bf7..28938f6 100644 --- a/crawler.py +++ b/crawler.py @@ -3,7 +3,6 @@ import re import json from bs4 import BeautifulSoup from time import sleep -from os.path import commonprefix from os import makedirs from git import Repo import argparse @@ -167,16 +166,13 @@ def save_individuals(bios, out): first_letter = rep.name[0][0].upper() name_str = f"{rep.name[0]} {rep.name[1]}".replace(" ", "_") dir = f"{out}/Abgeordnete/{first_letter}" - try: - makedirs(dir) - except FileExistsError: - pass - with open(f"{dir}/{name_str}.md", "w") as rep_file: + makedirs(dir, exist_ok=True) + with open(f"{dir}/{name_str}.md", "w", encoding="utf-8") as rep_file: rep_file.write(str(rep)) def save_raw(bios, out): - with open(f"{out}/raw.json", "w") as raw_file: + with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file: json.dump( [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False ) @@ -184,10 +180,7 @@ def save_raw(bios, out): def save_disclosures(bios, out): dir = f"{out}/Voep_Angaben" - try: - makedirs(dir) - except FileExistsError: - pass + makedirs(dir, exist_ok=True) bios_with_discl = [bio for bio in bios if bio.disclosures] alpha_str = "" for bio in sorted(bios_with_discl, key=lambda b: b.name): @@ -195,7 +188,7 @@ def save_disclosures(bios, out): alpha_str += funcs_to_str(bio.disclosures) alpha_str += "\n" - with open(f"{dir}/Alphabetisch.md", "w") as alpha_file: + with open(f"{dir}/Alphabetisch.md", "w", encoding="utf-8") as alpha_file: alpha_file.write(alpha_str.strip()) party_str = "" @@ -206,7 +199,7 @@ def save_disclosures(bios, out): party_str += funcs_to_str(bio.disclosures) party_str += "\n" - with open(f"{dir}/Nach_Partei.md", "w") as party_file: + with open(f"{dir}/Nach_Partei.md", "w", encoding="utf-8") as party_file: party_file.write(party_str.strip()) @@ -293,9 +286,11 @@ def get_bio(url, name, sleep_for): def request_handle_rate_limit(url): for _ in range(5): try: - return requests.get(url) - except: - print("Rate limit! waiting 5min") + response = requests.get(url) + response.raise_for_status() + return response + except requests.RequestException: + print("Request failed! waiting 5min") sleep(300) return requests.get(url) @@ -378,7 +373,7 @@ def parse_vote(page): def get_ajax(elem): if not elem: return None - inner = elem.get("x-data").lstrip("dynamicTemplateOutput(").rstrip(")") + inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")") data = json.loads(inner) url = BUNDESTAG_BASE_URL + data["endpoint"] filters = data["filters"] @@ -391,9 +386,5 @@ def get_ajax(elem): return response -def common_suffix(strings): - return commonprefix([s[::-1] for s in strings])[::-1] - - if __name__ == "__main__": main()