def load_old_bios(out):
    """Load the previous run's ``raw.json`` snapshot from the output directory.

    Call this before the new crawl is written to ``out``; afterwards the file
    would already contain the current data and the diff would be empty.

    Args:
        out: Path of the output directory that holds ``raw.json``.

    Returns:
        The decoded JSON content of ``raw.json`` (presumably a list of bio
        dicts, which is what ``generate_commit_message`` expects), or an
        empty list when no usable previous snapshot exists.
    """
    try:
        with open(f"{out}/raw.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing or corrupt snapshot only degrades the commit message; it
        # must not abort the run before the fresh crawl has been saved.
        return []


def _disclosure_items(disclosures):
    """Flatten disclosure entries into their items, in order of appearance.

    Each entry is indexed with ``[1]`` to obtain its items -- presumably an
    entry is a ``(label, items)`` pair; verify against the crawler's schema.
    ``None`` is treated like an empty list.
    """
    return [item for entry in disclosures or [] for item in entry[1]]


def generate_commit_message(old_bios, new_bios):
    """Build a descriptive commit message by diffing two crawl results.

    Representatives are matched by their ``name`` (converted to a tuple).
    The message lists, in this order and only when non-empty: new
    representatives, departed representatives, party changes and newly
    appeared disclosure items, followed by the total number of crawled
    profiles.

    Args:
        old_bios: Bios of the previous run as decoded JSON dicts. Each dict
            needs a ``"name"`` entry; ``"party"`` and ``"disclosures"`` are
            optional.
        new_bios: Bios of the current crawl as objects exposing ``name``,
            ``party`` and ``disclosures`` attributes.

    Returns:
        The commit message: a title line ``Aktualisierung <YYYY-MM-DD>``
        followed by the sections, separated by blank lines.

    Raises:
        KeyError: If an entry of ``old_bios`` has no ``"name"`` key.
    """
    old_by_name = {tuple(b["name"]): b for b in old_bios}
    new_by_name = {tuple(b.name): b for b in new_bios}

    added = [b for key, b in new_by_name.items() if key not in old_by_name]
    removed = [b for key, b in old_by_name.items() if key not in new_by_name]

    party_changes = []
    new_disclosures = []
    for bio in new_bios:
        old = old_by_name.get(tuple(bio.name))
        if old is None:
            # Brand-new representative: already reported under "added".
            continue

        old_party = old.get("party", "")
        if old_party != bio.party:
            party_changes.append((bio, old_party))

        old_items = set(_disclosure_items(old.get("disclosures")))
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # so the message is identical across runs (iterating a set
        # difference would depend on string-hash randomization).
        fresh_items = dict.fromkeys(
            item
            for item in _disclosure_items(bio.disclosures)
            if item not in old_items
        )
        new_disclosures.extend((bio, item) for item in fresh_items)

    date = datetime.now().strftime("%Y-%m-%d")
    sections = []

    # Names are printed as name[1] followed by name[0] -- presumably the
    # stored order is (last name, first name); confirm against the crawler.
    if added:
        sections.append("Neue Abgeordnete:\n" + "\n".join(
            f"- {b.name[1]} {b.name[0]} ({b.party})" for b in added
        ))

    if removed:
        sections.append("Ausgeschieden:\n" + "\n".join(
            f"- {b['name'][1]} {b['name'][0]} ({b.get('party', '')})"
            for b in removed
        ))

    if party_changes:
        sections.append("Parteiwechsel:\n" + "\n".join(
            f"- {b.name[1]} {b.name[0]}: {old} -> {b.party}"
            for b, old in party_changes
        ))

    if new_disclosures:
        sections.append("Neue Veröffentlichungen:\n" + "\n".join(
            f"- {b.name[1]} {b.name[0]} ({b.party}): {item}"
            for b, item in new_disclosures
        ))

    # This is the number of crawled profiles, not the number that changed.
    updated = len(new_bios)
    sections.append(f"{updated} Profile aktualisiert")

    title = f"Aktualisierung {date}"
    body = "\n\n".join(sections)
    return f"{title}\n\n{body}"