From d2fac390995f3620f7af67ec8c03b9a7b97ef939 Mon Sep 17 00:00:00 2001
From: Marco Lents
Date: Fri, 14 Nov 2025 12:09:08 +0100
Subject: [PATCH] fix function decoding

get_bio() now looks up the "m-biography__memberships" class, and
get_functions() extends the current body with the stripped text of each
non-empty grandchild (with "\n\n\n(Interner Link)" removed) instead of
appending the child's text as one string.

Also in this change:
- add a __str__ method that renders a German-labelled text summary
- rename save_info() to save_raw() and write raw.json with
  ensure_ascii=False
- add save_individuals(), which writes one file per entry in bios to
  {out}/Abgeordnete/{first_letter}/{name_str}
- add gitpython to the dependencies in pyproject.toml

---
Review notes (everything between the "---" line and the first "diff" line
is outside the commit message):
- save_individuals() opened its target with open()'s default read-only
  mode, so json.dump() could never write to it; the open() call below now
  passes "w". The blob id on the crawler.py index line was not recomputed
  for this one-token change.
- open() does not create directories, and nothing in this patch creates
  {out}/Abgeordnete/{first_letter}/.
- No call to save_individuals() appears in the hunks below.
- func_str in __str__ is assigned but never used.
- Leading whitespace on hunk lines was restored from Python syntax; verify
  it against crawler.py before applying.

 crawler.py     | 60 +++++++++++++++++++++++++++++++++++++++++++++-----
 pyproject.toml |  1 +
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/crawler.py b/crawler.py
index b11920b..1dbfdd5 100644
--- a/crawler.py
+++ b/crawler.py
@@ -45,6 +45,41 @@ votes: {self.votes}
 functions: {self.functions}
 additional_functions: {self.additional_functions}
 mandate: {self.mandate}
+disclosures: {self.disclosures}
+        """
+        return txt
+
+    def __str__(self):
+        if self.speeches:
+            speeches_str = "".join(
+                [f"\n- {speech[1]}: {speech[0]}" for speech in self.speeches]
+            )
+        else:
+            speeches_str = ""
+
+        if self.votes:
+            votes_str = "".join(
+                [f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in self.votes]
+            )
+        else:
+            votes_str = ""
+
+        if self.functions:
+            func_str = "".join([f"\n- "])
+        txt = f"""
+Name: {self.name[1]} {self.name[0]}
+
+Partei: {self.party}
+
+Biographie: {self.cv}
+
+Reden: {speeches_str}
+
+Abstimmungen: {votes_str}
+
+Ämter im Bundestag: {self.functions}
+additional_functions: {self.additional_functions}
+mandate: {self.mandate}
 disclosures: {self.disclosures}
         """
         return txt
@@ -80,16 +115,27 @@ def main():
     names = names[:5]
 
     bios = [get_bio(link, name) for link, name in zip(links, names)]
-    save_info(bios, args.out)
+    save_raw(bios, args.out)
+
     repo.git.add("*")
     repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     origin = repo.remote(name="origin")
     origin.push()
 
 
-def save_info(bios, out):
+def save_individuals(bios, out):
+    for rep in bios:
+        first_letter = rep.name[0][0].upper()
+        name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
+        with open(f"{out}/Abgeordnete/{first_letter}/{name_str}", "w") as rep_file:
+            json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
+
+
+def save_raw(bios, out):
     with open(f"{out}/raw.json", "w") as raw_file:
-        json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
+        json.dump(
+            [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
+        )
 
 
 def get_links_and_names():
@@ -122,7 +168,7 @@ def get_bio(url, name):
     speeches = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
-    function_divs = soup.find_all(class_="m-biography__membership")
+    function_divs = soup.find_all(class_="m-biography__memberships")
     if len(function_divs) > 0:
         functions = get_functions(function_divs[0])
     else:
@@ -168,7 +214,11 @@ def get_functions(elem):
             continue
         if not child.name:
             continue
-        current_body.append(child.text.strip())
+        current_body.extend(
+            grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
+            for grandchild in child.children
+            if grandchild.text.strip() != ""
+        )
     out.append((current_heading, current_body))
     return out
 
diff --git a/pyproject.toml b/pyproject.toml
index 6cbae08..0fff45a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,4 +14,5 @@ description = "Crawls the website of the german parlament and tracks any changes
 dependencies = [
     "beautifulsoup4",
     "requests",
+    "gitpython"
 ]