fix function decoding

This commit is contained in:
Marco Lents 2025-11-14 12:09:08 +01:00
parent aaa372fe21
commit d2fac39099
2 changed files with 56 additions and 5 deletions

View file

@ -45,6 +45,41 @@ votes: {self.votes}
functions: {self.functions} functions: {self.functions}
additional_functions: {self.additional_functions} additional_functions: {self.additional_functions}
mandate: {self.mandate} mandate: {self.mandate}
disclosures: {self.disclosures}
"""
return txt
def __str__(self):
if self.speeches:
speeches_str = "".join(
[f"\n- {speech[1]}: {speech[0]}" for speech in self.speeches]
)
else:
speeches_str = ""
if self.votes:
votes_str = "".join(
[f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in self.votes]
)
else:
votes_str = ""
if self.functions:
func_str = "".join([f"\n- "])
txt = f"""
Name: {self.name[1]} {self.name[0]}
Partei: {self.party}
Biographie: {self.cv}
Reden: {speeches_str}
Abstimmungen: {votes_str}
Ämter im Bundestag: {self.functions}
additional_functions: {self.additional_functions}
mandate: {self.mandate}
disclosures: {self.disclosures} disclosures: {self.disclosures}
""" """
return txt return txt
@ -80,16 +115,27 @@ def main():
names = names[:5] names = names[:5]
bios = [get_bio(link, name) for link, name in zip(links, names)] bios = [get_bio(link, name) for link, name in zip(links, names)]
save_info(bios, args.out) save_raw(bios, args.out)
repo.git.add("*") repo.git.add("*")
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
origin = repo.remote(name="origin") origin = repo.remote(name="origin")
origin.push() origin.push()
def save_info(bios, out): def save_individuals(bios, out):
for rep in bios:
first_letter = rep.name[0][0].upper()
name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
with open(f"{out}/Abgeordnete/{first_letter}/{name_str}") as rep_file:
json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
def save_raw(bios, out):
with open(f"{out}/raw.json", "w") as raw_file: with open(f"{out}/raw.json", "w") as raw_file:
json.dump([bio.to_dict() for bio in bios], raw_file, indent=2) json.dump(
[bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
)
def get_links_and_names(): def get_links_and_names():
@ -122,7 +168,7 @@ def get_bio(url, name):
speeches = parse_speech(speech) speeches = parse_speech(speech)
vote = get_ajax(vote_div) vote = get_ajax(vote_div)
votes = parse_vote(vote) votes = parse_vote(vote)
function_divs = soup.find_all(class_="m-biography__membership") function_divs = soup.find_all(class_="m-biography__memberships")
if len(function_divs) > 0: if len(function_divs) > 0:
functions = get_functions(function_divs[0]) functions = get_functions(function_divs[0])
else: else:
@ -168,7 +214,11 @@ def get_functions(elem):
continue continue
if not child.name: if not child.name:
continue continue
current_body.append(child.text.strip()) current_body.extend(
grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
for grandchild in child.children
if grandchild.text.strip() != ""
)
out.append((current_heading, current_body)) out.append((current_heading, current_body))
return out return out

View file

@ -14,4 +14,5 @@ description = "Crawls the website of the german parlament and tracks any changes
dependencies = [ dependencies = [
"beautifulsoup4", "beautifulsoup4",
"requests", "requests",
"gitpython"
] ]