Compare commits

...

2 commits

SHA1 Message Date
5501f07cf7 add disclosure files 2025-11-16 10:34:41 +01:00
6ef8fcc993 extract graceful url handling into separate function 2025-11-16 10:12:39 +01:00

@@ -139,8 +139,10 @@ def main():
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
-    save_raw(bios, args.out)
+    if not args.debug:
+        save_raw(bios, args.out)
     save_individuals(bios, args.out)
+    save_disclosures(bios, args.out)
     if args.no_git:
         return
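
The first hunk gates the raw dump behind args.debug. For orientation, a minimal argparse sketch of how such flags are commonly wired; the flag names come from the diff (args.out, args.debug, args.no_git), but the defaults, help texts, and parser layout are assumptions, not the script's actual setup.

    import argparse

    def parse_args():
        # Hypothetical sketch: only the attribute names are taken from the diff.
        parser = argparse.ArgumentParser()
        parser.add_argument("--out", default="out", help="output directory")
        parser.add_argument("--debug", action="store_true",
                            help="skip writing the raw dump")
        parser.add_argument("--no-git", dest="no_git", action="store_true",
                            help="skip the git steps after saving")
        return parser.parse_args()

    if __name__ == "__main__":
        args = parse_args()
        print(args.out, args.debug, args.no_git)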
@@ -171,6 +173,47 @@ def save_raw(bios, out):
     )


+def save_disclosures(bios, out):
+    dir = f"{out}/Voep_Angaben"
+    try:
+        makedirs(dir)
+    except FileExistsError:
+        pass
+
+    bios_with_discl = [bio for bio in bios if bio.disclosures]
+
+    alpha_str = ""
+    for bio in sorted(bios_with_discl, key=lambda b: b.name):
+        alpha_str += f"# {bio.name[1]} {bio.name[0]} ({bio.party})"
+        alpha_str += funcs_to_str(bio.disclosures)
+        alpha_str += "\n"
+    with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
+        alpha_file.write(alpha_str.strip())
+
+    party_str = ""
+    for party, bio_list in group_by_party(bios_with_discl):
+        party_str += f"# {party}\n"
+        for bio in bio_list:
+            party_str += f"## {bio.name[1]} {bio.name[0]}"
+            party_str += funcs_to_str(bio.disclosures)
+            party_str += "\n"
+    with open(f"{dir}/Nach_Partei.md", "w") as party_file:
+        party_file.write(party_str.strip())
+
+
+def group_by_party(bios):
+    grouped = {}
+    for bio in bios:
+        if bio.party in grouped.keys():
+            grouped[bio.party].append(bio)
+        else:
+            grouped[bio.party] = [bio]
+    as_list = [(key, val) for key, val in grouped.items()]
+    as_list.sort(key=lambda party: party[0])
+    return as_list
+
+
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
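
To illustrate what the new grouping does, here is a small self-contained sketch: a hypothetical Bio namedtuple stands in for the scraper's bio objects, group_by_party is rewritten as a compact setdefault equivalent, and all names and parties are placeholders.

    from collections import namedtuple

    # Hypothetical stand-in for the scraped bio objects; only the fields
    # that save_disclosures/group_by_party actually touch are modelled.
    Bio = namedtuple("Bio", ["name", "party", "disclosures"])

    bios = [
        Bio(("Mustermann", "Max"), "Partei B", ["Funktion X"]),
        Bio(("Musterfrau", "Erika"), "Partei A", ["Funktion Y"]),
        Bio(("Beispiel", "Bernd"), "Partei B", []),  # no disclosures -> filtered out
    ]

    def group_by_party(bios):
        # Same behaviour as the function added above, written with setdefault.
        grouped = {}
        for bio in bios:
            grouped.setdefault(bio.party, []).append(bio)
        return sorted(grouped.items(), key=lambda item: item[0])

    # Mirrors bios_with_discl: only members with disclosures are kept.
    with_discl = [bio for bio in bios if bio.disclosures]
    for party, members in group_by_party(with_discl):
        print(party, [f"{b.name[1]} {b.name[0]}" for b in members])
    # Partei A ['Erika Musterfrau']
    # Partei B ['Max Mustermann']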
@@ -184,12 +227,7 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -236,6 +274,16 @@ def get_bio(url, name, sleep_for):
     return bio


+def request_handle_rate_limit(url):
+    for _ in range(5):
+        try:
+            return requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return requests.get(url)
+
+
 def get_disclosures(elem):
     if not elem:
         return None
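
The extracted helper retries requests.get up to five times, sleeping five minutes on any exception, then makes one last unguarded attempt. A standalone sketch of the same pattern follows; the function name, keyword parameters, and the narrower requests.RequestException are assumptions rather than what the commit ships.

    import requests
    from time import sleep

    def get_with_retries(url, attempts=5, wait_seconds=300):
        """Retry requests.get on request-level errors, then try once more.

        Mirrors the shape of request_handle_rate_limit; the parameters and
        the narrower exception type are assumptions.
        """
        for _ in range(attempts):
            try:
                return requests.get(url)
            except requests.RequestException:
                print(f"Request failed, waiting {wait_seconds}s")
                sleep(wait_seconds)
        # Final attempt without a safety net, matching the helper's last call.
        return requests.get(url)

    # Usage, analogous to the call sites in get_bio and get_ajax:
    # response = get_with_retries("https://www.bundestag.de/...")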
@@ -319,12 +367,7 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     return response
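
The hunk above assembles the AJAX query string by joining key=value pairs before handing the URL to the shared retry helper. A tiny sketch of that step with made-up filter names and a placeholder endpoint, alongside the standard-library urlencode shown only as an equivalent (not what the script uses):

    from urllib.parse import urlencode

    # Made-up filter pairs standing in for sanitized_filters.
    sanitized_filters = [("view", "main"), ("limit", "20")]
    url = "https://example.invalid/ajax"  # placeholder endpoint

    # Manual join, as in the hunk above:
    manual = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)

    # Equivalent via the standard library (also percent-encodes values):
    encoded = url + "?" + urlencode(sanitized_filters)

    print(manual)
    print(encoded)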