Clean up: explicit UTF-8 encoding, proper exception handling, remove dead code
- Add encoding="utf-8" to all file writes - Catch requests.RequestException instead of bare except - Use raise_for_status() to also retry on HTTP errors - Use removeprefix/removesuffix instead of lstrip/rstrip - Use makedirs(exist_ok=True) - Remove unused common_suffix function and commonprefix import Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1a80fe1647
commit
14670538f6
1 changed file with 12 additions and 21 deletions
33
crawler.py
33
crawler.py
|
|
@ -3,7 +3,6 @@ import re
|
|||
import json
|
||||
from bs4 import BeautifulSoup
|
||||
from time import sleep
|
||||
from os.path import commonprefix
|
||||
from os import makedirs
|
||||
from git import Repo
|
||||
import argparse
|
||||
|
|
@ -167,16 +166,13 @@ def save_individuals(bios, out):
|
|||
first_letter = rep.name[0][0].upper()
|
||||
name_str = f"{rep.name[0]} {rep.name[1]}".replace(" ", "_")
|
||||
dir = f"{out}/Abgeordnete/{first_letter}"
|
||||
try:
|
||||
makedirs(dir)
|
||||
except FileExistsError:
|
||||
pass
|
||||
with open(f"{dir}/{name_str}.md", "w") as rep_file:
|
||||
makedirs(dir, exist_ok=True)
|
||||
with open(f"{dir}/{name_str}.md", "w", encoding="utf-8") as rep_file:
|
||||
rep_file.write(str(rep))
|
||||
|
||||
|
||||
def save_raw(bios, out):
    """Serialize every bio to ``<out>/raw.json`` as pretty-printed JSON.

    Each bio is converted via its ``to_dict()`` method; non-ASCII
    characters are kept verbatim (``ensure_ascii=False``) and the file
    is written as UTF-8.
    """
    serialized = [bio.to_dict() for bio in bios]
    with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
        json.dump(serialized, raw_file, indent=2, ensure_ascii=False)
|
||||
|
|
@ -184,10 +180,7 @@ def save_raw(bios, out):
|
|||
|
||||
def save_disclosures(bios, out):
|
||||
dir = f"{out}/Voep_Angaben"
|
||||
try:
|
||||
makedirs(dir)
|
||||
except FileExistsError:
|
||||
pass
|
||||
makedirs(dir, exist_ok=True)
|
||||
bios_with_discl = [bio for bio in bios if bio.disclosures]
|
||||
alpha_str = ""
|
||||
for bio in sorted(bios_with_discl, key=lambda b: b.name):
|
||||
|
|
@ -195,7 +188,7 @@ def save_disclosures(bios, out):
|
|||
alpha_str += funcs_to_str(bio.disclosures)
|
||||
alpha_str += "\n"
|
||||
|
||||
with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
|
||||
with open(f"{dir}/Alphabetisch.md", "w", encoding="utf-8") as alpha_file:
|
||||
alpha_file.write(alpha_str.strip())
|
||||
|
||||
party_str = ""
|
||||
|
|
@ -206,7 +199,7 @@ def save_disclosures(bios, out):
|
|||
party_str += funcs_to_str(bio.disclosures)
|
||||
party_str += "\n"
|
||||
|
||||
with open(f"{dir}/Nach_Partei.md", "w") as party_file:
|
||||
with open(f"{dir}/Nach_Partei.md", "w", encoding="utf-8") as party_file:
|
||||
party_file.write(party_str.strip())
|
||||
|
||||
|
||||
|
|
@ -293,9 +286,11 @@ def get_bio(url, name, sleep_for):
|
|||
def request_handle_rate_limit(url):
    """GET *url*, retrying on failures that look like rate limiting.

    Up to five attempts are made; any ``requests.RequestException``
    (connection errors as well as non-2xx statuses, via
    ``raise_for_status``) triggers a 5-minute wait before the next try.
    After all retries are exhausted, one final unguarded request is
    issued so the caller sees the underlying error.
    """
    attempts_left = 5
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response
        except requests.RequestException:
            print("Request failed! waiting 5min")
            sleep(300)
    # Last-chance request outside the retry loop; exceptions propagate.
    return requests.get(url)
|
||||
|
||||
|
|
@ -378,7 +373,7 @@ def parse_vote(page):
|
|||
def get_ajax(elem):
|
||||
if not elem:
|
||||
return None
|
||||
inner = elem.get("x-data").lstrip("dynamicTemplateOutput(").rstrip(")")
|
||||
inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")")
|
||||
data = json.loads(inner)
|
||||
url = BUNDESTAG_BASE_URL + data["endpoint"]
|
||||
filters = data["filters"]
|
||||
|
|
@ -391,9 +386,5 @@ def get_ajax(elem):
|
|||
return response
|
||||
|
||||
|
||||
def common_suffix(strings):
|
||||
return commonprefix([s[::-1] for s in strings])[::-1]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue