clean up: explicit utf-8 encoding, proper exception handling, remove dead code
- Add encoding="utf-8" to all file writes
- Catch requests.RequestException instead of bare except
- Use raise_for_status() to also retry on HTTP errors
- Use removeprefix/removesuffix instead of lstrip/rstrip
- Use makedirs(exist_ok=True)
- Remove unused common_suffix function and commonprefix import

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1a80fe1647
commit
14670538f6
1 changed files with 12 additions and 21 deletions
33
crawler.py
33
crawler.py
|
|
@@ -3,7 +3,6 @@ import re
|
||||||
import json
|
import json
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from os.path import commonprefix
|
|
||||||
from os import makedirs
|
from os import makedirs
|
||||||
from git import Repo
|
from git import Repo
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@@ -167,16 +166,13 @@ def save_individuals(bios, out):
|
||||||
first_letter = rep.name[0][0].upper()
|
first_letter = rep.name[0][0].upper()
|
||||||
name_str = f"{rep.name[0]} {rep.name[1]}".replace(" ", "_")
|
name_str = f"{rep.name[0]} {rep.name[1]}".replace(" ", "_")
|
||||||
dir = f"{out}/Abgeordnete/{first_letter}"
|
dir = f"{out}/Abgeordnete/{first_letter}"
|
||||||
try:
|
makedirs(dir, exist_ok=True)
|
||||||
makedirs(dir)
|
with open(f"{dir}/{name_str}.md", "w", encoding="utf-8") as rep_file:
|
||||||
except FileExistsError:
|
|
||||||
pass
|
|
||||||
with open(f"{dir}/{name_str}.md", "w") as rep_file:
|
|
||||||
rep_file.write(str(rep))
|
rep_file.write(str(rep))
|
||||||
|
|
||||||
|
|
||||||
def save_raw(bios, out):
|
def save_raw(bios, out):
|
||||||
with open(f"{out}/raw.json", "w") as raw_file:
|
with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
|
||||||
json.dump(
|
json.dump(
|
||||||
[bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
|
[bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
|
||||||
)
|
)
|
||||||
|
|
@@ -184,10 +180,7 @@ def save_raw(bios, out):
|
||||||
|
|
||||||
def save_disclosures(bios, out):
|
def save_disclosures(bios, out):
|
||||||
dir = f"{out}/Voep_Angaben"
|
dir = f"{out}/Voep_Angaben"
|
||||||
try:
|
makedirs(dir, exist_ok=True)
|
||||||
makedirs(dir)
|
|
||||||
except FileExistsError:
|
|
||||||
pass
|
|
||||||
bios_with_discl = [bio for bio in bios if bio.disclosures]
|
bios_with_discl = [bio for bio in bios if bio.disclosures]
|
||||||
alpha_str = ""
|
alpha_str = ""
|
||||||
for bio in sorted(bios_with_discl, key=lambda b: b.name):
|
for bio in sorted(bios_with_discl, key=lambda b: b.name):
|
||||||
|
|
@@ -195,7 +188,7 @@ def save_disclosures(bios, out):
|
||||||
alpha_str += funcs_to_str(bio.disclosures)
|
alpha_str += funcs_to_str(bio.disclosures)
|
||||||
alpha_str += "\n"
|
alpha_str += "\n"
|
||||||
|
|
||||||
with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
|
with open(f"{dir}/Alphabetisch.md", "w", encoding="utf-8") as alpha_file:
|
||||||
alpha_file.write(alpha_str.strip())
|
alpha_file.write(alpha_str.strip())
|
||||||
|
|
||||||
party_str = ""
|
party_str = ""
|
||||||
|
|
@@ -206,7 +199,7 @@ def save_disclosures(bios, out):
|
||||||
party_str += funcs_to_str(bio.disclosures)
|
party_str += funcs_to_str(bio.disclosures)
|
||||||
party_str += "\n"
|
party_str += "\n"
|
||||||
|
|
||||||
with open(f"{dir}/Nach_Partei.md", "w") as party_file:
|
with open(f"{dir}/Nach_Partei.md", "w", encoding="utf-8") as party_file:
|
||||||
party_file.write(party_str.strip())
|
party_file.write(party_str.strip())
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -293,9 +286,11 @@ def get_bio(url, name, sleep_for):
|
||||||
def request_handle_rate_limit(url):
|
def request_handle_rate_limit(url):
|
||||||
for _ in range(5):
|
for _ in range(5):
|
||||||
try:
|
try:
|
||||||
return requests.get(url)
|
response = requests.get(url)
|
||||||
except:
|
response.raise_for_status()
|
||||||
print("Rate limit! waiting 5min")
|
return response
|
||||||
|
except requests.RequestException:
|
||||||
|
print("Request failed! waiting 5min")
|
||||||
sleep(300)
|
sleep(300)
|
||||||
return requests.get(url)
|
return requests.get(url)
|
||||||
|
|
||||||
|
|
@@ -378,7 +373,7 @@ def parse_vote(page):
|
||||||
def get_ajax(elem):
|
def get_ajax(elem):
|
||||||
if not elem:
|
if not elem:
|
||||||
return None
|
return None
|
||||||
inner = elem.get("x-data").lstrip("dynamicTemplateOutput(").rstrip(")")
|
inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")")
|
||||||
data = json.loads(inner)
|
data = json.loads(inner)
|
||||||
url = BUNDESTAG_BASE_URL + data["endpoint"]
|
url = BUNDESTAG_BASE_URL + data["endpoint"]
|
||||||
filters = data["filters"]
|
filters = data["filters"]
|
||||||
|
|
@@ -391,9 +386,5 @@ def get_ajax(elem):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
def common_suffix(strings):
|
|
||||||
return commonprefix([s[::-1] for s in strings])[::-1]
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue