individual output
This commit is contained in:
parent
d2fac39099
commit
7358b47384
1 changed file with 31 additions and 12 deletions
43
crawler.py
43
crawler.py
|
|
@ -11,6 +11,9 @@ from datetime import datetime
|
||||||
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
|
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
|
||||||
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
|
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
|
||||||
|
|
||||||
|
DISCLOSURE_DISLAIMER = """Anzeigen nach den Verhaltensregeln (§§ 45 ff. Abgeordnetengesetz) sind von den Abgeordneten innerhalb von drei Monaten nach Erwerb der Mitgliedschaft einzureichen. Während der Wahlperiode sind Änderungen oder Ergänzungen innerhalb einer Frist von drei Monaten ab deren Eintritt mitzuteilen. Die Angaben werden nach Verarbeitung der Daten und Prüfung, ob eine Veröffentlichungspflicht besteht, an dieser Stelle veröffentlicht. Für weiterführende Informationen wird auf die "Hinweise zur Veröffentlichung der Angaben nach den Verhaltensregeln" auf den Internetseiten des Deutschen Bundestages verwiesen.
|
||||||
|
Die veröffentlichungspflichtigen Angaben der Abgeordneten der vergangenen Wahlperioden finden Sie im Archiv."""
|
||||||
|
|
||||||
|
|
||||||
class Biography:
|
class Biography:
|
||||||
def __init__(
|
def __init__(
|
||||||
|
|
@ -52,7 +55,7 @@ disclosures: {self.disclosures}
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if self.speeches:
|
if self.speeches:
|
||||||
speeches_str = "".join(
|
speeches_str = "".join(
|
||||||
[f"\n- {speech[1]}: {speech[0]}" for speech in self.speeches]
|
[f"\n- {speech[0]}: {speech[1]}" for speech in self.speeches]
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
speeches_str = ""
|
speeches_str = ""
|
||||||
|
|
@ -64,23 +67,27 @@ disclosures: {self.disclosures}
|
||||||
else:
|
else:
|
||||||
votes_str = ""
|
votes_str = ""
|
||||||
|
|
||||||
if self.functions:
|
|
||||||
func_str = "".join([f"\n- "])
|
|
||||||
txt = f"""
|
txt = f"""
|
||||||
|
# Persönliche Angaben
|
||||||
Name: {self.name[1]} {self.name[0]}
|
Name: {self.name[1]} {self.name[0]}
|
||||||
|
|
||||||
Partei: {self.party}
|
Partei: {self.party}
|
||||||
|
|
||||||
Biographie: {self.cv}
|
Biographie: {self.cv}
|
||||||
|
|
||||||
Reden: {speeches_str}
|
# Reden {speeches_str}
|
||||||
|
|
||||||
Abstimmungen: {votes_str}
|
# Abstimmungen {votes_str}
|
||||||
|
|
||||||
Ämter im Bundestag: {self.functions}
|
# Funktionen
|
||||||
additional_functions: {self.additional_functions}
|
## Ämter im Bundestag {funcs_to_str(self.functions)}
|
||||||
mandate: {self.mandate}
|
|
||||||
disclosures: {self.disclosures}
|
## Sonstige Gremien: {funcs_to_str(self.additional_functions)}
|
||||||
|
|
||||||
|
# Mandat {self.mandate[0], self.mandate[1]}
|
||||||
|
|
||||||
|
# Veröffentlichungspflichtige Angaben
|
||||||
|
{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
|
||||||
"""
|
"""
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
@ -98,6 +105,17 @@ disclosures: {self.disclosures}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def funcs_to_str(funcs):
    """Render a sequence of function entries as a nested bullet list.

    Each entry is indexed positionally: ``entry[0]`` is rendered as a
    top-level bullet (``"\\n- <entry[0]>"``) and every item of ``entry[1]``
    as a sub-bullet beneath it (``"\\n - <item>"``).

    Args:
        funcs: Sequence of entries as described above, or a falsy value
            (``None``, empty sequence).

    Returns:
        The concatenated bullet text, each bullet preceded by a newline.
        An empty string when ``funcs`` is falsy.
    """
    if not funcs:
        return ""

    # Collect the pieces and join once rather than growing a string with
    # ``+=`` inside nested loops.
    parts = []
    for func in funcs:
        parts.append(f"\n- {func[0]}")
        # A missing (None) sub-list is treated as empty instead of raising
        # TypeError while iterating.
        parts.extend(f"\n - {loc}" for loc in (func[1] or ()))
    return "".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
prog="Bundescrawler",
|
prog="Bundescrawler",
|
||||||
|
|
@ -116,6 +134,7 @@ def main():
|
||||||
bios = [get_bio(link, name) for link, name in zip(links, names)]
|
bios = [get_bio(link, name) for link, name in zip(links, names)]
|
||||||
|
|
||||||
save_raw(bios, args.out)
|
save_raw(bios, args.out)
|
||||||
|
save_individuals(bios, args.out)
|
||||||
|
|
||||||
repo.git.add("*")
|
repo.git.add("*")
|
||||||
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
|
|
@ -127,7 +146,7 @@ def save_individuals(bios, out):
|
||||||
for rep in bios:
|
for rep in bios:
|
||||||
first_letter = rep.name[0][0].upper()
|
first_letter = rep.name[0][0].upper()
|
||||||
name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
|
name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
|
||||||
with open(f"{out}/Abgeordnete/{first_letter}/{name_str}") as rep_file:
|
with open(f"{out}/Abgeordnete/{first_letter}/{name_str}.md") as rep_file:
|
||||||
json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
|
json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -227,9 +246,9 @@ def parse_speech(page):
|
||||||
if not page:
|
if not page:
|
||||||
return (None, None)
|
return (None, None)
|
||||||
soup = BeautifulSoup(page.content)
|
soup = BeautifulSoup(page.content)
|
||||||
infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
|
infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
|
||||||
titles = [
|
titles = [
|
||||||
title.text
|
title.text.strip()
|
||||||
for title in soup.find_all(class_="a-link__label")
|
for title in soup.find_all(class_="a-link__label")
|
||||||
if "--hidden" not in title.get("class")
|
if "--hidden" not in title.get("class")
|
||||||
][::2]
|
][::2]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue