Bundescrawler/crawler.py
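
"""Crawler for the biography pages of members of the German Bundestag.

Fetches the list of Abgeordnete from bundestag.de, downloads each biography page
(CV, speeches, votes, functions, mandate and veröffentlichungspflichtige Angaben)
and writes the results as Markdown and JSON into a git repository.
"""
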
import requests
import re
import json
from bs4 import BeautifulSoup
from time import sleep
from os.path import commonprefix
from os import makedirs
from git import Repo
import argparse
from datetime import datetime

BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
DISCLOSURE_DISLAIMER = """Anzeigen nach den Verhaltensregeln (§§ 45 ff. Abgeordnetengesetz) sind von den Abgeordneten innerhalb von drei Monaten nach Erwerb der Mitgliedschaft einzureichen. Während der Wahlperiode sind Änderungen oder Ergänzungen innerhalb einer Frist von drei Monaten ab deren Eintritt mitzuteilen. Die Angaben werden nach Verarbeitung der Daten und Prüfung, ob eine Veröffentlichungspflicht besteht, an dieser Stelle veröffentlicht. Für weiterführende Informationen wird auf die "Hinweise zur Veröffentlichung der Angaben nach den Verhaltensregeln" auf den Internetseiten des Deutschen Bundestages verwiesen.
Die veröffentlichungspflichtigen Angaben der Abgeordneten der vergangenen Wahlperioden finden Sie im Archiv."""


class Biography:
    def __init__(
        self,
        name,
        party,
        job,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.party = party
        self.job = job
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures

    def __repr__(self):
        txt = f"""
name: {self.name}
party: {self.party}
cv: {self.cv}
speeches: {self.speeches}
votes: {self.votes}
functions: {self.functions}
additional_functions: {self.additional_functions}
mandate: {self.mandate}
disclosures: {self.disclosures}
"""
        return txt

    def __str__(self):
        if self.speeches:
            speeches_str = "".join(
                [f"\n- {speech[0]}: {speech[1]}" for speech in self.speeches]
            )
        else:
            speeches_str = ""
        if self.votes:
            votes_str = "".join(
                [f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in self.votes]
            )
        else:
            votes_str = ""
        txt = f"""
# Persönliche Angaben
Name: {self.name[1]} {self.name[0]}
Partei: {self.party}
Beruf: {self.job}
Biographie: {self.cv}
# Reden {speeches_str}
# Abstimmungen {votes_str}
# Funktionen
## Ämter im Bundestag {funcs_to_str(self.functions)}
## Sonstige Gremien {funcs_to_str(self.additional_functions)}
# Mandat
{self.mandate[0]}, {self.mandate[1]}
# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
"""
        return txt

    def to_dict(self):
        return {
            "name": self.name,
            "party": self.party,
            "cv": self.cv,
            "speeches": self.speeches,
            "votes": self.votes,
            "functions": self.functions,
            "additional_functions": self.additional_functions,
            "mandate": self.mandate,
            "disclosures": self.disclosures,
        }


def funcs_to_str(funcs):
    """Render a list of (heading, entries) tuples as a nested Markdown list."""
    if not funcs:
        return ""
    out = ""
    for func in funcs:
        out += f"\n- {func[0]}"
        for loc in sorted(func[1]):
            out += f"\n - {loc}"
    return out


def main():
    parser = argparse.ArgumentParser(
        prog="Bundescrawler",
        description="Crawls the pages of german representatives and saves the information in a git repository",
    )
    parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
    parser.add_argument("--no-git", action="store_true")
    args = parser.parse_args()
    if not args.out:
        raise ValueError("must supply out directory")
    repo = Repo(args.out)
    links, names = get_links_and_names()
    if args.debug:
        links = links[:5]
        names = names[:5]
        sleep_for = 0
    else:
        sleep_for = 10
    bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
    if not args.debug:
        save_raw(bios, args.out)
        save_individuals(bios, args.out)
        save_disclosures(bios, args.out)
    if args.no_git:
        return
    repo.git.add("*")
    repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    origin = repo.remote(name="origin")
    origin.push()


def save_individuals(bios, out):
    for rep in bios:
        first_letter = rep.name[0][0].upper()
        name_str = f"{rep.name[0]} {rep.name[1]}".replace(" ", "_")
        dir = f"{out}/Abgeordnete/{first_letter}"
        try:
            makedirs(dir)
        except FileExistsError:
            pass
        with open(f"{dir}/{name_str}.md", "w") as rep_file:
            rep_file.write(str(rep))


def save_raw(bios, out):
    with open(f"{out}/raw.json", "w") as raw_file:
        json.dump(
            [bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
        )


def save_disclosures(bios, out):
    """Write all disclosures twice: sorted alphabetically and grouped by party."""
    dir = f"{out}/Voep_Angaben"
    try:
        makedirs(dir)
    except FileExistsError:
        pass
    bios_with_discl = [bio for bio in bios if bio.disclosures]
    alpha_str = ""
    for bio in sorted(bios_with_discl, key=lambda b: b.name):
        alpha_str += f"# {bio.name[1]} {bio.name[0]} ({bio.party})"
        alpha_str += funcs_to_str(bio.disclosures)
        alpha_str += "\n"
    with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
        alpha_file.write(alpha_str.strip())
    party_str = ""
    for party, bio_list in group_by_party(bios_with_discl):
        party_str += f"# {party}\n"
        for bio in bio_list:
            party_str += f"## {bio.name[1]} {bio.name[0]}"
            party_str += funcs_to_str(bio.disclosures)
            party_str += "\n"
    with open(f"{dir}/Nach_Partei.md", "w") as party_file:
        party_file.write(party_str.strip())


def group_by_party(bios):
    grouped = {}
    for bio in bios:
        if bio.party in grouped:
            grouped[bio.party].append(bio)
        else:
            grouped[bio.party] = [bio]
    as_list = [(key, val) for key, val in grouped.items()]
    as_list.sort(key=lambda party: party[0])
    return as_list


def get_links_and_names():
    """Fetch the MP list and return parallel lists of biography links and name/party entries."""
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.find_all("a")]
    # each link text looks like "Lastname, Firstname\n\n\nParty"
    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
    return (links, names)


def get_bio(url, name, sleep_for):
    # name arrives as ["Lastname, Firstname", "Party"] from get_links_and_names
    name, party = name
    name = name.split(", ")
    print(f"Getting {url} for {name[1]} {name[0]}")
    response = request_handle_rate_limit(url)
    soup = BeautifulSoup(response.content, "html.parser")
    job = soup.find(class_="m-biography__introInfo").find("span").text
    cv = soup.find(class_="m-biography__biography").text.strip()
    # speeches and votes are loaded via AJAX; their endpoints are encoded in the
    # x-data attribute of the placeholder divs
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        if "abstimmung" in div.get("x-data"):
            vote_div = div
        else:
            speech_div = div
    speech = get_ajax(speech_div)
    speeches = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
    function_divs = soup.find_all(class_="m-biography__memberships")
    if len(function_divs) > 0:
        functions = get_functions(function_divs[0])
    else:
        functions = None
    if len(function_divs) > 1:
        additional_functions = get_functions(function_divs[1])
    else:
        additional_functions = None
    mandate = (
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
    bio = Biography(
        name,
        party,
        job,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    sleep(sleep_for)
    return bio


def request_handle_rate_limit(url):
    """GET a URL, waiting five minutes and retrying (up to five times) when the request fails."""
    for _ in range(5):
        try:
            return requests.get(url)
        except requests.exceptions.RequestException:
            print("Rate limit! waiting 5min")
            sleep(300)
    return requests.get(url)


def get_disclosures(elem):
    """Parse the veröffentlichungspflichtigen Angaben into (heading, [entries]) tuples."""
    if not elem:
        return None
    # every div except the disclaimer block holds one category of disclosures
    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
    out = []
    for div in divs:
        current_heading = ""
        current_body = []
        for child in div.children:
            if child.name == "h3":
                if current_body != []:
                    out.append((current_heading, current_body))
                current_heading = child.text.strip()
                current_body = []
                continue
            if not child.name:
                continue
            if child.text.strip() == "":
                continue
            if child.text.strip() == "Keine veröffentlichungspflichtigen Angaben.":
                continue
            current_body.append(child.text.strip())
        if current_heading == "" and current_body == []:
            continue
        out.append((current_heading, current_body))
    return out


def get_functions(elem):
    """Parse a memberships block into (heading, sorted entries) tuples."""
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            if current_body != []:
                out.append((current_heading, sorted(current_body)))
            current_heading = child.text.strip()
            current_body = []
            continue
        if not child.name:
            continue
        current_body.extend(
            grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
            for grandchild in child.children
            if grandchild.text.strip() != ""
        )
    out.append((current_heading, sorted(current_body)))
    return out


def parse_speech(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
    # the page repeats each link label; keep every second one so titles line up with infos
    titles = [
        title.text.strip()
        for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
    ][::2]
    return list(zip(titles, infos))


def parse_vote(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    # skip the table header row, then collect the cell texts of each vote row
    rows = soup.find_all("tr")[1:]
    parsed = []
    for row in rows:
        cols = row.find_all("td")
        parsed.append([col.text.strip() for col in cols])
    return parsed


def get_ajax(elem):
    """Load the AJAX content referenced by a placeholder div's x-data attribute."""
    if not elem:
        return None
    # x-data looks like dynamicTemplateOutput({"endpoint": ..., "filters": {...}})
    inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")")
    data = json.loads(inner)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    response = request_handle_rate_limit(url)
    return response


def common_suffix(strings):
    """Return the longest common suffix of the given strings."""
    return commonprefix([s[::-1] for s in strings])[::-1]


if __name__ == "__main__":
    main()
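
# Example invocation (assumes the output directory is an existing git clone with
# an "origin" remote; paths below are placeholders):
#   python crawler.py -o /path/to/output-repo
#   python crawler.py -o /path/to/output-repo --debug --no-git   # crawl only the first five MPs, no commit/push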