"""Scraper for German Bundestag member biographies (bundestag.de)."""
import json
import re
from dataclasses import dataclass
from time import sleep
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
|
|
|
|
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
|
|
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
|
|
|
|
|
|
@dataclass
class Biography:
    """Container for one Bundestag member's scraped biography.

    Plain data holder; the dataclass decorator preserves the original
    positional constructor and adds ``__repr__``/``__eq__`` for free.
    """

    name: object                   # ["Last", "First"] as produced by get_bio
    cv: object                     # biography text from the profile page
    speeches: object               # (titles, infos) tuple, or (None, None)
    votes: object                  # list of vote-table rows, or None
    functions: object              # first membership section, or None
    additional_functions: object   # second membership section, or None
    mandate: object                # (mandate heading text, constituency string)
    disclosures: object            # disclosure disclaimer text
|
|
|
|
|
|
def main():
    """Scrape every member biography listed on the Bundestag site."""
    links, names = get_links_and_names()
    bios = []
    for link, name in zip(links, names):
        bios.append(get_bio(link, name))
|
|
|
|
|
|
def get_links_and_names():
    """Fetch the member overview page and extract profile links and names.

    Returns:
        (links, names): parallel lists; each name is the "Last, First"
        part of the anchor's title attribute.
    """
    response = requests.get(BUNDESTAG_URL)
    # Explicit parser avoids the bs4 "no parser specified" warning and
    # keeps parsing stable across environments.
    soup = BeautifulSoup(response.content, "html.parser")
    links = []
    names = []
    # Single pass instead of scanning find_all("a") twice.
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        title = anchor.get("title")
        # Skip anchors missing either attribute; calling .strip() on a
        # missing title would otherwise raise AttributeError.
        if href is None or title is None:
            continue
        links.append(href)
        # Title looks like "Name\n\n\n<party>..."; keep the name part.
        names.append(title.strip().split("\n\n\n")[0])
    return (links, names)
|
|
|
|
|
|
def get_bio(url, name):
    """Scrape a single member's biography page into a Biography.

    Args:
        url: absolute URL of the member's biography page.
        name: "Last, First" string taken from the overview list.

    Returns:
        Biography holding the raw scraped fields.
    """
    print(name)
    name = name.split(", ")
    print(f"Getting {url} for {name[1]} {name[0]}")
    response = requests.get(url)
    # Explicit parser keeps results stable across environments.
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()
    # Speeches and votes load via AJAX; the request parameters live in
    # each placeholder div's x-data attribute.
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        x_data = div.get("x-data")
        if x_data is None:
            # No x-data means no AJAX request can be built (and
            # `"abstimmung" in None` would raise TypeError); skip it.
            continue
        if "abstimmung" in x_data:
            vote_div = div
        else:
            speech_div = div
    speech = get_ajax(speech_div)
    # parse_speech returns (titles, infos) in that order; the previous
    # unpacking had the two names swapped.
    speech_titles, speech_infos = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
    function_divs = soup.find_all(class_="m-biography__membership")
    functions = get_functions(function_divs[0]) if len(function_divs) > 0 else None
    additional_functions = (
        get_functions(function_divs[1]) if len(function_divs) > 1 else None
    )
    mandate = (
        soup.find(class_="m-biography__subHeading --mandate").text,
        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
    )
    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
    # Debug dump of everything scraped for this member.
    print(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
    # Rate-limit successive profile fetches to be polite to the server.
    sleep(10)
    return Biography(
        name,
        cv,
        (speech_titles, speech_infos),
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    )
|
|
|
|
|
|
def get_functions(elem):
    """Group a membership element's children into (heading, items) pairs.

    Children are scanned in order: each <h3> starts a new section, and
    every other tag's stripped text is collected under the current
    heading.

    Args:
        elem: a bs4 Tag whose children describe one membership list.

    Returns:
        List of (heading, [entry, ...]) tuples; heading is None for
        entries appearing before the first <h3>. Empty list when the
        element has no recognizable content.
    """
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            # Flush the previous section before starting a new one.
            if current_body:
                out.append((current_heading, current_body))
            current_heading = child.text.strip()
            current_body = []
            continue
        # NavigableStrings (whitespace between tags) have no name; skip.
        if not child.name:
            continue
        current_body.append(child.text.strip())
    # Flush the final section, but don't emit a spurious (None, []) for
    # an element with no headings and no entries (the old unconditional
    # append returned [(None, [])] for an empty element).
    if current_heading is not None or current_body:
        out.append((current_heading, current_body))
    return out
|
|
|
|
|
|
def parse_speech(page):
    """Extract speech titles and info lines from a speeches AJAX page.

    Args:
        page: requests.Response for the speeches list, or None/falsy
            when no speech div was found.

    Returns:
        (titles, infos) lists, or (None, None) if no page was fetched.
    """
    if not page:
        return (None, None)
    # Explicit parser keeps results stable across environments.
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    titles = [
        label.text
        for label in soup.find_all(class_="a-link__label")
        # A label without a class attribute made the membership test
        # crash on None; treat such a label as not hidden.
        if "--hidden" not in (label.get("class") or [])
    ][::2]  # labels come in pairs; keep only the first of each pair
    return (titles, infos)
|
|
|
|
|
|
def parse_vote(page):
    """Parse a votes AJAX page's result table into rows of cell text.

    Args:
        page: requests.Response for the votes table, or None/falsy when
            no vote div was found.

    Returns:
        List of rows (each a list of stripped cell strings), or None if
        no page was fetched.
    """
    if not page:
        return None
    # Explicit parser keeps results stable across environments.
    soup = BeautifulSoup(page.content, "html.parser")
    # First <tr> is the header row; skip it.
    rows = soup.find_all("tr")[1:]
    return [[col.text.strip() for col in row.find_all("td")] for row in rows]
|
|
|
|
|
|
def get_ajax(elem):
    """Fetch the AJAX content described by an element's x-data attribute.

    The attribute looks like ``dynamicTemplateOutput({...json...})``;
    its JSON payload carries the endpoint path and the query filters.

    Args:
        elem: bs4 Tag carrying an x-data attribute, or None.

    Returns:
        requests.Response for the AJAX URL, or None when elem is None.
    """
    if not elem:
        return None
    raw = elem.get("x-data")
    # str.lstrip/rstrip strip *character sets*, not prefixes/suffixes,
    # so lstrip("dynamicTemplateOutput(") could eat leading JSON
    # characters; remove the wrapper explicitly instead.
    prefix = "dynamicTemplateOutput("
    suffix = ")"
    if raw.startswith(prefix) and raw.endswith(suffix):
        raw = raw[len(prefix):-len(suffix)]
    data = json.loads(raw)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    # urlencode (quote_plus) encodes space as '+' and '#' as '%23' like
    # the old hand-rolled sanitizer, plus every other reserved char.
    query = urlencode(data["filters"])
    return requests.get(f"{url}?{query}")
|
|
|
|
|
|
# Script entry point: scrape all biographies when run directly.
if __name__ == "__main__":
    main()
|