some basic functionality

This commit is contained in:
Marco Lents 2025-11-13 21:56:15 +01:00
parent 197b85d8e6
commit f8b33e1d6b
2 changed files with 112 additions and 0 deletions

107
crawler.py Normal file
View file

@@ -0,0 +1,107 @@
import requests
import json
from bs4 import BeautifulSoup
from time import sleep
# Listing endpoint that returns every MP biography teaser in one response
# (limit=9999 effectively disables paging on the Bundestag filter list).
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
# Prefix for the relative AJAX endpoints found in x-data attributes.
BUNDESTAG_BASE_URL = "https://www.bundestag.de"
class Biography:
    """Container for the data scraped from a single MP's biography page.

    Bug fix: the ``speeches`` argument was previously accepted but never
    stored on the instance; it is now kept like every other field.
    """

    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        self.speeches = speeches  # was silently dropped before
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures
def main():
    """Fetch the list of MP profile links and download each biography."""
    member_links, member_names = get_links_and_names()
    biographies = []
    for link in member_links:
        biographies.append(get_bio(link))
def get_links_and_names():
    """Fetch the MP listing page and extract anchor links and names.

    Returns:
        tuple: ``(links, names)`` where ``links`` is the ``href`` of every
        anchor on the page and ``names`` is the anchor's ``title`` attribute,
        stripped and split on the triple-newline separator the site uses.
    """
    response = requests.get(BUNDESTAG_URL)
    # Name the parser explicitly: bs4 otherwise warns and may pick a
    # different parser per environment, changing the extracted tree.
    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a")
    links = [a.get("href") for a in anchors]
    # Anchors without a title attribute would crash .strip() on None;
    # default to "" so such anchors yield [""] instead of raising.
    names = [(a.get("title") or "").strip().split("\n\n\n") for a in anchors]
    return (links, names)
def get_bio(url):
    """Download one MP biography page, then fetch and print its CV,
    speech data and voting record.

    Args:
        url: absolute URL of the MP's biography page.
    """
    print(f"Getting {url}")
    response = requests.get(url)
    # Explicit parser for deterministic results (see get_links_and_names).
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        # x-data may be absent; `or ""` keeps the membership test from
        # raising TypeError on None.
        if "abstimmung" in (div.get("x-data") or ""):
            vote_div = div
        else:
            speech_div = div
    speech = get_ajax(speech_div)
    speech_infos, speech_titles = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
    print(cv, speech_infos, speech_titles, votes)
    # Be polite: pause between requests so we don't hammer the server.
    sleep(10)
def parse_speech(page):
    """Extract speech info strings and visible link titles from the
    speeches AJAX fragment.

    Args:
        page: requests.Response for the fragment, or None/falsy when the
            fragment was not fetched.

    Returns:
        tuple: ``(titles, infos)``, or ``(None, None)`` when ``page`` is falsy.
        NOTE(review): the caller in get_bio unpacks this as
        ``speech_infos, speech_titles`` — the order looks swapped; confirm
        which ordering is intended before relying on the labels.
    """
    if not page:
        return (None, None)
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    # Keep only labels not marked hidden; `or []` guards a missing class
    # attribute. Labels appear in pairs, so take every other one.
    titles = [
        el.text
        for el in soup.find_all(class_="a-link__label")
        if "--hidden" not in (el.get("class") or [])
    ][::2]
    return (titles, infos)
def parse_vote(page):
    """Parse the voting-record table of the votes AJAX fragment.

    Args:
        page: requests.Response for the fragment, or None/falsy when the
            fragment was not fetched.

    Returns:
        list[list[str]] of stripped cell texts, one inner list per table
        row, or None when ``page`` is falsy.
    """
    if not page:
        return None
    # Explicit parser for deterministic results across environments.
    soup = BeautifulSoup(page.content, "html.parser")
    # The first <tr> is the header row — skip it.
    rows = soup.find_all("tr")[1:]
    return [[cell.text.strip() for cell in row.find_all("td")] for row in rows]
def get_ajax(elem):
    """Follow the AJAX endpoint encoded in an element's ``x-data`` attribute.

    The attribute holds ``dynamicTemplateOutput({...json...})``; the JSON
    carries the endpoint path and a filter dict that is appended as a
    query string.

    Args:
        elem: the element carrying the x-data attribute, or None/falsy.

    Returns:
        requests.Response for the AJAX URL, or None when ``elem`` is falsy.
    """
    if not elem:
        return None
    raw = elem.get("x-data")
    # BUG FIX: str.lstrip/str.rstrip take a *set of characters*, not a
    # prefix/suffix — lstrip("dynamicTemplateOutput(") could also strip
    # leading characters of the JSON payload. Remove the exact wrapper
    # instead.
    prefix = "dynamicTemplateOutput("
    if raw.startswith(prefix):
        raw = raw[len(prefix):]
    if raw.endswith(")"):
        raw = raw[:-1]
    data = json.loads(raw)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    # Minimal manual encoding for the two characters observed in filter
    # values (space and '#').
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in data["filters"].items()
    ]
    query = "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(f"{url}?{query}")
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
main()

View file

@@ -10,3 +10,8 @@ maintainers = [
]
description = "Crawls the website of the German parliament and tracks any changes in a separate repository."
dependencies = [
"beautifulsoup4",
"requests",
]