some basic functionality
parent 197b85d8e6
commit f8b33e1d6b

2 changed files with 112 additions and 0 deletions
crawler.py (new file, +107)

@@ -0,0 +1,107 @@
import requests
import json
from bs4 import BeautifulSoup
from time import sleep

BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de"


class Biography:
    def __init__(
        self,
        name,
        cv,
        speeches,
        votes,
        functions,
        additional_functions,
        mandate,
        disclosures,
    ):
        self.name = name
        self.cv = cv
        self.speeches = speeches
        self.votes = votes
        self.functions = functions
        self.additional_functions = additional_functions
        self.mandate = mandate
        self.disclosures = disclosures


def main():
    links, names = get_links_and_names()
    # names and Biography are not wired up yet; get_bio currently only prints.
    bios = [get_bio(link) for link in links]


def get_links_and_names():
    # Fetch the full member list and collect the link and display name of
    # every anchor on the page.
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get("href") for a in soup.find_all("a")]
    names = [(a.get("title") or "").strip().split("\n\n\n") for a in soup.find_all("a")]

    return (links, names)


def get_bio(url):
    print(f"Getting {url}")
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    cv = soup.find(class_="m-biography__biography").text.strip()
    # Each biography page embeds two lazily loaded panels: one for roll-call
    # votes ("abstimmung") and one for speeches.
    ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
    speech_div = None
    vote_div = None
    for div in ajax_divs:
        if "abstimmung" in div.get("x-data", ""):
            vote_div = div
        else:
            speech_div = div
    speech = get_ajax(speech_div)
    speech_titles, speech_infos = parse_speech(speech)
    vote = get_ajax(vote_div)
    votes = parse_vote(vote)
    print(cv, speech_infos, speech_titles, votes)
    sleep(10)  # be polite to the server between requests


def parse_speech(page):
    if not page:
        return (None, None)
    soup = BeautifulSoup(page.content, "html.parser")
    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
    # Drop labels carrying a BEM "--hidden" modifier class, then keep every
    # other remaining label.
    titles = [
        title.text
        for title in soup.find_all(class_="a-link__label")
        if not any("--hidden" in cls for cls in title.get("class", []))
    ][::2]
    return (titles, infos)


def parse_vote(page):
    if not page:
        return None
    soup = BeautifulSoup(page.content, "html.parser")
    # Skip the header row, then collect the cell texts of every vote row.
    rows = soup.find_all("tr")[1:]
    parsed = []
    for row in rows:
        cols = row.find_all("td")
        parsed.append([col.text.strip() for col in cols])
    return parsed


def get_ajax(elem):
    if not elem:
        return None
    # The x-data attribute wraps a JSON payload in a dynamicTemplateOutput()
    # call; removeprefix/removesuffix (unlike lstrip/rstrip, which strip
    # character sets) remove the exact wrapper and leave the JSON intact.
    inner = elem.get("x-data").removeprefix("dynamicTemplateOutput(").removesuffix(")")
    data = json.loads(inner)
    url = BUNDESTAG_BASE_URL + data["endpoint"]
    filters = data["filters"]
    # Escape the characters that actually occur in the filter values.
    sanitized_filters = [
        (key, value.replace(" ", "+").replace("#", "%23"))
        for key, value in filters.items()
    ]
    url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
    return requests.get(url)


if __name__ == "__main__":
    main()
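For context, get_ajax assumes that each lazily loaded panel carries an x-data attribute wrapping a JSON payload in a dynamicTemplateOutput(...) call, with "endpoint" and "filters" keys. A minimal sketch of the decoding step, using a made-up payload (the endpoint and filter values here are illustrative, not taken from bundestag.de):

    import json

    # Hypothetical x-data value; real endpoints and filters come from the
    # site's markup and will differ.
    x_data = 'dynamicTemplateOutput({"endpoint": "/ajax/filterlist/de/example", "filters": {"limit": "5"}})'

    inner = x_data.removeprefix("dynamicTemplateOutput(").removesuffix(")")
    data = json.loads(inner)
    print(data["endpoint"])  # /ajax/filterlist/de/example
    print(data["filters"])   # {'limit': '5'}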
pyproject.toml (+5)

@@ -10,3 +10,8 @@ maintainers = [
]

description = "Crawls the website of the German parliament and tracks any changes in a separate repository."

dependencies = [
    "beautifulsoup4",
    "requests",
]
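With beautifulsoup4 and requests declared as dependencies, the list-fetching half of the crawler can be tried out on its own; a minimal sketch, assuming the packages are installed and crawler.py is on the import path:

    # Fetch the member list once; links and names are parallel lists built
    # from the anchors on the filterlist page.
    from crawler import get_links_and_names

    links, names = get_links_and_names()
    print(len(links), "biography links found")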