commit 19cdfb486d
parent cb3186e00e
Author: Marco Lents
Date:   2025-11-14 11:28:39 +01:00

    save raw data as json


@@ -3,6 +3,9 @@ import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
+from os.path import commonprefix
+from os import makedirs
+import argparse
 
 BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
 BUNDESTAG_BASE_URL = "https://www.bundestag.de"
@@ -12,6 +15,7 @@ class Biography:
     def __init__(
         self,
         name,
+        party,
         cv,
         speeches,
         votes,
@@ -21,6 +25,7 @@ class Biography:
         disclosures,
     ):
         self.name = name
+        self.party = party
         self.cv = cv
         self.speeches = speeches
         self.votes = votes
@@ -29,22 +34,75 @@ class Biography:
         self.mandate = mandate
         self.disclosures = disclosures
 
     def __repr__(self):
         txt = f"""
         name: {self.name}
+        party: {self.party}
         cv: {self.cv}
         speeches: {self.speeches}
         votes: {self.votes}
         functions: {self.functions}
         additional_functions: {self.additional_functions}
         mandate: {self.mandate}
         disclosures: {self.disclosures}
         """
         return txt
 
+    def to_dict(self):
+        return {
+            "name": self.name,
+            "party": self.party,
+            "cv": self.cv,
+            "speeches": self.speeches,
+            "votes": self.votes,
+            "functions": self.functions,
+            "additional_functions": self.additional_functions,
+            "mandate": self.mandate,
+            "disclosures": self.disclosures,
+        }
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="Bundescrawler",
+        description="Crawls the pages of German representatives and saves the information in a git repository",
+    )
+    parser.add_argument("-o", "--out")
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+    if not args.out:
+        raise ValueError("must supply out directory")
+    try:
+        makedirs(args.out)
+    except FileExistsError:
+        print("Path already exists")
+    links, names = get_links_and_names()
+    if args.debug:
+        links = links[:5]
+        names = names[:5]
+    bios = [get_bio(link, name) for link, name in zip(links, names)]
+    save_info(bios, args.out)
+
+
+def save_info(bios, out):
+    with open(f"{out}/raw.json", "w") as raw_file:
+        json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
+
+
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
+    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
     return (links, names)
 
 
 def get_bio(url, name):
     print(name)
+    name, party = name
+    print(name)
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
@@ -60,7 +118,7 @@ def get_bio(url, name):
         else:
             speech_div = div
     speech = get_ajax(speech_div)
-    speech_infos, speech_titles = parse_speech(speech)
+    speeches = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
     function_divs = soup.find_all(class_="m-biography__membership")
@@ -78,28 +136,22 @@ def get_bio(url, name):
     )
     disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
-    print(
+    bio = Biography(
         name,
+        party,
         cv,
-        (speech_titles, speech_infos),
+        speeches,
         votes,
         functions,
         additional_functions,
         mandate,
         disclosures,
     )
-    sleep(10)
-    return Biography(
-        name,
-        cv,
-        (speech_titles, speech_infos),
-        votes,
-        functions,
-        additional_functions,
-        mandate,
-        disclosures,
-    )
+    print(bio)
+    sleep(1)
+    return bio
 
 
 def get_functions(elem):
@@ -124,13 +176,13 @@ def parse_speech(page):
     if not page:
         return (None, None)
     soup = BeautifulSoup(page.content)
-    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
+    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
         title.text
         for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
     ][::2]
-    return (titles, infos)
+    return list(zip(titles, infos))
 
 
 def parse_vote(page):
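A side note on the parse_speech change: list(zip(titles, infos)) keeps each speech title next to its info line in a single list that json.dump can serialize directly (tuples become JSON arrays), instead of two parallel lists that had to be carried around together. A minimal sketch with invented values:

    # Invented sample values; the real strings come from the speeches AJAX page.
    titles = ["Rede zum Haushalt", "Rede zur Digitalpolitik"]
    infos = ["12. März 2025", "3. April 2025"]
    speeches = list(zip(titles, infos))
    # -> [("Rede zum Haushalt", "12. März 2025"), ("Rede zur Digitalpolitik", "3. April 2025")]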
@@ -160,5 +212,9 @@ def get_ajax(elem):
     return requests.get(url)
 
 
+def common_suffix(strings):
+    return commonprefix([s[::-1] for s in strings])[::-1]
+
+
 if __name__ == "__main__":
     main()
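With this commit a crawl run ends with save_info writing the raw results to <out>/raw.json as a list of Biography.to_dict() dictionaries. A minimal sketch of reading that file back, assuming the crawler was previously run with -o out (the directory name and printed fields are illustrative):

    import json

    # Path follows the f"{out}/raw.json" pattern used by save_info().
    with open("out/raw.json") as raw_file:
        bios = json.load(raw_file)

    for bio in bios:
        # Keys mirror Biography.to_dict(): name, party, cv, speeches, votes,
        # functions, additional_functions, mandate, disclosures.
        print(bio["name"], bio["party"])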