Save raw data as JSON

This commit is contained in:
Marco Lents 2025-11-14 11:28:39 +01:00
parent cb3186e00e
commit 19cdfb486d

View file

@ -3,6 +3,9 @@ import re
import json import json
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from time import sleep from time import sleep
from os.path import commonprefix
from os import makedirs
import argparse
BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList" BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
BUNDESTAG_BASE_URL = "https://www.bundestag.de" BUNDESTAG_BASE_URL = "https://www.bundestag.de"
@ -12,6 +15,7 @@ class Biography:
def __init__( def __init__(
self, self,
name, name,
party,
cv, cv,
speeches, speeches,
votes, votes,
@ -21,6 +25,7 @@ class Biography:
disclosures, disclosures,
): ):
self.name = name self.name = name
self.party = party
self.cv = cv self.cv = cv
self.speeches = speeches self.speeches = speeches
self.votes = votes self.votes = votes
@ -29,22 +34,75 @@ class Biography:
self.mandate = mandate self.mandate = mandate
self.disclosures = disclosures self.disclosures = disclosures
def __repr__(self):
    """Return a multi-line dump of every biography field.

    The original triple-quoted f-string bakes a leading blank line (and any
    source indentation) into the output; building the text from explicit
    (label, value) pairs keeps the format under control and in sync with
    the field list.
    """
    fields = (
        ("name", self.name),
        ("party", self.party),
        ("cv", self.cv),
        ("speeches", self.speeches),
        ("votes", self.votes),
        ("functions", self.functions),
        ("additional_functions", self.additional_functions),
        ("mandate", self.mandate),
        ("disclosures", self.disclosures),
    )
    return "\n" + "\n".join(f"{label}: {value}" for label, value in fields) + "\n"
def to_dict(self):
    """Serialize all biography fields into a plain, JSON-ready dict."""
    field_names = (
        "name",
        "party",
        "cv",
        "speeches",
        "votes",
        "functions",
        "additional_functions",
        "mandate",
        "disclosures",
    )
    return {field: getattr(self, field) for field in field_names}
def main():
    """Entry point: crawl every representative's page and save the raw data.

    Requires -o/--out (output directory, created if missing). --debug limits
    the crawl to the first five representatives for quick test runs.

    Raises:
        ValueError: if no output directory was supplied.
    """
    parser = argparse.ArgumentParser(
        prog="Bundescrawler",
        description="Crawls the pages of german representatives and saves the information in a git repository",
    )
    parser.add_argument("-o", "--out")
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    if not args.out:
        raise ValueError("must supply out directory")
    # exist_ok replaces the try/except-FileExistsError dance around makedirs
    makedirs(args.out, exist_ok=True)
    links, names = get_links_and_names()
    if args.debug:
        # shorten the crawl drastically while developing
        links = links[:5]
        names = names[:5]
    bios = [get_bio(link, name) for link, name in zip(links, names)]
    save_info(bios, args.out)
def save_info(bios, out):
    """Dump the raw data of all crawled biographies to <out>/raw.json.

    Args:
        bios: iterable of objects exposing a to_dict() method.
        out: existing output directory to write into.
    """
    # utf-8 + ensure_ascii=False keep German names (umlauts) readable in the
    # file instead of \uXXXX escapes; json.dump would otherwise escape them.
    with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
        json.dump(
            [bio.to_dict() for bio in bios],
            raw_file,
            indent=2,
            ensure_ascii=False,
        )
def get_links_and_names():
    """Fetch the representative list page and return (links, names).

    Returns:
        tuple: (links, names) — parallel lists, one entry per anchor tag.
        Each names entry is the anchor text split on "\n\n\n"; downstream
        get_bio unpacks it as (name, party), so the page presumably renders
        name and party separated by blank lines — verify if layout changes.
    """
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content)
    # collect the anchors once instead of running find_all twice
    anchors = soup.find_all("a")
    links = [a.get("href") for a in anchors]
    names = [a.text.strip().split("\n\n\n") for a in anchors]
    return (links, names)
def get_bio(url, name): def get_bio(url, name):
print(name)
name, party = name
print(name) print(name)
name = name.split(", ") name = name.split(", ")
print(f"Getting {url} for {name[1]} {name[0]}") print(f"Getting {url} for {name[1]} {name[0]}")
@ -60,7 +118,7 @@ def get_bio(url, name):
else: else:
speech_div = div speech_div = div
speech = get_ajax(speech_div) speech = get_ajax(speech_div)
speech_infos, speech_titles = parse_speech(speech) speeches = parse_speech(speech)
vote = get_ajax(vote_div) vote = get_ajax(vote_div)
votes = parse_vote(vote) votes = parse_vote(vote)
function_divs = soup.find_all(class_="m-biography__membership") function_divs = soup.find_all(class_="m-biography__membership")
@ -78,28 +136,22 @@ def get_bio(url, name):
) )
disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip() disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
print( bio = Biography(
name, name,
party,
cv, cv,
(speech_titles, speech_infos), speeches,
votes, votes,
functions, functions,
additional_functions, additional_functions,
mandate, mandate,
disclosures, disclosures,
) )
sleep(10)
return Biography( print(bio)
name, sleep(1)
cv,
(speech_titles, speech_infos), return bio
votes,
functions,
additional_functions,
mandate,
disclosures,
)
def get_functions(elem): def get_functions(elem):
@ -124,13 +176,13 @@ def parse_speech(page):
if not page: if not page:
return (None, None) return (None, None)
soup = BeautifulSoup(page.content) soup = BeautifulSoup(page.content)
infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")] infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
titles = [ titles = [
title.text title.text
for title in soup.find_all(class_="a-link__label") for title in soup.find_all(class_="a-link__label")
if "--hidden" not in title.get("class") if "--hidden" not in title.get("class")
][::2] ][::2]
return (titles, infos) return list(zip(titles, infos))
def parse_vote(page): def parse_vote(page):
@ -160,5 +212,9 @@ def get_ajax(elem):
return requests.get(url) return requests.get(url)
def common_suffix(strings):
    """Return the longest suffix shared by every string in *strings*.

    Works by reversing each string, taking the common prefix of the
    reversals, and reversing the result back. Returns "" for an empty list.
    """
    mirrored = [text[::-1] for text in strings]
    shared = commonprefix(mirrored)
    return shared[::-1]
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()