full crawler functionality

This commit is contained in:
Marco Lents 2025-11-14 10:10:24 +01:00
parent 2583829836
commit cb3186e00e

View file

@ -1,4 +1,5 @@
import requests
import re
import json
from bs4 import BeautifulSoup
from time import sleep
@ -21,6 +22,7 @@ class Biography:
):
self.name = name
self.cv = cv
self.speeches = speeches
self.votes = votes
self.functions = functions
self.additional_functions = additional_functions
@ -30,20 +32,22 @@ class Biography:
def main():
    """Crawl the overview page and fetch one biography per listed member.

    Link and name lists come from ``get_links_and_names`` and are paired
    positionally, because ``get_bio`` needs both the profile URL and the
    display name.
    """
    links, names = get_links_and_names()
    # The one-argument call ``get_bio(bio)`` is obsolete: get_bio now takes
    # (url, name), so only the paired form is kept.
    bios = [get_bio(link, name) for link, name in zip(links, names)]
def get_links_and_names():
    """Fetch the overview page and return the profile links and names.

    Returns:
        A ``(links, names)`` tuple of two equally long lists. ``links[i]``
        is the ``href`` of an anchor and ``names[i]`` is the first
        ``"\\n\\n\\n"``-separated segment of that anchor's stripped ``title``.
    """
    response = requests.get(BUNDESTAG_URL)
    soup = BeautifulSoup(response.content)
    # Only anchors carrying both attributes are usable: an anchor without a
    # title would make ``.strip()`` fail on None, and one without an href
    # would hand None to the downloader. A single scan also guarantees that
    # links and names stay aligned index by index.
    anchors = soup.find_all("a", href=True, title=True)
    links = [a["href"] for a in anchors]
    names = [a["title"].strip().split("\n\n\n")[0] for a in anchors]
    return (links, names)
def get_bio(url):
print(f"Getting {url}")
def get_bio(url, name):
print(name)
name = name.split(", ")
print(f"Getting {url} for {name[1]} {name[0]}")
response = requests.get(url)
soup = BeautifulSoup(response.content)
cv = soup.find(class_="m-biography__biography").text.strip()
@ -59,9 +63,62 @@ def get_bio(url):
speech_infos, speech_titles = parse_speech(speech)
vote = get_ajax(vote_div)
votes = parse_vote(vote)
print(cv, speech_infos, speech_titles, votes)
function_divs = soup.find_all(class_="m-biography__membership")
if len(function_divs) > 0:
functions = get_functions(function_divs[0])
else:
functions = None
if len(function_divs) > 1:
additional_functions = get_functions(function_divs[1])
else:
additional_functions = None
mandate = (
soup.find(class_="m-biography__subHeading --mandate").text,
soup.find(string=re.compile(r"^Wahlkreis \d*:")),
)
disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
print(
name,
cv,
(speech_titles, speech_infos),
votes,
functions,
additional_functions,
mandate,
disclosures,
)
sleep(10)
return Biography(
name,
cv,
(speech_titles, speech_infos),
votes,
functions,
additional_functions,
mandate,
disclosures,
)
def get_functions(elem):
    """Group the children of ``elem`` into ``(heading, items)`` pairs.

    Every ``h3`` child starts a new group whose heading is the stripped
    text of that ``h3``. Children without a tag name are skipped; the
    stripped text of every other child is appended to the current group.
    Items that appear before the first ``h3`` are grouped under ``None``.

    Groups without any items are omitted, both between headings and at the
    end, so an element with no usable content yields an empty list instead
    of a placeholder ``(None, [])`` entry.

    Args:
        elem: An object exposing ``children``; each child must provide
            ``name`` and ``text``.

    Returns:
        A list of ``(heading, items)`` tuples in document order.
    """
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            # A new heading closes the previous group (if it has content).
            if current_body:
                out.append((current_heading, current_body))
            current_heading = child.text.strip()
            current_body = []
            continue
        if not child.name:
            # Nodes without a tag name carry no entry of their own.
            continue
        current_body.append(child.text.strip())
    # Flush the last group with the same non-empty rule used inside the loop.
    if current_body:
        out.append((current_heading, current_body))
    return out
def parse_speech(page):
if not page: