diff --git a/crawler.py b/crawler.py
index 59a5600..017b72c 100644
--- a/crawler.py
+++ b/crawler.py
@@ -3,6 +3,9 @@ import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
+from os.path import commonprefix
+from os import makedirs
+import argparse
 
 BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
 BUNDESTAG_BASE_URL = "https://www.bundestag.de"
@@ -12,6 +15,7 @@ class Biography:
     def __init__(
         self,
         name,
+        party,
         cv,
         speeches,
         votes,
@@ -21,6 +25,7 @@ class Biography:
         disclosures,
     ):
         self.name = name
+        self.party = party
         self.cv = cv
         self.speeches = speeches
         self.votes = votes
@@ -29,22 +34,75 @@ class Biography:
         self.mandate = mandate
         self.disclosures = disclosures
 
+    def __repr__(self):
+        txt = f"""
+name: {self.name}
+party: {self.party}
+cv: {self.cv}
+speeches: {self.speeches}
+votes: {self.votes}
+functions: {self.functions}
+additional_functions: {self.additional_functions}
+mandate: {self.mandate}
+disclosures: {self.disclosures}
+        """
+        return txt
+
+    def to_dict(self):
+        return {
+            "name": self.name,
+            "party": self.party,
+            "cv": self.cv,
+            "speeches": self.speeches,
+            "votes": self.votes,
+            "functions": self.functions,
+            "additional_functions": self.additional_functions,
+            "mandate": self.mandate,
+            "disclosures": self.disclosures,
+        }
+
 
 def main():
+    parser = argparse.ArgumentParser(
+        prog="Bundescrawler",
+        description="Crawls the pages of German representatives and saves the information in a git repository",
+    )
+    parser.add_argument("-o", "--out")
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+    if not args.out:
+        raise ValueError("must supply out directory")
+    try:
+        makedirs(args.out)
+    except FileExistsError:
+        print("Path already exists")
+        pass
     links, names = get_links_and_names()
+    if args.debug:
+        links = links[:5]
+        names = names[:5]
     bios = [get_bio(link, name) for link, name in zip(links, names)]
+    save_info(bios, args.out)
+
+
+def save_info(bios, out):
+    with open(f"{out}/raw.json", "w") as raw_file:
+        json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
 
 
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
+    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
     return (links, names)
 
 
 def get_bio(url, name):
+    print(name)
+    name, party = name
     print(name)
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
@@ -60,7 +118,7 @@ def get_bio(url, name):
     else:
         speech_div = div
     speech = get_ajax(speech_div)
-    speech_infos, speech_titles = parse_speech(speech)
+    speeches = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
     function_divs = soup.find_all(class_="m-biography__membership")
@@ -78,28 +136,22 @@
     )
     disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
 
-    print(
+    bio = Biography(
         name,
+        party,
         cv,
-        (speech_titles, speech_infos),
+        speeches,
         votes,
         functions,
         additional_functions,
         mandate,
         disclosures,
     )
 
-    sleep(10)
-    return Biography(
-        name,
-        cv,
-        (speech_titles, speech_infos),
-        votes,
-        functions,
-        additional_functions,
-        mandate,
-        disclosures,
-    )
+    print(bio)
+    sleep(1)
+
+    return bio
 
 
 def get_functions(elem):
@@ -124,13 +176,13 @@
     if not page:
-        return (None, None)
+        return []
     soup = BeautifulSoup(page.content)
-    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
+    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
         title.text
         for title in soup.find_all(class_="a-link__label")
         if "--hidden" not in title.get("class")
     ][::2]
-    return (titles, infos)
+    return list(zip(titles, infos))
 
 
 def parse_vote(page):
@@ -160,5 +212,9 @@
     return requests.get(url)
 
 
+def common_suffix(strings):
+    return commonprefix([s[::-1] for s in strings])[::-1]
+
+
 if __name__ == "__main__":
     main()
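
A minimal sketch of what the added common_suffix helper computes; it is not yet called anywhere in the diff, and the sample strings below are made up for illustration:

    # common_suffix reverses each string, takes the shared prefix via
    # os.path.commonprefix, and reverses the result back.
    from os.path import commonprefix

    def common_suffix(strings):
        return commonprefix([s[::-1] for s in strings])[::-1]

    # Hypothetical link titles that share trailing boilerplate:
    print(common_suffix(["Mustermann, Erika (CDU/CSU)",
                         "Musterfrau, Maxi (CDU/CSU)"]))  # -> " (CDU/CSU)"

Assuming the script is run directly, the new CLI would be invoked as, e.g., `python crawler.py --out data` (full crawl, writing raw.json into data/) or `python crawler.py --out data --debug` (first five members only).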