save raw data as json
This commit is contained in:
parent cb3186e00e
commit 19cdfb486d

1 changed file with 73 additions and 17 deletions

crawler.py (90 lines changed)
@@ -3,6 +3,9 @@ import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
+from os.path import commonprefix
+from os import makedirs
+import argparse
 
 BUNDESTAG_URL = "https://www.bundestag.de/ajax/filterlist/de/abgeordnete/biografien/1040594-1040594?limit=9999&view=BTBiographyList"
 BUNDESTAG_BASE_URL = "https://www.bundestag.de"
@@ -12,6 +15,7 @@ class Biography:
     def __init__(
         self,
         name,
+        party,
         cv,
         speeches,
         votes,
@@ -21,6 +25,7 @@ class Biography:
         disclosures,
     ):
         self.name = name
+        self.party = party
         self.cv = cv
         self.speeches = speeches
         self.votes = votes
@@ -29,22 +34,75 @@ class Biography:
         self.mandate = mandate
         self.disclosures = disclosures
 
+    def __repr__(self):
+        txt = f"""
+        name: {self.name}
+        party: {self.party}
+        cv: {self.cv}
+        speeches: {self.speeches}
+        votes: {self.votes}
+        functions: {self.functions}
+        additional_functions: {self.additional_functions}
+        mandate: {self.mandate}
+        disclosures: {self.disclosures}
+        """
+        return txt
+
+    def to_dict(self):
+        return {
+            "name": self.name,
+            "party": self.party,
+            "cv": self.cv,
+            "speeches": self.speeches,
+            "votes": self.votes,
+            "functions": self.functions,
+            "additional_functions": self.additional_functions,
+            "mandate": self.mandate,
+            "disclosures": self.disclosures,
+        }
+
 
 def main():
+    parser = argparse.ArgumentParser(
+        prog="Bundescrawler",
+        description="Crawls the pages of german representatives and saves the information in a git repository",
+    )
+    parser.add_argument("-o", "--out")
+    parser.add_argument("--debug", action="store_true")
+    args = parser.parse_args()
+    if not args.out:
+        raise ValueError("must supply out directory")
+    try:
+        makedirs(args.out)
+    except FileExistsError:
+        print("Path already exists")
+        pass
     links, names = get_links_and_names()
+    if args.debug:
+        links = links[:5]
+        names = names[:5]
     bios = [get_bio(link, name) for link, name in zip(links, names)]
 
+    save_info(bios, args.out)
+
+
+def save_info(bios, out):
+    with open(f"{out}/raw.json", "w") as raw_file:
+        json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
+
 
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
+    names = [a.text.strip().split("\n\n\n") for a in soup.find_all("a")]
 
     return (links, names)
 
 
 def get_bio(url, name):
+    print(name)
+    name, party = name
     print(name)
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
@@ -60,7 +118,7 @@ def get_bio(url, name):
     else:
         speech_div = div
     speech = get_ajax(speech_div)
-    speech_infos, speech_titles = parse_speech(speech)
+    speeches = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
     function_divs = soup.find_all(class_="m-biography__membership")
@@ -78,28 +136,22 @@ def get_bio(url, name):
     )
     disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
 
-    print(
+    bio = Biography(
         name,
+        party,
         cv,
-        (speech_titles, speech_infos),
+        speeches,
         votes,
         functions,
         additional_functions,
         mandate,
         disclosures,
     )
-    sleep(10)
 
-    return Biography(
-        name,
-        cv,
-        (speech_titles, speech_infos),
-        votes,
-        functions,
-        additional_functions,
-        mandate,
-        disclosures,
-    )
+    print(bio)
+    sleep(1)
+
+    return bio
 
 
 def get_functions(elem):
@@ -124,13 +176,13 @@ def parse_speech(page):
     if not page:
         return (None, None)
     soup = BeautifulSoup(page.content)
-    infos = [s.text for s in soup.find_all(class_="m-biography__speechtitle")]
+    infos = [s.text for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
         title.text
         for title in soup.find_all(class_="a-link__label")
        if "--hidden" not in title.get("class")
     ][::2]
-    return (titles, infos)
+    return list(zip(titles, infos))
 
 
 def parse_vote(page):
@@ -160,5 +212,9 @@ def get_ajax(elem):
     return requests.get(url)
 
 
+def common_suffix(strings):
+    return commonprefix([s[::-1] for s in strings])[::-1]
+
+
 if __name__ == "__main__":
     main()
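
A note on the new raw dump: save_info() serializes every Biography via to_dict() into <out>/raw.json, where <out> is the directory passed with -o/--out (e.g. python crawler.py -o out --debug). A minimal sketch of reading that file back, assuming only the key names visible in to_dict() above; "out" is a hypothetical directory name:

    import json

    # Load the dump written by save_info(); "out" stands in for whatever
    # directory was passed to the crawler via -o/--out.
    with open("out/raw.json") as raw_file:
        bios = json.load(raw_file)

    # Each entry mirrors Biography.to_dict():
    for bio in bios:
        print(bio["name"], bio["party"])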
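
parse_speech() now returns one list of (title, info) pairs via list(zip(titles, infos)) instead of two parallel lists, so the speeches field is a single JSON-serializable value (tuples become arrays in the dump). A small illustration with hypothetical placeholder data:

    # Hypothetical scraped values; the real ones come from the
    # m-biography__speechTitle and a-link__label elements.
    titles = ["Rede A", "Rede B"]
    infos = ["7. Sitzung, 14.12.2022", "11. Sitzung, 15.12.2022"]

    speeches = list(zip(titles, infos))
    # [("Rede A", "7. Sitzung, 14.12.2022"), ("Rede B", "11. Sitzung, 15.12.2022")]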
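
The new common_suffix() helper is not called anywhere in this diff yet. It computes the longest common suffix by reversing each string, taking os.path.commonprefix, and reversing the result back:

    from os.path import commonprefix

    def common_suffix(strings):
        # commonprefix of the reversed strings is the shared suffix, reversed
        return commonprefix([s[::-1] for s in strings])[::-1]

    # Hypothetical example: a shared party tail on scraped name strings
    print(common_suffix(["Alice Weber, SPD", "Bob Kranz, SPD"]))  # ", SPD"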