full crawler functionality
parent 2583829836
commit cb3186e00e
1 changed file with 62 additions and 5 deletions
crawler.py
@@ -1,4 +1,5 @@
 import requests
+import re
 import json
 from bs4 import BeautifulSoup
 from time import sleep
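Assuming the added line in this hunk is indeed import re (it is the only one of these imports the code added below appears to need), a quick standalone check of the Wahlkreis pattern it enables; the sample strings are invented, not scraped output:

import re

# Pattern used further down in get_bio(); the example lines are made up.
wahlkreis = re.compile(r"^Wahlkreis \d*:")
print(bool(wahlkreis.match("Wahlkreis 61: Potsdam - Potsdam-Mittelmark II")))  # True
print(bool(wahlkreis.match("Landesliste Brandenburg")))                        # False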
@@ -21,6 +22,7 @@ class Biography:
     ):
         self.name = name
         self.cv = cv
         self.speeches = speeches
         self.votes = votes
         self.functions = functions
+        self.additional_functions = additional_functions
@@ -30,20 +32,22 @@ class Biography:
 
 def main():
     links, names = get_links_and_names()
-    bios = [get_bio(bio) for bio in links]
+    bios = [get_bio(link, name) for link, name in zip(links, names)]
 
 
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
     links = [a.get("href") for a in soup.find_all("a")]
-    names = [a.get("title").strip().split("\n\n\n") for a in soup.find_all("a")]
+    names = [a.get("title").strip().split("\n\n\n")[0] for a in soup.find_all("a")]
 
     return (links, names)
 
 
-def get_bio(url):
-    print(f"Getting {url}")
+def get_bio(url, name):
+    print(name)
+    name = name.split(", ")
+    print(f"Getting {url} for {name[1]} {name[0]}")
     response = requests.get(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
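Both soup constructions in this hunk pass only response.content; without a named parser, bs4 emits a "no parser was explicitly specified" warning and may pick different parsers on different machines. A minimal sketch of the explicit form, where html.parser is just one option and the URL is a stand-in for the crawler's BUNDESTAG_URL constant, whose value is not shown in this diff:

import requests
from bs4 import BeautifulSoup

URL = "https://www.bundestag.de/abgeordnete"  # placeholder, not necessarily BUNDESTAG_URL

response = requests.get(URL)
# Naming the parser avoids the bs4 warning and keeps parsing identical across environments.
soup = BeautifulSoup(response.content, "html.parser")
print(len(soup.find_all("a")), "links found")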
@@ -59,9 +63,62 @@ def get_bio(url):
     speech_infos, speech_titles = parse_speech(speech)
     vote = get_ajax(vote_div)
     votes = parse_vote(vote)
-    print(cv, speech_infos, speech_titles, votes)
+    function_divs = soup.find_all(class_="m-biography__membership")
+    if len(function_divs) > 0:
+        functions = get_functions(function_divs[0])
+    else:
+        functions = None
+    if len(function_divs) > 1:
+        additional_functions = get_functions(function_divs[1])
+    else:
+        additional_functions = None
+    mandate = (
+        soup.find(class_="m-biography__subHeading --mandate").text,
+        soup.find(string=re.compile(r"^Wahlkreis \d*:")),
+    )
+    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+
+    print(
+        name,
+        cv,
+        (speech_titles, speech_infos),
+        votes,
+        functions,
+        additional_functions,
+        mandate,
+        disclosures,
+    )
     sleep(10)
 
+    return Biography(
+        name,
+        cv,
+        (speech_titles, speech_infos),
+        votes,
+        functions,
+        additional_functions,
+        mandate,
+        disclosures,
+    )
+
+
+def get_functions(elem):
+    out = []
+    current_heading = None
+    current_body = []
+    for child in elem.children:
+        if child.name == "h3":
+            if current_body != []:
+                out.append((current_heading, current_body))
+            current_heading = child.text.strip()
+            current_body = []
+            continue
+        if not child.name:
+            continue
+        current_body.append(child.text.strip())
+    out.append((current_heading, current_body))
+    return out
+
 
 def parse_speech(page):
     if not page:
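A self-contained sketch of how the new get_functions() groups membership entries under their h3 headings; the HTML fragment is invented for illustration and does not claim to match the real bundestag.de markup:

from bs4 import BeautifulSoup

# Invented fragment, loosely shaped like a membership block; not the real markup.
FRAGMENT = """
<div class="m-biography__membership">
  <h3>Ordentliches Mitglied</h3>
  <p>Ausschuss A</p>
  <p>Ausschuss B</p>
  <h3>Stellvertretendes Mitglied</h3>
  <p>Ausschuss C</p>
</div>
"""

def get_functions(elem):  # copied verbatim from the function added above
    out = []
    current_heading = None
    current_body = []
    for child in elem.children:
        if child.name == "h3":
            if current_body != []:
                out.append((current_heading, current_body))
            current_heading = child.text.strip()
            current_body = []
            continue
        if not child.name:  # whitespace-only text nodes report no tag name in current bs4 and are skipped
            continue
        current_body.append(child.text.strip())
    out.append((current_heading, current_body))
    return out

soup = BeautifulSoup(FRAGMENT, "html.parser")
print(get_functions(soup.find(class_="m-biography__membership")))
# [('Ordentliches Mitglied', ['Ausschuss A', 'Ausschuss B']),
#  ('Stellvertretendes Mitglied', ['Ausschuss C'])]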