Compare commits
2 commits
d47627643f...5501f07cf7
| Author | SHA1 | Date |
|---|---|---|
|  | 5501f07cf7 |  |
|  | 6ef8fcc993 |  |
1 changed file with 56 additions and 13 deletions
crawler.py (+56 -13)
```diff
@@ -139,8 +139,10 @@ def main():
 
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
 
-    save_raw(bios, args.out)
+    if not args.debug:
+        save_raw(bios, args.out)
     save_individuals(bios, args.out)
+    save_disclosures(bios, args.out)
 
     if args.no_git:
         return
```
```diff
@@ -171,6 +173,47 @@ def save_raw(bios, out):
     )
 
 
+def save_disclosures(bios, out):
+    dir = f"{out}/Voep_Angaben"
+    try:
+        makedirs(dir)
+    except FileExistsError:
+        pass
+    bios_with_discl = [bio for bio in bios if bio.disclosures]
+    alpha_str = ""
+    for bio in sorted(bios_with_discl, key=lambda b: b.name):
+        alpha_str += f"# {bio.name[1]} {bio.name[0]} ({bio.party})"
+        alpha_str += funcs_to_str(bio.disclosures)
+        alpha_str += "\n"
+
+    with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
+        alpha_file.write(alpha_str.strip())
+
+    party_str = ""
+    for party, bio_list in group_by_party(bios_with_discl):
+        party_str += f"# {party}\n"
+        for bio in bio_list:
+            party_str += f"## {bio.name[1]} {bio.name[0]}"
+            party_str += funcs_to_str(bio.disclosures)
+            party_str += "\n"
+
+    with open(f"{dir}/Nach_Partei.md", "w") as party_file:
+        party_file.write(party_str.strip())
+
+
+def group_by_party(bios):
+    grouped = {}
+    for bio in bios:
+        if bio.party in grouped.keys():
+            grouped[bio.party].append(bio)
+        else:
+            grouped[bio.party] = [bio]
+
+    as_list = [(key, val) for key, val in grouped.items()]
+    as_list.sort(key=lambda party: party[0])
+    return as_list
+
+
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
```
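For a sense of what the new `group_by_party` helper returns, here is a minimal, self-contained sketch. The `Bio` namedtuple and the sample names are hypothetical stand-ins for the crawler's real bio objects; only the fields the helper actually touches are modeled.

```python
# Hypothetical stand-in for the crawler's bio objects: name is a
# ("Lastname", "Firstname") tuple, matching name.split(", ") in get_bio.
from collections import namedtuple

Bio = namedtuple("Bio", ["name", "party", "disclosures"])

def group_by_party(bios):
    # Copied from the diff: bucket bios by party, then return the
    # buckets as a list of (party, [bios]) tuples sorted by party name.
    grouped = {}
    for bio in bios:
        if bio.party in grouped.keys():
            grouped[bio.party].append(bio)
        else:
            grouped[bio.party] = [bio]

    as_list = [(key, val) for key, val in grouped.items()]
    as_list.sort(key=lambda party: party[0])
    return as_list

bios = [
    Bio(("Musterfrau", "Erika"), "SPD", ["..."]),  # sample data only
    Bio(("Mustermann", "Max"), "CDU/CSU", ["..."]),
    Bio(("Beispiel", "Bernd"), "SPD", ["..."]),
]
for party, members in group_by_party(bios):
    print(party, [f"{m.name[1]} {m.name[0]}" for m in members])
# CDU/CSU ['Max Mustermann']
# SPD ['Erika Musterfrau', 'Bernd Beispiel']
```

The `in grouped.keys()` test plus manual list creation could also be written as `grouped.setdefault(bio.party, []).append(bio)`, and `sorted(grouped.items())` would give the same ordering; the diff's version is just more explicit.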
```diff
@@ -184,12 +227,7 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
```
```diff
@@ -236,6 +274,16 @@ def get_bio(url, name, sleep_for):
     return bio
 
 
+def request_handle_rate_limit(url):
+    for _ in range(5):
+        try:
+            return requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    return requests.get(url)
+
+
 def get_disclosures(elem):
     if not elem:
         return None
```
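The new helper retries a request up to five times, sleeping five minutes after any exception, then falls back to one final unguarded `requests.get` whose exception would propagate to the caller. A quick way to sanity-check that behavior without network traffic or real sleeps is to patch `requests.get` and `sleep`. The snippet below is hypothetical test scaffolding, not part of the diff; it assumes crawler.py is importable as `crawler` and that it uses `from time import sleep`.

```python
# Hypothetical test scaffolding: exercise the retry loop with a fake
# requests.get and a no-op sleep.
from unittest import mock

import crawler  # assumption: crawler.py is on the import path

calls = {"n": 0}

def fake_get(url):
    # Fail twice, as a rate-limited endpoint might, then succeed.
    calls["n"] += 1
    if calls["n"] < 3:
        raise ConnectionError("simulated rate limit")
    return f"response for {url}"

with mock.patch("requests.get", fake_get), \
        mock.patch.object(crawler, "sleep", lambda seconds: None):
    result = crawler.request_handle_rate_limit("https://example.org")

print(result)      # response for https://example.org
print(calls["n"])  # 3 -- two simulated failures, then one success
```

Note that the bare `except:` treats every failure as a rate limit, including errors that have nothing to do with throttling.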
```diff
@@ -319,12 +367,7 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = request_handle_rate_limit(url)
     return response
```