diff --git a/crawler.py b/crawler.py
index ff05b46..7b5ea7d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -139,10 +139,8 @@ def main():
 
     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
 
-    if not args.debug:
-        save_raw(bios, args.out)
+    save_raw(bios, args.out)
     save_individuals(bios, args.out)
-    save_disclosures(bios, args.out)
 
     if args.no_git:
         return
@@ -173,47 +171,6 @@ def save_raw(bios, out):
     )
 
 
-def save_disclosures(bios, out):
-    dir = f"{out}/Voep_Angaben"
-    try:
-        makedirs(dir)
-    except FileExistsError:
-        pass
-    bios_with_discl = [bio for bio in bios if bio.disclosures]
-    alpha_str = ""
-    for bio in sorted(bios_with_discl, key=lambda b: b.name):
-        alpha_str += f"# {bio.name[1]} {bio.name[0]} ({bio.party})"
-        alpha_str += funcs_to_str(bio.disclosures)
-        alpha_str += "\n"
-
-    with open(f"{dir}/Alphabetisch.md", "w") as alpha_file:
-        alpha_file.write(alpha_str.strip())
-
-    party_str = ""
-    for party, bio_list in group_by_party(bios_with_discl):
-        party_str += f"# {party}\n"
-        for bio in bio_list:
-            party_str += f"## {bio.name[1]} {bio.name[0]}"
-            party_str += funcs_to_str(bio.disclosures)
-            party_str += "\n"
-
-    with open(f"{dir}/Nach_Partei.md", "w") as party_file:
-        party_file.write(party_str.strip())
-
-
-def group_by_party(bios):
-    grouped = {}
-    for bio in bios:
-        if bio.party in grouped.keys():
-            grouped[bio.party].append(bio)
-        else:
-            grouped[bio.party] = [bio]
-
-    as_list = [(key, val) for key, val in grouped.items()]
-    as_list.sort(key=lambda party: party[0])
-    return as_list
-
-
 def get_links_and_names():
     response = requests.get(BUNDESTAG_URL)
     soup = BeautifulSoup(response.content)
@@ -227,7 +184,12 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    response = request_handle_rate_limit(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -274,16 +236,6 @@ def get_bio(url, name, sleep_for):
     return bio
 
 
-def request_handle_rate_limit(url):
-    for _ in range(5):
-        try:
-            return requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
-    return requests.get(url)
-
-
 def get_disclosures(elem):
     if not elem:
         return None
@@ -367,7 +319,12 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    response = request_handle_rate_limit(url)
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+        except:
+            print("Rate limit! waiting 5min")
+            sleep(300)
     return response
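
Note on the inlined retry loops in get_bio() and get_ajax(): the removed request_handle_rate_limit() returned on the first successful GET and made one final attempt after five failures, whereas the inlined for _ in range(5) blocks never leave the loop on success (so each URL is fetched up to five times) and leave response unbound if every attempt raises. A minimal sketch of a helper that keeps the old behaviour is below; the name fetch_with_retry and its parameters are illustrative and not part of the patch, and it catches requests.RequestException instead of using a bare except.

import requests
from time import sleep


def fetch_with_retry(url, attempts=5, wait=300):
    # Illustrative helper (hypothetical name), mirroring the removed
    # request_handle_rate_limit(): return on the first successful GET,
    # wait after each failure, then make one last attempt.
    for _ in range(attempts):
        try:
            return requests.get(url)
        except requests.RequestException:
            print(f"Rate limit! waiting {wait // 60}min")
            sleep(wait)
    return requests.get(url)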