diff --git a/crawler.py b/crawler.py
index 969c837..7b5ea7d 100644
--- a/crawler.py
+++ b/crawler.py
@@ -88,8 +88,7 @@
 Biographie:
 {self.cv}
 # Mandat
 {self.mandate[0]}, {self.mandate[1]}
-# Veröffentlichungspflichtige Angaben
-{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
+# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures) if self.disclosures else ""}
 """
         return txt
@@ -113,7 +112,7 @@ def funcs_to_str(funcs):
     out = ""
     for func in funcs:
         out += f"\n- {func[0]}"
-        for loc in func[1]:
+        for loc in sorted(func[1]):
             out += f"\n - {loc}"
     return out

@@ -125,6 +124,7 @@ def main():
     )
     parser.add_argument("-o", "--out")
     parser.add_argument("--debug", action="store_true")
+    parser.add_argument("--no-git", action="store_true")
     args = parser.parse_args()
     if not args.out:
         raise ValueError("must supply out directory")
@@ -133,15 +133,18 @@
     if args.debug:
         links = links[:5]
         names = names[:5]
-        sleep_for = 1
+        sleep_for = 0
     else:
-        sleep_for = 5
+        sleep_for = 10

     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]

     save_raw(bios, args.out)
     save_individuals(bios, args.out)

+    if args.no_git:
+        return
+
     repo.git.add("*")
     repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     origin = repo.remote(name="origin")
@@ -181,7 +184,16 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    response = requests.get(url)
+    # Retry up to five times, backing off when the server starts refusing requests.
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    else:
+        raise RuntimeError(f"could not fetch {url} after 5 attempts")
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -209,7 +221,7 @@
         soup.find(class_="m-biography__subHeading --mandate").text,
         soup.find(string=re.compile(r"^Wahlkreis \d*:")),
     )
-    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
+    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))

     bio = Biography(
         name,
@@ -228,6 +240,31 @@
     return bio


+def get_disclosures(elem):
+    """Parse the disclosures section into (heading, [entries]) tuples."""
+    if not elem:
+        return None
+    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
+    out = []
+    for div in divs:
+        current_heading = ""
+        current_body = []
+        for child in div.children:
+            if child.name == "h3":
+                if current_body != []:
+                    out.append((current_heading, current_body))
+                current_heading = child.text.strip()
+                current_body = []
+                continue
+            if not child.name:
+                continue
+            if child.text.strip() == "":
+                continue
+            current_body.append(child.text.strip())
+        out.append((current_heading, current_body))
+    return out
+
+
 def get_functions(elem):
     out = []
     current_heading = None
@@ -235,7 +272,7 @@
     for child in elem.children:
         if child.name == "h3":
             if current_body != []:
-                out.append((current_heading, current_body))
+                out.append((current_heading, sorted(current_body)))
             current_heading = child.text.strip()
             current_body = []
             continue
@@ -246,13 +283,13 @@
             for grandchild in child.children
             if grandchild.text.strip() != ""
         )
-    out.append((current_heading, current_body))
+    out.append((current_heading, sorted(current_body)))
     return out


 def parse_speech(page):
     if not page:
-        return (None, None)
+        return None
     soup = BeautifulSoup(page.content)
     infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
@@ -287,7 +324,17 @@
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    return requests.get(url)
+    # Same retry/backoff pattern as in get_bio.
+    for _ in range(5):
+        try:
+            response = requests.get(url)
+            break
+        except requests.RequestException:
+            print("Rate limit! waiting 5min")
+            sleep(300)
+    else:
+        raise RuntimeError(f"could not fetch {url} after 5 attempts")
+    return response


 def common_suffix(strings):