diff --git a/crawler.py b/crawler.py
index 7b5ea7d..969c837 100644
--- a/crawler.py
+++ b/crawler.py
@@ -88,7 +88,8 @@ Biographie: {self.cv}
 # Mandat
 {self.mandate[0]}, {self.mandate[1]}

-# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
+# Veröffentlichungspflichtige Angaben
+{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
 """
         return txt

@@ -112,7 +113,7 @@ def funcs_to_str(funcs):
     out = ""
     for func in funcs:
         out += f"\n- {func[0]}"
-        for loc in sorted(func[1]):
+        for loc in func[1]:
             out += f"\n - {loc}"
     return out

@@ -124,7 +125,6 @@ def main():
     )
     parser.add_argument("-o", "--out")
     parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--no-git", action="store_true")
     args = parser.parse_args()
     if not args.out:
         raise ValueError("must supply out directory")
@@ -133,18 +133,15 @@ def main():
     if args.debug:
         links = links[:5]
         names = names[:5]
-        sleep_for = 0
+        sleep_for = 1
     else:
-        sleep_for = 10
+        sleep_for = 5

     bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]

     save_raw(bios, args.out)
     save_individuals(bios, args.out)

-    if args.no_git:
-        return
-
     repo.git.add("*")
     repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
     origin = repo.remote(name="origin")
@@ -184,12 +181,7 @@ def get_bio(url, name, sleep_for):
     name, party = name
     name = name.split(", ")
     print(f"Getting {url} for {name[1]} {name[0]}")
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
+    response = requests.get(url)
     soup = BeautifulSoup(response.content)
     cv = soup.find(class_="m-biography__biography").text.strip()
     ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@@ -217,7 +209,7 @@ def get_bio(url, name, sleep_for):
         soup.find(class_="m-biography__subHeading --mandate").text,
         soup.find(string=re.compile(r"^Wahlkreis \d*:")),
     )
-    disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
+    disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()

     bio = Biography(
         name,
@@ -236,30 +228,6 @@ def get_bio(url, name, sleep_for):
     return bio


-def get_disclosures(elem):
-    if not elem:
-        return None
-    divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
-    out = []
-    for div in divs:
-        current_heading = ""
-        current_body = []
-        for child in div.children:
-            if child.name == "h3":
-                if current_body != []:
-                    out.append((current_heading, current_body))
-                current_heading = child.text.strip()
-                current_body = []
-                continue
-            if not child.name:
-                continue
-            if child.text.strip() == "":
-                continue
-            current_body.append(child.text.strip())
-        out.append((current_heading, current_body))
-    return out
-
-
 def get_functions(elem):
     out = []
     current_heading = None
@@ -267,7 +235,7 @@ def get_functions(elem):
     for child in elem.children:
         if child.name == "h3":
             if current_body != []:
-                out.append((current_heading, sorted(current_body)))
+                out.append((current_heading, current_body))
             current_heading = child.text.strip()
             current_body = []
             continue
@@ -278,13 +246,13 @@ def get_functions(elem):
             for grandchild in child.children
             if grandchild.text.strip() != ""
         )
-    out.append((current_heading, sorted(current_body)))
+    out.append((current_heading, current_body))
     return out


 def parse_speech(page):
     if not page:
-        return None
+        return (None, None)
     soup = BeautifulSoup(page.content)
     infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
     titles = [
@@ -319,13 +287,7 @@ def get_ajax(elem):
         for key, value in filters.items()
     ]
     url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
-    for _ in range(5):
-        try:
-            response = requests.get(url)
-        except:
-            print("Rate limit! waiting 5min")
-            sleep(300)
-    return response
+    return requests.get(url)


 def common_suffix(strings):