From 3f0130c75b20f93c48a2b79187886a36fbc77b6f Mon Sep 17 00:00:00 2001 From: Marco Lents Date: Sat, 15 Nov 2025 15:30:07 +0100 Subject: [PATCH] fix disclosure handling --- crawler.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/crawler.py b/crawler.py index c47432e..46aa103 100644 --- a/crawler.py +++ b/crawler.py @@ -88,8 +88,7 @@ Biographie: {self.cv} # Mandat {self.mandate[0]}, {self.mandate[1]} -# Veröffentlichungspflichtige Angaben -{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")} +# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)} """ return txt @@ -125,6 +124,7 @@ def main(): ) parser.add_argument("-o", "--out") parser.add_argument("--debug", action="store_true") + parser.add_argument("--no-git", action="store_true") args = parser.parse_args() if not args.out: raise ValueError("must supply out directory") @@ -142,6 +142,9 @@ def main(): save_raw(bios, args.out) save_individuals(bios, args.out) + if args.no_git: + return + repo.git.add("*") repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) origin = repo.remote(name="origin") @@ -209,7 +212,7 @@ def get_bio(url, name, sleep_for): soup.find(class_="m-biography__subHeading --mandate").text, soup.find(string=re.compile(r"^Wahlkreis \d*:")), ) - disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip() + disclosures = get_disclosures(soup.find(class_="m-biography__infos")) bio = Biography( name, @@ -228,6 +231,30 @@ def get_bio(url, name, sleep_for): return bio +def get_disclosures(elem): + if not elem: + return None + divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer") + out = [] + for div in divs: + current_heading = "" + current_body = [] + for child in div.children: + if child.name == "h3": + if current_body != []: + out.append((current_heading, current_body)) + current_heading = child.text.strip() + current_body = [] + continue + if not child.name: + continue + if 
child.text.strip() == "": + continue + current_body.append(child.text.strip()) + out.append((current_heading, current_body)) + return out + + def get_functions(elem): out = [] current_heading = None