fix disclosure handling
This commit is contained in:
parent
93310a8030
commit
3f0130c75b
1 changed files with 30 additions and 3 deletions
33
crawler.py
33
crawler.py
|
|
@ -88,8 +88,7 @@ Biographie: {self.cv}
|
||||||
# Mandat
|
# Mandat
|
||||||
{self.mandate[0]}, {self.mandate[1]}
|
{self.mandate[0]}, {self.mandate[1]}
|
||||||
|
|
||||||
# Veröffentlichungspflichtige Angaben
|
# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
|
||||||
{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
|
|
||||||
"""
|
"""
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
@ -125,6 +124,7 @@ def main():
|
||||||
)
|
)
|
||||||
parser.add_argument("-o", "--out")
|
parser.add_argument("-o", "--out")
|
||||||
parser.add_argument("--debug", action="store_true")
|
parser.add_argument("--debug", action="store_true")
|
||||||
|
parser.add_argument("--no-git", action="store_true")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if not args.out:
|
if not args.out:
|
||||||
raise ValueError("must supply out directory")
|
raise ValueError("must supply out directory")
|
||||||
|
|
@ -142,6 +142,9 @@ def main():
|
||||||
save_raw(bios, args.out)
|
save_raw(bios, args.out)
|
||||||
save_individuals(bios, args.out)
|
save_individuals(bios, args.out)
|
||||||
|
|
||||||
|
if args.no_git:
|
||||||
|
return
|
||||||
|
|
||||||
repo.git.add("*")
|
repo.git.add("*")
|
||||||
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
origin = repo.remote(name="origin")
|
origin = repo.remote(name="origin")
|
||||||
|
|
@ -209,7 +212,7 @@ def get_bio(url, name, sleep_for):
|
||||||
soup.find(class_="m-biography__subHeading --mandate").text,
|
soup.find(class_="m-biography__subHeading --mandate").text,
|
||||||
soup.find(string=re.compile(r"^Wahlkreis \d*:")),
|
soup.find(string=re.compile(r"^Wahlkreis \d*:")),
|
||||||
)
|
)
|
||||||
disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
|
disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
|
||||||
|
|
||||||
bio = Biography(
|
bio = Biography(
|
||||||
name,
|
name,
|
||||||
|
|
@ -228,6 +231,30 @@ def get_bio(url, name, sleep_for):
|
||||||
return bio
|
return bio
|
||||||
|
|
||||||
|
|
||||||
|
def get_disclosures(elem):
    """Collect the disclosure sections found below *elem*.

    Every ``div`` under *elem* whose class is not
    ``m-biography__infoDisclaimer`` is scanned child by child: an ``h3``
    starts a new section, and every other element child with non-blank
    text becomes one entry of the current section's body.

    Args:
        elem: Element to scan. The visible caller passes the result of
            ``soup.find(class_="m-biography__infos")``, so this is
            presumably a BeautifulSoup tag or ``None``.

    Returns:
        ``None`` when *elem* is falsy, otherwise a list of
        ``(heading, body)`` tuples, where ``heading`` is the stripped
        ``h3`` text (``""`` for content that precedes any heading) and
        ``body`` is a list of stripped text strings.
    """
    if not elem:
        return None

    divs = elem.find_all(
        "div", class_=lambda cls: cls != "m-biography__infoDisclaimer"
    )

    out = []
    for div in divs:
        current_heading = ""
        current_body = []
        for child in div.children:
            if child.name == "h3":
                # A new heading closes the previous section; sections
                # that collected no body are dropped.
                if current_body:
                    out.append((current_heading, current_body))
                current_heading = child.text.strip()
                current_body = []
                continue
            # Children without a tag name (bare text nodes) are skipped.
            if not child.name:
                continue
            text = child.text.strip()
            if text:
                current_body.append(text)
        # Flush the last section of this div, but do not emit a wholly
        # empty ("", []) entry for a div with neither heading nor content.
        if current_heading or current_body:
            out.append((current_heading, current_body))
    return out
|
||||||
|
|
||||||
|
|
||||||
def get_functions(elem):
|
def get_functions(elem):
|
||||||
out = []
|
out = []
|
||||||
current_heading = None
|
current_heading = None
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue