fix function decoding
This commit is contained in:
parent
aaa372fe21
commit
d2fac39099
2 changed files with 56 additions and 5 deletions
60
crawler.py
60
crawler.py
|
|
@@ -45,6 +45,41 @@ votes: {self.votes}
|
||||||
functions: {self.functions}
|
functions: {self.functions}
|
||||||
additional_functions: {self.additional_functions}
|
additional_functions: {self.additional_functions}
|
||||||
mandate: {self.mandate}
|
mandate: {self.mandate}
|
||||||
|
disclosures: {self.disclosures}
|
||||||
|
"""
|
||||||
|
return txt
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
if self.speeches:
|
||||||
|
speeches_str = "".join(
|
||||||
|
[f"\n- {speech[1]}: {speech[0]}" for speech in self.speeches]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
speeches_str = ""
|
||||||
|
|
||||||
|
if self.votes:
|
||||||
|
votes_str = "".join(
|
||||||
|
[f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in self.votes]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
votes_str = ""
|
||||||
|
|
||||||
|
if self.functions:
|
||||||
|
func_str = "".join([f"\n- "])
|
||||||
|
txt = f"""
|
||||||
|
Name: {self.name[1]} {self.name[0]}
|
||||||
|
|
||||||
|
Partei: {self.party}
|
||||||
|
|
||||||
|
Biographie: {self.cv}
|
||||||
|
|
||||||
|
Reden: {speeches_str}
|
||||||
|
|
||||||
|
Abstimmungen: {votes_str}
|
||||||
|
|
||||||
|
Ämter im Bundestag: {self.functions}
|
||||||
|
additional_functions: {self.additional_functions}
|
||||||
|
mandate: {self.mandate}
|
||||||
disclosures: {self.disclosures}
|
disclosures: {self.disclosures}
|
||||||
"""
|
"""
|
||||||
return txt
|
return txt
|
||||||
|
|
@@ -80,16 +115,27 @@ def main():
|
||||||
names = names[:5]
|
names = names[:5]
|
||||||
bios = [get_bio(link, name) for link, name in zip(links, names)]
|
bios = [get_bio(link, name) for link, name in zip(links, names)]
|
||||||
|
|
||||||
save_info(bios, args.out)
|
save_raw(bios, args.out)
|
||||||
|
|
||||||
repo.git.add("*")
|
repo.git.add("*")
|
||||||
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
origin = repo.remote(name="origin")
|
origin = repo.remote(name="origin")
|
||||||
origin.push()
|
origin.push()
|
||||||
|
|
||||||
|
|
||||||
def save_info(bios, out):
|
def save_individuals(bios, out):
|
||||||
|
for rep in bios:
|
||||||
|
first_letter = rep.name[0][0].upper()
|
||||||
|
name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
|
||||||
|
with open(f"{out}/Abgeordnete/{first_letter}/{name_str}") as rep_file:
|
||||||
|
json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def save_raw(bios, out):
|
||||||
with open(f"{out}/raw.json", "w") as raw_file:
|
with open(f"{out}/raw.json", "w") as raw_file:
|
||||||
json.dump([bio.to_dict() for bio in bios], raw_file, indent=2)
|
json.dump(
|
||||||
|
[bio.to_dict() for bio in bios], raw_file, indent=2, ensure_ascii=False
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_links_and_names():
|
def get_links_and_names():
|
||||||
|
|
@@ -122,7 +168,7 @@ def get_bio(url, name):
|
||||||
speeches = parse_speech(speech)
|
speeches = parse_speech(speech)
|
||||||
vote = get_ajax(vote_div)
|
vote = get_ajax(vote_div)
|
||||||
votes = parse_vote(vote)
|
votes = parse_vote(vote)
|
||||||
function_divs = soup.find_all(class_="m-biography__membership")
|
function_divs = soup.find_all(class_="m-biography__memberships")
|
||||||
if len(function_divs) > 0:
|
if len(function_divs) > 0:
|
||||||
functions = get_functions(function_divs[0])
|
functions = get_functions(function_divs[0])
|
||||||
else:
|
else:
|
||||||
|
|
@@ -168,7 +214,11 @@ def get_functions(elem):
|
||||||
continue
|
continue
|
||||||
if not child.name:
|
if not child.name:
|
||||||
continue
|
continue
|
||||||
current_body.append(child.text.strip())
|
current_body.extend(
|
||||||
|
grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
|
||||||
|
for grandchild in child.children
|
||||||
|
if grandchild.text.strip() != ""
|
||||||
|
)
|
||||||
out.append((current_heading, current_body))
|
out.append((current_heading, current_body))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -14,4 +14,5 @@ description = "Crawls the website of the german parlament and tracks any changes
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"requests",
|
"requests",
|
||||||
|
"gitpython"
|
||||||
]
|
]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue