fix function decoding
This commit is contained in:
parent
aaa372fe21
commit
d2fac39099
2 changed files with 56 additions and 5 deletions
60
crawler.py
60
crawler.py
|
|
@ -45,6 +45,41 @@ votes: {self.votes}
|
|||
functions: {self.functions}
|
||||
additional_functions: {self.additional_functions}
|
||||
mandate: {self.mandate}
|
||||
disclosures: {self.disclosures}
|
||||
"""
|
||||
return txt
|
||||
|
||||
def __str__(self):
    """Return a multi-line, human-readable summary of this record.

    Reads ``name``, ``party``, ``cv``, ``speeches``, ``votes``,
    ``functions``, ``additional_functions``, ``mandate`` and
    ``disclosures`` from ``self``.

    Speeches and votes are rendered as one ``- ...`` bullet per entry; an
    empty or missing collection renders as an empty string.  All other
    attributes are interpolated with their default string conversion.

    Returns:
        str: the formatted text, starting and ending with a newline.
    """
    # For each speech, element 1 is printed before element 0.
    # ``or ()`` keeps the original behaviour for falsy values such as None.
    speeches_str = "".join(
        f"\n- {speech[1]}: {speech[0]}" for speech in (self.speeches or ())
    )

    # Each vote contributes its first three elements in index order.
    votes_str = "".join(
        f"\n- {vote[0]}, {vote[1]}: {vote[2]}" for vote in (self.votes or ())
    )

    # NOTE(review): ``functions`` is still interpolated as-is (no bullet
    # formatting like speeches/votes).  The previous half-written
    # ``func_str`` placeholder was never used and has been dropped.
    return f"""
Name: {self.name[1]} {self.name[0]}

Partei: {self.party}

Biographie: {self.cv}

Reden: {speeches_str}

Abstimmungen: {votes_str}

Ämter im Bundestag: {self.functions}
additional_functions: {self.additional_functions}
mandate: {self.mandate}
disclosures: {self.disclosures}
"""
|
||||
|
|
@ -80,16 +115,27 @@ def main():
|
|||
names = names[:5]
|
||||
bios = [get_bio(link, name) for link, name in zip(links, names)]
|
||||
|
||||
save_info(bios, args.out)
|
||||
save_raw(bios, args.out)
|
||||
|
||||
repo.git.add("*")
|
||||
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||
origin = repo.remote(name="origin")
|
||||
origin.push()
|
||||
|
||||
|
||||
def save_info(bios, out):
|
||||
def save_individuals(bios, out):
    """Write one file per representative below ``{out}/Abgeordnete/<X>/``.

    ``<X>`` is the upper-cased first character of ``rep.name[0]``; the file
    name is ``rep.name[1]`` followed by ``rep.name[0]`` with spaces replaced
    by underscores.  The file content is ``str(rep)`` serialised with
    ``json.dump``.

    Args:
        bios: iterable of objects exposing an indexable ``name`` and a
            string conversion.
        out: path of the output root directory.

    Raises:
        OSError: if a directory or file cannot be created or written.
    """
    # Function-scope import: the module's import block is not visible here.
    import os

    for rep in bios:
        first_letter = rep.name[0][0].upper()
        name_str = f"{rep.name[1]} {rep.name[0]}".replace(" ", "_")
        target_dir = f"{out}/Abgeordnete/{first_letter}"
        # The per-letter directory may not exist yet on a fresh checkout.
        os.makedirs(target_dir, exist_ok=True)
        # Open for writing (the default mode is read-only, which made
        # json.dump fail) and pin UTF-8 because ensure_ascii=False emits
        # non-ASCII characters verbatim.
        with open(f"{target_dir}/{name_str}", "w", encoding="utf-8") as rep_file:
            # NOTE(review): this stores the text as a single JSON string
            # (newlines escaped); kept as-is to preserve the file format.
            json.dump(str(rep), rep_file, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def save_raw(bios, out):
    """Dump every biography's ``to_dict()`` result to ``{out}/raw.json``.

    The list is written exactly once, as a single JSON document indented by
    two spaces, with non-ASCII characters kept verbatim.

    Args:
        bios: iterable of objects providing a ``to_dict()`` method whose
            result is JSON-serialisable.
        out: path of an existing output directory.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if a ``to_dict()`` result is not JSON-serialisable.
    """
    records = [bio.to_dict() for bio in bios]
    # Explicit UTF-8: ensure_ascii=False writes umlauts etc. unescaped, so
    # the result must not depend on the platform's default encoding.
    with open(f"{out}/raw.json", "w", encoding="utf-8") as raw_file:
        # A single dump only -- writing the list twice into the same handle
        # would produce two concatenated documents, i.e. invalid JSON.
        json.dump(records, raw_file, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def get_links_and_names():
|
||||
|
|
@ -122,7 +168,7 @@ def get_bio(url, name):
|
|||
speeches = parse_speech(speech)
|
||||
vote = get_ajax(vote_div)
|
||||
votes = parse_vote(vote)
|
||||
function_divs = soup.find_all(class_="m-biography__membership")
|
||||
function_divs = soup.find_all(class_="m-biography__memberships")
|
||||
if len(function_divs) > 0:
|
||||
functions = get_functions(function_divs[0])
|
||||
else:
|
||||
|
|
@ -168,7 +214,11 @@ def get_functions(elem):
|
|||
continue
|
||||
if not child.name:
|
||||
continue
|
||||
current_body.append(child.text.strip())
|
||||
current_body.extend(
|
||||
grandchild.text.strip().replace("\n\n\n(Interner Link)", "")
|
||||
for grandchild in child.children
|
||||
if grandchild.text.strip() != ""
|
||||
)
|
||||
out.append((current_heading, current_body))
|
||||
return out
|
||||
|
||||
|
|
|
|||
|
|
@ -14,4 +14,5 @@ description = "Crawls the website of the german parlament and tracks any changes
|
|||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"gitpython"
|
||||
]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue