Compare commits

..

No commits in common. "d47627643f9637469d84a415d4e16ccba6e69a99" and "384bd83b20b2297c60a313232c4d625c46875cb4" have entirely different histories.

View file

@ -88,7 +88,8 @@ Biographie: {self.cv}
# Mandat # Mandat
{self.mandate[0]}, {self.mandate[1]} {self.mandate[0]}, {self.mandate[1]}
# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)} # Veröffentlichungspflichtige Angaben
{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
""" """
return txt return txt
@ -112,7 +113,7 @@ def funcs_to_str(funcs):
out = "" out = ""
for func in funcs: for func in funcs:
out += f"\n- {func[0]}" out += f"\n- {func[0]}"
for loc in sorted(func[1]): for loc in func[1]:
out += f"\n - {loc}" out += f"\n - {loc}"
return out return out
@ -124,7 +125,6 @@ def main():
) )
parser.add_argument("-o", "--out") parser.add_argument("-o", "--out")
parser.add_argument("--debug", action="store_true") parser.add_argument("--debug", action="store_true")
parser.add_argument("--no-git", action="store_true")
args = parser.parse_args() args = parser.parse_args()
if not args.out: if not args.out:
raise ValueError("must supply out directory") raise ValueError("must supply out directory")
@ -133,18 +133,15 @@ def main():
if args.debug: if args.debug:
links = links[:5] links = links[:5]
names = names[:5] names = names[:5]
sleep_for = 0 sleep_for = 1
else: else:
sleep_for = 10 sleep_for = 5
bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)] bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
save_raw(bios, args.out) save_raw(bios, args.out)
save_individuals(bios, args.out) save_individuals(bios, args.out)
if args.no_git:
return
repo.git.add("*") repo.git.add("*")
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
origin = repo.remote(name="origin") origin = repo.remote(name="origin")
@ -184,12 +181,7 @@ def get_bio(url, name, sleep_for):
name, party = name name, party = name
name = name.split(", ") name = name.split(", ")
print(f"Getting {url} for {name[1]} {name[0]}") print(f"Getting {url} for {name[1]} {name[0]}")
for _ in range(5): response = requests.get(url)
try:
response = requests.get(url)
except:
print("Rate limit! waiting 5min")
sleep(300)
soup = BeautifulSoup(response.content) soup = BeautifulSoup(response.content)
cv = soup.find(class_="m-biography__biography").text.strip() cv = soup.find(class_="m-biography__biography").text.strip()
ajax_divs = soup.find_all(class_="m-ajaxLoadedContent") ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
@ -217,7 +209,7 @@ def get_bio(url, name, sleep_for):
soup.find(class_="m-biography__subHeading --mandate").text, soup.find(class_="m-biography__subHeading --mandate").text,
soup.find(string=re.compile(r"^Wahlkreis \d*:")), soup.find(string=re.compile(r"^Wahlkreis \d*:")),
) )
disclosures = get_disclosures(soup.find(class_="m-biography__infos")) disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
bio = Biography( bio = Biography(
name, name,
@ -236,30 +228,6 @@ def get_bio(url, name, sleep_for):
return bio return bio
def get_disclosures(elem):
    """Parse the disclosure section of a biography page into (heading, items) pairs.

    Every ``<div>`` child of *elem* except the disclaimer div is scanned:
    each ``<h3>`` starts a new section, and the stripped text of the tags
    that follow it becomes that section's item list.

    Args:
        elem: BeautifulSoup tag wrapping the disclosure divs, or a falsy
            value when the page has no disclosure section.

    Returns:
        list[tuple[str, list[str]]] of ``(heading, items)`` pairs, or
        ``None`` when *elem* is falsy.
    """
    if not elem:
        return None
    # class_ may be a callable filter: keep every div whose class is NOT
    # the generic disclaimer block (divs without a class also match).
    divs = elem.find_all(
        "div", class_=lambda cls: cls != "m-biography__infoDisclaimer"
    )
    out = []
    for div in divs:
        current_heading = ""
        current_body = []
        for child in div.children:
            if child.name == "h3":
                # New heading: flush the previous section if it collected
                # any items.
                if current_body:
                    out.append((current_heading, current_body))
                current_heading = child.text.strip()
                current_body = []
                continue
            # Bare text nodes (NavigableString) have no tag name; skip.
            if not child.name:
                continue
            if child.text.strip() == "":
                continue
            current_body.append(child.text.strip())
        # Flush the last section. Bug fix: guard like the in-loop flush so
        # a div with no usable content no longer appends a spurious
        # ("", []) entry.
        if current_body:
            out.append((current_heading, current_body))
    return out
def get_functions(elem): def get_functions(elem):
out = [] out = []
current_heading = None current_heading = None
@ -267,7 +235,7 @@ def get_functions(elem):
for child in elem.children: for child in elem.children:
if child.name == "h3": if child.name == "h3":
if current_body != []: if current_body != []:
out.append((current_heading, sorted(current_body))) out.append((current_heading, current_body))
current_heading = child.text.strip() current_heading = child.text.strip()
current_body = [] current_body = []
continue continue
@ -278,13 +246,13 @@ def get_functions(elem):
for grandchild in child.children for grandchild in child.children
if grandchild.text.strip() != "" if grandchild.text.strip() != ""
) )
out.append((current_heading, sorted(current_body))) out.append((current_heading, current_body))
return out return out
def parse_speech(page): def parse_speech(page):
if not page: if not page:
return None return (None, None)
soup = BeautifulSoup(page.content) soup = BeautifulSoup(page.content)
infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")] infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
titles = [ titles = [
@ -319,13 +287,7 @@ def get_ajax(elem):
for key, value in filters.items() for key, value in filters.items()
] ]
url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters) url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
for _ in range(5): return requests.get(url)
try:
response = requests.get(url)
except:
print("Rate limit! waiting 5min")
sleep(300)
return response
def common_suffix(strings): def common_suffix(strings):