Compare commits
No commits in common. "d47627643f9637469d84a415d4e16ccba6e69a99" and "384bd83b20b2297c60a313232c4d625c46875cb4" have entirely different histories.
d47627643f
...
384bd83b20
1 changed file with 11 additions and 49 deletions
60
crawler.py
60
crawler.py
|
|
@@ -88,7 +88,8 @@ Biographie: {self.cv}
|
||||||
# Mandat
|
# Mandat
|
||||||
{self.mandate[0]}, {self.mandate[1]}
|
{self.mandate[0]}, {self.mandate[1]}
|
||||||
|
|
||||||
# Veröffentlichungspflichtige Angaben {funcs_to_str(self.disclosures)}
|
# Veröffentlichungspflichtige Angaben
|
||||||
|
{self.disclosures.replace(DISCLOSURE_DISLAIMER, "")}
|
||||||
"""
|
"""
|
||||||
return txt
|
return txt
|
||||||
|
|
||||||
|
|
@@ -112,7 +113,7 @@ def funcs_to_str(funcs):
|
||||||
out = ""
|
out = ""
|
||||||
for func in funcs:
|
for func in funcs:
|
||||||
out += f"\n- {func[0]}"
|
out += f"\n- {func[0]}"
|
||||||
for loc in sorted(func[1]):
|
for loc in func[1]:
|
||||||
out += f"\n - {loc}"
|
out += f"\n - {loc}"
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
@@ -124,7 +125,6 @@ def main():
|
||||||
)
|
)
|
||||||
parser.add_argument("-o", "--out")
|
parser.add_argument("-o", "--out")
|
||||||
parser.add_argument("--debug", action="store_true")
|
parser.add_argument("--debug", action="store_true")
|
||||||
parser.add_argument("--no-git", action="store_true")
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if not args.out:
|
if not args.out:
|
||||||
raise ValueError("must supply out directory")
|
raise ValueError("must supply out directory")
|
||||||
|
|
@@ -133,18 +133,15 @@ def main():
|
||||||
if args.debug:
|
if args.debug:
|
||||||
links = links[:5]
|
links = links[:5]
|
||||||
names = names[:5]
|
names = names[:5]
|
||||||
sleep_for = 0
|
sleep_for = 1
|
||||||
else:
|
else:
|
||||||
sleep_for = 10
|
sleep_for = 5
|
||||||
|
|
||||||
bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
|
bios = [get_bio(link, name, sleep_for) for link, name in zip(links, names)]
|
||||||
|
|
||||||
save_raw(bios, args.out)
|
save_raw(bios, args.out)
|
||||||
save_individuals(bios, args.out)
|
save_individuals(bios, args.out)
|
||||||
|
|
||||||
if args.no_git:
|
|
||||||
return
|
|
||||||
|
|
||||||
repo.git.add("*")
|
repo.git.add("*")
|
||||||
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
repo.index.commit(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
|
||||||
origin = repo.remote(name="origin")
|
origin = repo.remote(name="origin")
|
||||||
|
|
@@ -184,12 +181,7 @@ def get_bio(url, name, sleep_for):
|
||||||
name, party = name
|
name, party = name
|
||||||
name = name.split(", ")
|
name = name.split(", ")
|
||||||
print(f"Getting {url} for {name[1]} {name[0]}")
|
print(f"Getting {url} for {name[1]} {name[0]}")
|
||||||
for _ in range(5):
|
response = requests.get(url)
|
||||||
try:
|
|
||||||
response = requests.get(url)
|
|
||||||
except:
|
|
||||||
print("Rate limit! waiting 5min")
|
|
||||||
sleep(300)
|
|
||||||
soup = BeautifulSoup(response.content)
|
soup = BeautifulSoup(response.content)
|
||||||
cv = soup.find(class_="m-biography__biography").text.strip()
|
cv = soup.find(class_="m-biography__biography").text.strip()
|
||||||
ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
|
ajax_divs = soup.find_all(class_="m-ajaxLoadedContent")
|
||||||
|
|
@@ -217,7 +209,7 @@ def get_bio(url, name, sleep_for):
|
||||||
soup.find(class_="m-biography__subHeading --mandate").text,
|
soup.find(class_="m-biography__subHeading --mandate").text,
|
||||||
soup.find(string=re.compile(r"^Wahlkreis \d*:")),
|
soup.find(string=re.compile(r"^Wahlkreis \d*:")),
|
||||||
)
|
)
|
||||||
disclosures = get_disclosures(soup.find(class_="m-biography__infos"))
|
disclosures = soup.find(class_="m-biography__infoDisclaimer").text.strip()
|
||||||
|
|
||||||
bio = Biography(
|
bio = Biography(
|
||||||
name,
|
name,
|
||||||
|
|
@@ -236,30 +228,6 @@ def get_bio(url, name, sleep_for):
|
||||||
return bio
|
return bio
|
||||||
|
|
||||||
|
|
||||||
def get_disclosures(elem):
|
|
||||||
if not elem:
|
|
||||||
return None
|
|
||||||
divs = elem.find_all("div", class_=lambda cls: cls != "m-biography__infoDisclaimer")
|
|
||||||
out = []
|
|
||||||
for div in divs:
|
|
||||||
current_heading = ""
|
|
||||||
current_body = []
|
|
||||||
for child in div.children:
|
|
||||||
if child.name == "h3":
|
|
||||||
if current_body != []:
|
|
||||||
out.append((current_heading, current_body))
|
|
||||||
current_heading = child.text.strip()
|
|
||||||
current_body = []
|
|
||||||
continue
|
|
||||||
if not child.name:
|
|
||||||
continue
|
|
||||||
if child.text.strip() == "":
|
|
||||||
continue
|
|
||||||
current_body.append(child.text.strip())
|
|
||||||
out.append((current_heading, current_body))
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def get_functions(elem):
|
def get_functions(elem):
|
||||||
out = []
|
out = []
|
||||||
current_heading = None
|
current_heading = None
|
||||||
|
|
@@ -267,7 +235,7 @@ def get_functions(elem):
|
||||||
for child in elem.children:
|
for child in elem.children:
|
||||||
if child.name == "h3":
|
if child.name == "h3":
|
||||||
if current_body != []:
|
if current_body != []:
|
||||||
out.append((current_heading, sorted(current_body)))
|
out.append((current_heading, current_body))
|
||||||
current_heading = child.text.strip()
|
current_heading = child.text.strip()
|
||||||
current_body = []
|
current_body = []
|
||||||
continue
|
continue
|
||||||
|
|
@@ -278,13 +246,13 @@ def get_functions(elem):
|
||||||
for grandchild in child.children
|
for grandchild in child.children
|
||||||
if grandchild.text.strip() != ""
|
if grandchild.text.strip() != ""
|
||||||
)
|
)
|
||||||
out.append((current_heading, sorted(current_body)))
|
out.append((current_heading, current_body))
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
def parse_speech(page):
|
def parse_speech(page):
|
||||||
if not page:
|
if not page:
|
||||||
return None
|
return (None, None)
|
||||||
soup = BeautifulSoup(page.content)
|
soup = BeautifulSoup(page.content)
|
||||||
infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
|
infos = [s.text.strip() for s in soup.find_all(class_="m-biography__speechTitle")]
|
||||||
titles = [
|
titles = [
|
||||||
|
|
@@ -319,13 +287,7 @@ def get_ajax(elem):
|
||||||
for key, value in filters.items()
|
for key, value in filters.items()
|
||||||
]
|
]
|
||||||
url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
|
url = url + "?" + "&".join(f"{key}={val}" for key, val in sanitized_filters)
|
||||||
for _ in range(5):
|
return requests.get(url)
|
||||||
try:
|
|
||||||
response = requests.get(url)
|
|
||||||
except:
|
|
||||||
print("Rate limit! waiting 5min")
|
|
||||||
sleep(300)
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
def common_suffix(strings):
|
def common_suffix(strings):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue