first order

Al-P 2019-07-30 11:10:38 +02:00
parent ce235d76ef
commit 41ea012ffc
3 changed files with 131 additions and 0 deletions

@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup
import requests
import csv

# Fetch the EFA events calendar and parse it with lxml
source = requests.get('https://efa-net.eu/events/events-calendar').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open('EFA_scrape_csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['titel', 'datum', 'event_url', 'ort'])

for event in soup.find_all("div", class_="column mcb-column one column_list"):
    titel = event.h4.text
    print(titel)

    datum = event.find("div", class_="desc").p.b.text
    print(datum)

    ort = event.find("div", class_="desc").p.i.text
    print(ort)

    # Not every event carries a link, so fall back to None
    try:
        event_url = event.find("a")['href']
    except Exception:
        event_url = None
    print(event_url)
    print()

    csv_writer.writerow([titel, datum, event_url, ort])

csv_file.close()
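A minimal alternative sketch, not part of the commit: the same rows written through csv.DictWriter inside a with-block opened with newline='', so the file is closed automatically and no blank rows appear on Windows. The selectors are copied from above and assumed to still match the page; the EFA_scrape.csv filename is illustrative.

# Sketch only: DictWriter variant of the EFA scrape (filename is illustrative)
import csv
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://efa-net.eu/events/events-calendar').text, 'lxml')

with open('EFA_scrape.csv', 'w', newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=['titel', 'datum', 'event_url', 'ort'])
    writer.writeheader()
    for event in soup.find_all("div", class_="column mcb-column one column_list"):
        link = event.find("a")
        writer.writerow({
            'titel': event.h4.text,
            'datum': event.find("div", class_="desc").p.b.text,
            'ort': event.find("div", class_="desc").p.i.text,
            'event_url': link.get('href') if link else None,
        })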

@@ -0,0 +1,54 @@
from bs4 import BeautifulSoup
import requests
import csv

# Fetch the fundraiser-magazin event calendar and parse it with lxml
source = requests.get('https://fundraiser-magazin.de/fundraising-kalender.html').text
soup = BeautifulSoup(source, 'lxml')

csv_file = open("frkalender_scrape_csv", "w")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["titel", "datum", "veranstalter", "event_url", "ort"])

for event in soup.find_all("div", class_="kalender-veranstaltung"):
    titel = event.find("div", class_="kalender-titel").text
    print(titel)

    datum = event.find("div", class_="kalender-termin").text
    print(datum)

    # Keep only the text after the "label: " prefix
    veranstalter = event.find("div", class_="kalender-veranstalter").text
    veranstalter = veranstalter.split(": ")[1]
    print(veranstalter)

    event_url = event.find("a")['href']
    #print(event_url)
    event_link = f"Weitere Informationen: {event_url}"
    print(event_link)

    # The location line is optional, so fall back to None
    try:
        ort = event.find("div", class_="kalender-ort").text
        ort = ort.split(": ")[1]
    except Exception:
        ort = None
    print(ort)
    print()

    csv_writer.writerow([titel, datum, veranstalter, event_url, ort])

csv_file.close()

'''
Commented-out variant that prints the raw tag lists instead of their text:
for event in soup.find_all("div", class_="kalender-veranstaltung"):
    titel = event.find_all('div', class_='kalender-titel')
    print(titel)
    veranstalter = event.find_all('div', class_='kalender-veranstalter')
    print(veranstalter)
    ort = event.find_all('div', class_='kalender-ort')
    print(ort)
'''
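Both calendar scrapers fetch their page with a bare requests.get(). A hedged sketch of a small fetch helper follows; the helper name fetch_soup and the User-Agent string are illustrative and not part of the commit, but timeout and raise_for_status() are standard requests features that make a failed download visible instead of silently parsing an error page.

# Sketch only: fetch helper with timeout, User-Agent and explicit error handling
import requests
from bs4 import BeautifulSoup

def fetch_soup(url):
    response = requests.get(
        url,
        headers={'User-Agent': 'calendar-scraper/0.1'},  # illustrative UA string
        timeout=10,
    )
    response.raise_for_status()  # raise if the server did not answer with 200
    return BeautifulSoup(response.text, 'lxml')

soup = fetch_soup('https://fundraiser-magazin.de/fundraising-kalender.html')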

@@ -0,0 +1,43 @@
import sys
import requests
import re
from bs4 import BeautifulSoup

suchwort = 'stift'

# Step 1 (only when called with the 'download' argument): submit the search
# and page through the AJAX results, appending every response to data/website.
if len(sys.argv) > 1:
    if sys.argv[1] == 'download':
        payload = {'searchtype': 1, 'stichworte': suchwort, 'ort': '', 'bundesland': '', 'action': 'search'}
        r = requests.post("https://stiftungssuche.de/", data=payload)
        cookie = r.cookies
        if r.status_code == 200:
            f = open("data/website", "w")
            f.write(r.text)
            f.close()
        # Keep requesting the "more content" endpoint until it stops answering with 200
        count = 1
        while True:
            print("\nReload: " + str(count))
            count = count + 1
            r = requests.get('https://stiftungssuche.de/wp-content/plugins/stiftungssuche/ajax/more_content.php/', cookies=cookie)
            if r.status_code == 200:
                f = open("data/website", "a")
                f.write(r.text)
                f.close()
            else:
                break

# Step 2: parse the downloaded HTML and write each portrait_* block to its own file
f = open("data/website", "r")
document = BeautifulSoup(f.read(), 'html.parser')
f.close()

hitlist = document.find_all(id=re.compile("^portrait_"))
for entry in hitlist:
    portrait = str(entry['id'])
    f = open("result/" + portrait + '.html', "w")
    f.write(str(entry.div))
    f.close()

#page = 'https://stiftungssuche.de/wp-content/plugins/stiftungssuche/ajax/more_content.php/'
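The script assumes the data/ and result/ directories already exist, and the reload loop requests the AJAX endpoint back-to-back with no upper bound other than a non-200 response. A hedged sketch of the same loop with the directories created up front, a short pause between requests, and a safety cap; max_reloads, the one-second pause, and the download_results name are illustrative choices, not part of the commit.

# Sketch only: reload loop with directory setup, pacing, and an upper bound
import os
import time
import requests

os.makedirs("data", exist_ok=True)    # make sure the output directories exist
os.makedirs("result", exist_ok=True)

more_url = 'https://stiftungssuche.de/wp-content/plugins/stiftungssuche/ajax/more_content.php/'
max_reloads = 200  # illustrative safety cap

def download_results(cookie):
    with open("data/website", "a") as f:
        for count in range(1, max_reloads + 1):
            print("\nReload: " + str(count))
            r = requests.get(more_url, cookies=cookie, timeout=10)
            if r.status_code != 200:
                break
            f.write(r.text)
            time.sleep(1)  # be polite to the server between reloads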