# Recovered from a git format-patch that had been flattened onto one line:
#   commit:  0ac2f2e9c169a45e89818871d363a7ba57e3a314
#   From:    Rob
#   Date:    Tue, 30 Jul 2019 11:35:46 +0200
#   Subject: [PATCH] my scraper
#   file:    src/Support/Stiftungssuche.de_rob.py (new file)
"""Scrape search results from stiftungssuche.de into one HTML file per hit.

Usage:
    python Stiftungssuche.de_rob.py download   # fetch the dump, then extract
    python Stiftungssuche.de_rob.py            # extract from an existing dump

The ``download`` step posts a keyword search to the site and then keeps
requesting the plugin's ``more_content.php`` endpoint, concatenating every
response body into ``data/website``.  The extract step parses that dump and
writes the first ``<div>`` found inside every element whose id starts with
``portrait_`` to ``result/<id>.html``.

All files are read and written as UTF-8.
"""

import os
import re
import sys

import requests
from bs4 import BeautifulSoup

# Keyword sent as the ``stichworte`` form field of the search request.
suchwort = 'stift'

SEARCH_URL = "https://stiftungssuche.de/"
MORE_CONTENT_URL = (
    "https://stiftungssuche.de/wp-content/plugins/stiftungssuche/ajax/"
    "more_content.php/"
)
DUMP_PATH = "data/website"
RESULT_DIR = "result"

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 30

# Elements holding one search hit each are recognised by this id prefix.
PORTRAIT_ID = re.compile(r"^portrait_")

# Anything outside this set is replaced before an id is used as a file name.
_UNSAFE_FILENAME_CHARS = re.compile(r"[^\w.-]")


def download(keyword, dump_path=DUMP_PATH):
    """Fetch all result pages for *keyword* and store them in *dump_path*.

    The first response comes from a form POST to the search page; further
    content is requested from ``MORE_CONTENT_URL`` within the same HTTP
    session until the server stops answering with a non-empty HTTP 200.

    Args:
        keyword: Search term sent as the ``stichworte`` form field.
        dump_path: File that receives the concatenated response bodies.
            It is overwritten, and its parent directory is created if needed.

    Raises:
        RuntimeError: The initial search request did not return HTTP 200.
        requests.RequestException: A request failed or timed out.
    """
    payload = {
        'searchtype': 1,
        'stichworte': keyword,
        "ort": '',
        "bundesland": '',
        "action": "search",
    }

    parent = os.path.dirname(dump_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # A session carries the cookies set by the search response over to the
    # follow-up requests (presumably how the server knows which result set
    # to continue -- not verifiable from this script alone).
    with requests.Session() as session:
        response = session.post(
            SEARCH_URL, data=payload, timeout=REQUEST_TIMEOUT
        )
        if response.status_code != 200:
            # Without a fresh first page there is nothing valid to append to.
            raise RuntimeError(
                "search request failed with HTTP status "
                + str(response.status_code)
            )

        with open(dump_path, "w", encoding="utf-8") as dump:
            dump.write(response.text)

            count = 1
            while True:
                print("\nReload: " + str(count))
                count += 1
                response = session.get(
                    MORE_CONTENT_URL, timeout=REQUEST_TIMEOUT
                )
                # An empty 200 body adds nothing to the dump; treating it as
                # the end of the results keeps the loop from spinning forever
                # if the server never switches to an error status.
                if response.status_code != 200 or not response.text.strip():
                    break
                dump.write(response.text)


def extract_portraits(dump_path=DUMP_PATH, result_dir=RESULT_DIR):
    """Split the downloaded dump into one HTML file per search hit.

    Args:
        dump_path: File previously produced by :func:`download`.
        result_dir: Directory receiving ``<id>.html`` files; created if
            missing.

    Returns:
        The number of files written.

    Raises:
        OSError: The dump cannot be read or a result file cannot be written.
    """
    with open(dump_path, "r", encoding="utf-8") as dump:
        document = BeautifulSoup(dump.read(), 'html.parser')

    os.makedirs(result_dir, exist_ok=True)

    written = 0
    for entry in document.find_all(id=PORTRAIT_ID):
        portrait = str(entry['id'])
        if entry.div is None:
            # No <div> inside this element, so there is no content to save.
            print("Skipping " + portrait + ": no <div> found",
                  file=sys.stderr)
            continue

        # The id comes from scraped HTML; strip path separators and other
        # unexpected characters so it cannot escape result_dir.
        filename = _UNSAFE_FILENAME_CHARS.sub("_", portrait) + '.html'
        target = os.path.join(result_dir, filename)
        with open(target, "w", encoding="utf-8") as out:
            out.write(str(entry.div))
        written += 1

    return written


def main(argv=None):
    """Run the optional download step followed by the extraction step.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.  The download runs only when ``argv[1]`` is
            ``'download'``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) > 1 and argv[1] == 'download':
        download(suchwort)
    extract_portraits()


if __name__ == "__main__":
    main()