get favicon url

This commit is contained in:
Eri - 2020-03-12 02:13:54 +01:00
parent 04fc8e9ab4
commit 8023ec6aa0
2 changed files with 55 additions and 8 deletions

View File

@ -2,21 +2,66 @@
# -*- coding: utf-8 -*-
import re
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

from requests_html import HTMLSession
class favfinder(HTMLParser):
    """HTML parser that captures the attributes of a favicon ``<link>`` tag.

    Only ``<link>`` tags seen between ``<head>`` and ``</head>`` are
    considered.  A link qualifies when its ``rel`` value contains ``icon``
    (e.g. ``icon``, ``shortcut icon``).  When several links qualify, the
    attributes of the last one are kept.
    """

    # (name, value) pairs of the most recent matching <link>; rebound, never
    # mutated in place, so the shared class-level default is safe.
    attrs = []
    # True while the parser is between <head> and </head>.
    in_head = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<head>`` and remember icon ``<link>`` tags."""
        if tag == 'head':
            self.in_head = True
            return
        if tag != 'link' or not self.in_head:
            return
        for name, value in attrs:
            # Compare the attribute *name*; the value is None for a bare
            # attribute such as ``<link rel>``, so guard before searching it.
            if name == 'rel' and value and 'icon' in value.lower():
                self.attrs = attrs
                return

    def handle_endtag(self, tag):
        """Leave head mode once ``</head>`` is reached."""
        if tag == 'head':
            self.in_head = False

    def find_in(self, html):
        """Feed *html* to the parser and report whether an icon link was seen."""
        self.feed(html)
        return bool(self.attrs)


def gethtml(url):
    """Return a fixed sample HTML document; *url* is ignored."""
    return "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><link rel=\"stylesheet\" type=\"text/css\" href=\"../static/style.css\"><link rel=\"shortcut icon\" type=\"image/png\" href=\"../static/py.png\"></head><body></body></html>"
def get_html(session, url):
    """Request *url* via ``session.get`` and return the response's ``html.html``."""
    return session.get(url).html.html
def crop_url(url):
    """Reduce *url* to its origin, ``scheme://host[:port]``.

    Path, query and fragment are discarded.  A URL given without a scheme
    (``example.com/page`` or ``//example.com/page``) is treated as ``http``.
    """
    parts = urlparse(url)
    if not parts.scheme:
        # Without a scheme, urlparse puts a bare host into ``path`` and leaves
        # ``netloc`` empty, which would yield the useless result "://".
        parts = urlparse("http://" + url.lstrip("/"))
    return f"{parts.scheme}://{parts.netloc}"
def complete_url(url, icon_url):
    """Resolve *icon_url* against *url* and return an absolute URL.

    Absolute icon URLs are returned unchanged.  Root-relative, relative
    (``../static/x.png``) and protocol-relative (``//cdn.host/x.png``)
    references are resolved against *url*, keeping any query string.

    Returns None when *icon_url* is None (no icon was found).
    """
    if icon_url is None:
        return None
    # urljoin applies the standard reference-resolution rules, so "..",
    # a missing leading slash and "//host/..." forms are all handled.
    return urljoin(url, icon_url)
def get_icon_url(response):
    """Return the ``href`` of the icon link found in the HTML *response*.

    Returns None when no icon link is found or the link has no ``href``.
    """
    finder = favfinder()
    if not finder.find_in(response):
        return None
    # Match on the attribute name; a tuple-membership test would also accept
    # a pair whose *value* happens to be 'href'.
    return next(
        (value for name, value in finder.attrs if name == 'href'),
        None,
    )
def main(options):
    """Print the favicon URL for the site named by ``options.url``.

    The URL is reduced to its origin with ``crop_url``, that page is
    downloaded through a fresh ``HTMLSession``, the icon link is extracted
    with ``get_icon_url`` and the result of ``complete_url`` is printed.
    """
    session = HTMLSession()
    url = crop_url(str(options.url))
    response = get_html(session=session, url=url)
    # get_icon_url returns None when the page declares no icon link; that
    # value is passed through to complete_url as-is.
    icon_url = get_icon_url(response)
    print(complete_url(url, icon_url))

View File

@ -0,0 +1,2 @@
requests_html
# urllib ships with the Python standard library; it must not be listed as a pip requirement