get favicon url
parent
04fc8e9ab4
commit
8023ec6aa0
|
@ -2,21 +2,66 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from html.parser import HTMLParser
|
||||
from requests_html import HTMLSession
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
|
||||
|
||||
class favfinder(HTMLParser):
    """HTML parser that remembers the attributes of a favicon ``<link>`` tag.

    Only ``<link>`` start tags seen between ``<head>`` and ``</head>`` are
    inspected.  A tag qualifies when its ``rel`` attribute contains the
    substring ``icon`` (case-insensitive), which covers ``icon``,
    ``shortcut icon`` and ``apple-touch-icon``.  When several tags qualify,
    the attributes of the last one fed to the parser are kept.

    Attributes:
        attrs: list of ``(name, value)`` pairs of the matching ``<link>``
            tag, or an empty list when nothing matched yet.
        in_head: True while the parser is between ``<head>`` and ``</head>``.
    """

    def reset(self):
        """Reset the parser and clear the per-instance search state.

        ``HTMLParser.__init__`` calls ``reset()``, so this also initialises
        the state on construction instead of sharing class-level defaults
        (one of which was a mutable list) across instances.
        """
        super().reset()
        self.attrs = []
        self.in_head = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<head>`` and capture favicon ``<link>`` tags.

        Args:
            tag: lower-cased tag name supplied by ``HTMLParser``.
            attrs: list of ``(name, value)`` pairs for the tag.
        """
        if tag == 'head':
            self.in_head = True
            return
        if tag != 'link' or not self.in_head:
            return
        for name, value in attrs:
            # ``value`` is None for a bare attribute such as ``<link rel>``.
            if name == 'rel' and value and 'icon' in value.lower():
                self.attrs = attrs
                return

    def handle_endtag(self, tag):
        """Stop inspecting ``<link>`` tags once ``</head>`` is reached."""
        if tag == 'head':
            self.in_head = False

    def find_in(self, html):
        """Feed *html* to the parser and report whether an icon was found.

        Args:
            html: markup to scan, as a string.

        Returns:
            True when ``self.attrs`` holds a matching tag's attributes,
            False otherwise.
        """
        self.feed(html)
        return bool(self.attrs)
|
||||
|
||||
def get_html(session, url):
    """Fetch *url* with *session* and return the page markup.

    Args:
        session: object whose ``get(url)`` returns a response exposing
            ``.html.html`` (``main`` passes a ``requests_html.HTMLSession``).
        url: address of the page to download.

    Returns:
        The value of the response's ``html.html`` attribute.
    """
    return session.get(url).html.html
|
||||
|
||||
def crop_url(url):
    """Reduce *url* to its origin, i.e. ``scheme://host[:port]``.

    Path, query and fragment are discarded.  Input written without a scheme
    (``example.com/page``, ``localhost:8000``) has no network location as far
    as ``urlparse`` is concerned, which used to produce ``"://"``; such input
    is now re-parsed with an ``http://`` prefix.

    Args:
        url: absolute URL, or a bare ``host[:port][/path]`` string.

    Returns:
        The origin as a string.  ``http`` is assumed when no scheme is given.
    """
    parts = urlparse(url)
    # Only guess a scheme when the caller clearly did not supply one;
    # things like "file:///tmp/x" are left exactly as before.
    if not parts.netloc and "://" not in url:
        parts = urlparse("http://" + url)
    return "{proto}://{domain}".format(
        proto=parts.scheme or "http",
        domain=parts.netloc,
    )
|
||||
|
||||
def complete_url(url, icon_url):
    """Resolve *icon_url* against the page address *url*.

    The previous string formatting glued the icon path directly onto the
    host, so ``favicon.ico`` or ``../static/py.png`` produced malformed
    addresses, query strings were dropped, protocol-relative references
    (``//cdn.example.com/x.ico``) were pinned to the wrong host and
    ``data:`` URIs were mangled.  ``urljoin`` implements standard relative
    reference resolution and handles all of these.

    Args:
        url: address of the page the icon reference was found on.
        icon_url: raw ``href`` value; absolute, relative, or ``None``.

    Returns:
        The absolute icon URL, or ``None`` when *icon_url* is ``None``.
    """
    # Imported here so the function works without touching the file's
    # top-level import block, which only brings in ``urlparse``.
    from urllib.parse import urljoin

    if icon_url is None:
        # urljoin(base, None) would hand back the page URL itself.
        return None
    return urljoin(url, icon_url)
|
||||
|
||||
def get_icon_url(response):
    """Return the ``href`` of the favicon ``<link>`` found in *response*.

    Args:
        response: HTML markup of the page, as a string (it is fed to a
            ``favfinder`` parser).

    Returns:
        The raw ``href`` value of the icon link, or ``None`` when the page
        has no icon link or that link carries no ``href`` attribute.
    """
    icon = favfinder()
    if not icon.find_in(response):
        return None
    for name, value in icon.attrs:
        # Compare the attribute *name*: the former ``'href' in attr`` test
        # also fired when some other attribute's value was "href".
        if name == 'href':
            return value
    return None
|
||||
|
||||
def main(options):
    """Print the favicon URL of the site named by ``options.url``.

    The URL is reduced to its origin, that page is downloaded, the icon
    ``<link>`` is looked up in its ``<head>`` and the resulting absolute
    address is printed.

    Args:
        options: object with a ``url`` attribute naming the site to inspect.
    """
    url = crop_url(str(options.url))

    # HTMLSession derives from requests.Session, so it can be used as a
    # context manager; this closes the session once the page is fetched.
    with HTMLSession() as session:
        response = get_html(session=session, url=url)

    icon_url = get_icon_url(response)
    if icon_url is None:
        # No <link rel="icon"> in the page: fall back to the conventional
        # location browsers probe by default rather than printing "None".
        icon_url = "/favicon.ico"
    print(complete_url(url, icon_url))
|
||||
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
requests_html
|
||||
urllib
|
Loading…
Reference in New Issue