2020-03-12 00:22:40 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from html.parser import HTMLParser
|
2020-03-12 02:13:54 +01:00
|
|
|
from requests_html import HTMLSession
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
import re
|
|
|
|
|
2020-03-12 00:22:40 +01:00
|
|
|
|
|
|
|
class favfinder(HTMLParser):
    """HTML parser that records the attributes of a favicon ``<link>`` tag.

    Only ``<link>`` start tags seen between ``<head>`` and ``</head>`` are
    considered. When several links qualify, the one fed last is kept.

    Attributes:
        attrs: list of ``(name, value)`` pairs of the matching ``<link>``
            tag, or an empty list when none has been seen.
        in_head: ``True`` while the parser is inside the ``<head>`` element.
    """

    # Class-level defaults; ``reset`` gives every instance its own values so
    # no state is shared between parsers.
    attrs = []
    in_head = False

    def reset(self):
        """Reset the parser, clearing any previously recorded link."""
        super().reset()
        self.attrs = []
        self.in_head = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<head>`` and remember favicon ``<link>`` tags.

        Args:
            tag: lower-cased tag name.
            attrs: list of ``(name, value)`` pairs for the tag.
        """
        if tag == 'head':
            self.in_head = True
            return
        if tag != 'link' or not self.in_head:
            return
        for name, value in attrs:
            # ``value`` is None for a bare attribute (``<link rel>``), so it
            # must be checked before it is split. ``rel`` is a space-separated
            # token list ("icon", "shortcut icon", ...), compared
            # case-insensitively.
            if name == 'rel' and value and 'icon' in value.lower().split():
                self.attrs = attrs
                break

    def handle_endtag(self, tag):
        """Stop considering ``<link>`` tags once ``</head>`` is reached."""
        if tag == 'head':
            self.in_head = False

    def find_in(self, html):
        """Feed ``html`` to the parser and report whether an icon was found.

        Args:
            html: HTML source text.

        Returns:
            ``True`` when a favicon ``<link>`` has been recorded in
            ``self.attrs``, ``False`` otherwise.
        """
        self.feed(html)
        return bool(self.attrs)
|
|
|
|
|
|
|
|
def get_html(session, url):
    """Fetch ``url`` with ``session`` and return the page's HTML.

    Args:
        session: object whose ``get(url)`` returns a response exposing
            ``.html.html``.
        url: address to request.

    Returns:
        The ``.html.html`` attribute of the response.
    """
    return session.get(url).html.html
|
|
|
|
|
|
|
|
def crop_url(url, default_scheme="http"):
    """Reduce ``url`` to its ``scheme://host[:port]`` origin.

    Args:
        url: URL text, e.g. ``"https://example.com/a/b?x=1"``.
        default_scheme: scheme assumed when ``url`` carries none
            (``"example.com/page"`` or ``"//example.com/page"``).

    Returns:
        ``"<scheme>://<netloc>"`` with path, query and fragment removed.
    """
    parts = urlparse(url)
    if not parts.scheme:
        # Without a scheme, urlparse treats a bare host as part of the path
        # and leaves the netloc empty. Re-parse it as a network location so
        # the host is recognised instead of returning "://".
        parts = urlparse("//" + url.lstrip("/"), scheme=default_scheme)
    return f"{parts.scheme}://{parts.netloc}"
|
2020-03-12 00:22:40 +01:00
|
|
|
|
2020-03-12 02:13:54 +01:00
|
|
|
def complete_url(url, icon_url):
    """Resolve ``icon_url`` against the page ``url`` into an absolute URL.

    Args:
        url: URL of the page the icon reference came from.
        icon_url: icon reference as written in the page; may be absolute,
            protocol-relative (``//host/x``), host-relative (``/x``),
            document-relative (``x``), or ``None`` when no icon was found.

    Returns:
        The absolute icon URL, or ``None`` when ``icon_url`` is ``None``.
    """
    # Imported locally because the module only imports ``urlparse``.
    from urllib.parse import urljoin

    if icon_url is None:
        return None
    # urljoin applies standard relative-reference resolution: it keeps the
    # query string, inserts the missing "/" for document-relative paths and
    # honours the host of protocol-relative references.
    return urljoin(url, icon_url)
|
|
|
|
|
|
|
|
def get_icon_url(response):
    """Return the ``href`` of the favicon link found in an HTML document.

    Args:
        response: HTML source text; it is fed to a ``favfinder`` parser.

    Returns:
        The value of the first ``href`` attribute on the link recorded by
        ``favfinder``, or ``None`` when no link was recorded or the link has
        no ``href`` attribute.
    """
    icon = favfinder()
    if not icon.find_in(response):
        return None
    for name, value in icon.attrs:
        # Compare the attribute name only: a membership test on the
        # (name, value) pair would also match an attribute whose *value*
        # happens to be "href".
        if name == 'href':
            return value
    return None
|
2020-03-12 00:22:40 +01:00
|
|
|
|
|
|
|
def main(options):
    """Print the absolute favicon URL for the site named by ``options.url``.

    Args:
        options: object with a ``url`` attribute; the value is converted
            with ``str`` before use.
    """
    url = crop_url(str(options.url))

    # Use the session as a context manager so it is closed even when the
    # request raises.
    with HTMLSession() as session:
        response = get_html(session=session, url=url)

    icon_url = get_icon_url(response)
    print(complete_url(url, icon_url))
|
|
|
|
|