#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Locate the favicon declared in the ``<head>`` of a web page."""

from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
import re

try:
    from requests_html import HTMLSession
except ImportError:
    # Only main() needs the HTTP session; the parsing and URL helpers below
    # stay importable (and testable) without the third-party package.
    HTMLSession = None

# ``rel`` holds whitespace-separated tokens compared case-insensitively, so
# match ``icon`` as a whole token ("icon", "shortcut icon", "Shortcut Icon")
# but not as a fragment of another token such as "apple-touch-icon".
_ICON_REL = re.compile(r"(?<!\S)icon(?!\S)", re.IGNORECASE)


class favfinder(HTMLParser):
    """HTML parser that records the attributes of the page's icon ``<link>``.

    After feeding a document, ``attrs`` holds the ``(name, value)`` attribute
    pairs of the last ``<link>`` inside ``<head>`` whose ``rel`` contains the
    ``icon`` token, or an empty list when there is none.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Per-instance state, so parsers never share a mutable class attribute.
        self.attrs = []
        self.in_head = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<head>`` and remember icon ``<link>`` tags."""
        if tag == 'head':
            self.in_head = True
            return
        if tag == 'link' and self.in_head:
            for name, value in attrs:
                # ``value`` is None for a bare attribute (``<link rel>``).
                if name == 'rel' and value and _ICON_REL.search(value):
                    self.attrs = attrs

    def handle_endtag(self, tag):
        """Stop looking for ``<link>`` tags once ``</head>`` is reached."""
        if tag == 'head':
            self.in_head = False

    def find_in(self, html):
        """Parse *html* and return True if an icon ``<link>`` was found."""
        self.feed(html)
        return bool(self.attrs)


def get_html(session, url):
    """Fetch *url* with *session* and return the page markup as text."""
    res = session.get(url)
    return res.html.html


def crop_url(url):
    """Reduce *url* to its ``scheme://host[:port]`` origin."""
    o = urlparse(url)
    return "{proto}://{domain}".format(proto=o.scheme, domain=o.netloc)


def complete_url(url, icon_url):
    """Resolve *icon_url* against the page *url* and return an absolute URL.

    Relative paths, root-relative paths, protocol-relative URLs
    (``//host/path``) and query strings are all resolved following the usual
    URL-joining rules; an already absolute *icon_url* is returned unchanged.
    Returns None when *icon_url* is None (no icon was found).
    """
    if icon_url is None:
        return None
    return urljoin(url, icon_url)


def get_icon_url(response):
    """Return the ``href`` of the icon ``<link>`` in the HTML text *response*.

    Returns None when the document declares no icon or the icon link has no
    ``href`` value.
    """
    icon = favfinder()
    if icon.find_in(response):
        for name, value in icon.attrs:
            if name == 'href':
                return value
    return None


def main(options):
    """Print the absolute favicon URL for the site named by ``options.url``.

    Prints ``None`` when the site's front page declares no icon.

    Raises:
        ImportError: if the ``requests_html`` package is not installed.
    """
    if HTMLSession is None:
        raise ImportError("requests_html is required to fetch pages")
    url = crop_url(str(options.url))
    session = HTMLSession()
    try:
        response = get_html(session=session, url=url)
    finally:
        # Release pooled connections even when the request fails.
        session.close()
    icon_url = get_icon_url(response)
    print(complete_url(url, icon_url))