Stop parsing the HTML page once the parser has left the <head> of the page.
This commit is contained in:
parent
c4b20aa595
commit
e4ef17975c
|
@ -256,6 +256,8 @@ class TrackerIcons(Component):
|
|||
parser = FaviconParser()
|
||||
for line in f:
|
||||
parser.feed(line)
|
||||
if parser.left_head:
|
||||
break
|
||||
parser.close()
|
||||
f.close()
|
||||
os.remove(page)
|
||||
|
@ -410,6 +412,7 @@ class FaviconParser(HTMLParser):
|
|||
"""
|
||||
def __init__(self):
    """
    Prepares the parser state and initialises the underlying HTMLParser
    """
    # Collected favicon candidates; other methods of this class append
    # (href, type) tuples to this list.
    self.icons = []
    # Flipped to True by handle_endtag when the closing </head> tag is
    # seen, so the code feeding the parser can stop early.
    self.left_head = False
    HTMLParser.__init__(self)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
|
@ -424,6 +427,10 @@ class FaviconParser(HTMLParser):
|
|||
if href and type:
|
||||
self.icons.append((href, type))
|
||||
|
||||
def handle_endtag(self, tag):
    """
    Records that the parser has moved past the <head> section

    :param tag: name of the tag that was just closed; HTMLParser
        passes it in lower case
    """
    # Only the closing </head> matters here; every other end tag is ignored.
    if tag == "head":
        self.left_head = True
|
||||
|
||||
def get_icons(self):
|
||||
"""
|
||||
Returns a list of favicons extracted from the HTML page
|
||||
|
|
|
@ -36,3 +36,11 @@ class TrackerIconsTestCase(unittest.TestCase):
|
|||
d.addCallback(self.assertNotIdentical, None)
|
||||
d.addCallback(self.assertEquals, icon)
|
||||
return d
|
||||
|
||||
def test_get_ubuntu_ico(self):
    """
    The icon fetched for www.ubuntu.com must equal the reference icon
    """
    # ubuntu.com has inline css which causes HTMLParser issues
    icon = TrackerIcon("../ubuntu.png")
    d = icons.get("www.ubuntu.com")
    # The result must not be None, and must compare equal to the
    # reference icon built above.
    d.addCallback(self.assertNotIdentical, None)
    # assertEqual replaces the deprecated assertEquals alias.
    d.addCallback(self.assertEqual, icon)
    # NOTE(review): presumably a Deferred that the test runner waits on;
    # confirm against icons.get.
    return d
|
||||
|
|
Loading…
Reference in New Issue