Stop parsing the HTML page once the parser has left the <head> of the page

This commit is contained in:
Damien Churchill 2010-05-03 21:28:16 +01:00
parent c4b20aa595
commit e4ef17975c
2 changed files with 15 additions and 0 deletions

View File

@ -256,6 +256,8 @@ class TrackerIcons(Component):
parser = FaviconParser()
for line in f:
parser.feed(line)
if parser.left_head:
break
parser.close()
f.close()
os.remove(page)
@ -410,6 +412,7 @@ class FaviconParser(HTMLParser):
"""
def __init__(self):
    """
    Prepare the parser's own state and then initialise the HTMLParser base.
    """
    # flag starts out cleared; handle_endtag is what switches it on
    self.left_head = False
    # (href, type) pairs are appended here as they are found
    self.icons = []
    HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
@ -424,6 +427,10 @@ class FaviconParser(HTMLParser):
if href and type:
self.icons.append((href, type))
def handle_endtag(self, tag):
    """
    Record that the parser has moved past the page's <head> section.

    :param tag: name of the closing tag that was just encountered
    """
    closing_head = (tag == "head")
    if closing_head:
        # callers poll this flag to decide when to stop feeding data
        self.left_head = True
def get_icons(self):
"""
Returns a list of favicons extracted from the HTML page

View File

@ -36,3 +36,11 @@ class TrackerIconsTestCase(unittest.TestCase):
d.addCallback(self.assertNotIdentical, None)
d.addCallback(self.assertEquals, icon)
return d
def test_get_ubuntu_ico(self):
    # Regression check: the ubuntu.com page carries inline css, which
    # causes HTMLParser issues.
    expected_icon = TrackerIcon("../ubuntu.png")
    deferred = icons.get("www.ubuntu.com")
    # The result must not be None ...
    deferred.addCallback(self.assertNotIdentical, None)
    # ... and must compare equal to the reference icon.
    deferred.addCallback(self.assertEquals, expected_icon)
    return deferred