Stop parsing the HTML page once the parser has left the <head> of the page

This commit is contained in:
Damien Churchill 2010-05-03 21:28:16 +01:00
parent c4b20aa595
commit e4ef17975c
2 changed files with 15 additions and 0 deletions

View File

@ -256,6 +256,8 @@ class TrackerIcons(Component):
parser = FaviconParser() parser = FaviconParser()
for line in f: for line in f:
parser.feed(line) parser.feed(line)
if parser.left_head:
break
parser.close() parser.close()
f.close() f.close()
os.remove(page) os.remove(page)
@ -410,6 +412,7 @@ class FaviconParser(HTMLParser):
""" """
def __init__(self): def __init__(self):
self.icons = [] self.icons = []
self.left_head = False
HTMLParser.__init__(self) HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs): def handle_starttag(self, tag, attrs):
@ -424,6 +427,10 @@ class FaviconParser(HTMLParser):
if href and type: if href and type:
self.icons.append((href, type)) self.icons.append((href, type))
def handle_endtag(self, tag):
    """
    Flags the parser as having left the head section once the closing
    ``head`` tag is reported.

    :param tag: the name of the end tag being handled
    """
    # Every other closing tag leaves the flag exactly as it was.
    if tag != "head":
        return
    self.left_head = True
def get_icons(self): def get_icons(self):
""" """
Returns a list of favicons extracted from the HTML page Returns a list of favicons extracted from the HTML page

View File

@ -36,3 +36,11 @@ class TrackerIconsTestCase(unittest.TestCase):
d.addCallback(self.assertNotIdentical, None) d.addCallback(self.assertNotIdentical, None)
d.addCallback(self.assertEquals, icon) d.addCallback(self.assertEquals, icon)
return d return d
def test_get_ubuntu_ico(self):
    # ubuntu.com has inline css which causes HTMLParser issues
    expected = TrackerIcon("../ubuntu.png")
    deferred = icons.get("www.ubuntu.com")
    # Register the checks in order: first against None, then against the
    # expected icon.
    checks = (
        (self.assertNotIdentical, None),
        (self.assertEquals, expected),
    )
    for check, reference in checks:
        deferred.addCallback(check, reference)
    return deferred