Avoid running chardet in decode_string if not needed

This commit is contained in:
bendikro 2012-11-26 02:15:10 +01:00
parent 60f196ff93
commit d5e340354e
1 changed file with 11 additions and 10 deletions

View File

@@ -626,13 +626,17 @@ def decode_string(s, encoding="utf8"):
elif isinstance(s, unicode):
return s
encodings = [(encoding, 'strict'), ("utf8", 'strict'),
("iso-8859-1", 'strict'),
(chardet.detect(s)["encoding"], 'strict'),
(chardet.detect(s)["encoding"], 'ignore')]
for i in range(len(encodings)):
encodings = [lambda: ("utf8", 'strict'),
lambda: ("iso-8859-1", 'strict'),
lambda: (chardet.detect(s)["encoding"], 'strict'),
lambda: (chardet.detect(s)["encoding"], 'ignore')]
if not encoding is "utf8":
encodings.insert(0, lambda: (encoding, 'strict'))
for l in encodings:
try:
return s.decode(encodings[i][0], encodings[i][1])
return s.decode(*l())
except UnicodeDecodeError:
pass
return u''
@@ -648,10 +652,7 @@ def utf8_encoded(s):
"""
if isinstance(s, str):
try:
s = decode_string(s).encode("utf8")
except UnicodeEncodeError:
log.warn("Error when encoding to utf8: %s" % s)
s = decode_string(s).encode("utf8")
elif isinstance(s, unicode):
s = s.encode("utf8", "ignore")
return s