Avoid running chardet in decode_string unless the earlier encodings fail

This commit is contained in:
bendikro 2012-11-26 02:15:10 +01:00
parent 60f196ff93
commit d5e340354e

View File

@ -626,13 +626,17 @@ def decode_string(s, encoding="utf8"):
elif isinstance(s, unicode): elif isinstance(s, unicode):
return s return s
encodings = [(encoding, 'strict'), ("utf8", 'strict'), encodings = [lambda: ("utf8", 'strict'),
("iso-8859-1", 'strict'), lambda: ("iso-8859-1", 'strict'),
(chardet.detect(s)["encoding"], 'strict'), lambda: (chardet.detect(s)["encoding"], 'strict'),
(chardet.detect(s)["encoding"], 'ignore')] lambda: (chardet.detect(s)["encoding"], 'ignore')]
for i in range(len(encodings)):
if not encoding is "utf8":
encodings.insert(0, lambda: (encoding, 'strict'))
for l in encodings:
try: try:
return s.decode(encodings[i][0], encodings[i][1]) return s.decode(*l())
except UnicodeDecodeError: except UnicodeDecodeError:
pass pass
return u'' return u''
@ -648,10 +652,7 @@ def utf8_encoded(s):
""" """
if isinstance(s, str): if isinstance(s, str):
try: s = decode_string(s).encode("utf8")
s = decode_string(s).encode("utf8")
except UnicodeEncodeError:
log.warn("Error when encoding to utf8: %s" % s)
elif isinstance(s, unicode): elif isinstance(s, unicode):
s = s.encode("utf8", "ignore") s = s.encode("utf8", "ignore")
return s return s