add regex for all wide chars.

This commit is contained in:
Christopher Jeffrey 2015-04-26 04:23:32 -07:00
parent 758eef133c
commit ccca1092e7
2 changed files with 11 additions and 1 deletions

View File

@ -549,6 +549,16 @@ exports.wideChars = new RegExp('(['
+ '\\uffe0-\\uffe6'
+ '])', 'g');
exports.allWide = new RegExp('('
// 0x20000 - 0x2fffd:
+ '[\\ud840-\\ud87f][\\udc00-\\udffd]'
+ '|'
// 0x30000 - 0x3fffd:
+ '[\\ud880-\\ud8bf][\\udc00-\\udffd]'
+ '|'
+ exports.wideChars.source.slice(1, -1)
+ ')', 'g');
// Matches any UTF-16 surrogate pair: a lead unit (\ud800-\udbff) immediately
// followed by a trail unit (\udc00-\udfff). Global flag is set.
exports.surrogate = new RegExp('[\\ud800-\\udbff][\\udc00-\\udfff]', 'g');

View File

@ -2427,7 +2427,7 @@ Element.prototype.parseContent = function(noTags) {
content = content.replace(unicode.wideChars, '$1_');
} else {
// no double-width: replace them with question-marks.
content = content.replace(unicode.wideChars, '??');
content = content.replace(unicode.allWide, '??');
// delete combining characters since they're 0-width anyway.
// NOTE: We could drop this, the non-surrogates would get changed to ? by
// the unicode filter, and surrogates changed to ? by the surrogate