From ccca1092e7bf3ef9b7cabd9c7b2d539f97eb889e Mon Sep 17 00:00:00 2001
From: Christopher Jeffrey
Date: Sun, 26 Apr 2015 04:23:32 -0700
Subject: [PATCH] add regex for all wide chars.

---
 lib/unicode.js | 10 ++++++++++
 lib/widget.js  |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/lib/unicode.js b/lib/unicode.js
index 51a73ef..b21ae77 100644
--- a/lib/unicode.js
+++ b/lib/unicode.js
@@ -549,6 +549,16 @@ exports.wideChars = new RegExp('(['
   + '\\uffe0-\\uffe6'
   + '])', 'g');
 
+exports.allWide = new RegExp('('
+  // 0x20000 - 0x2fffd:
+  + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
+  + '|'
+  // 0x30000 - 0x3fffd:
+  + '[\\ud880-\\ud8bf][\\udc00-\\udffd]'
+  + '|'
+  + exports.wideChars.source.slice(1, -1)
+  + ')', 'g');
+
 // Regex to detect a surrogate pair.
 exports.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g;
 
diff --git a/lib/widget.js b/lib/widget.js
index 803ff4a..f1f7f5e 100644
--- a/lib/widget.js
+++ b/lib/widget.js
@@ -2427,7 +2427,7 @@ Element.prototype.parseContent = function(noTags) {
       content = content.replace(unicode.wideChars, '$1_');
     } else {
       // no double-width: replace them with question-marks.
-      content = content.replace(unicode.wideChars, '??');
+      content = content.replace(unicode.allWide, '??');
       // delete combining characters since they're 0-width anyway.
       // NOTE: We could drop this, the non-surrogates would get changed to ? by
       // the unicode filter, and surrogates changed to ? by the surrogate