rename unicode regexes again.

This commit is contained in:
Christopher Jeffrey 2015-04-26 04:34:32 -07:00
parent dafe95d900
commit c73fee2f7d
2 changed files with 18 additions and 12 deletions

View File

@ -532,12 +532,18 @@ exports.fromCodePoint = function() {
return result;
};
/**
* Regexes
*/
exports.chars = {};
// Double width characters that are _not_ surrogate pairs.
// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
// regex anyway. This regex is used to put a blank char after each wide char,
// so that the blank is what gets eaten. However, if the wide char is a
// surrogate pair, parseContent already adds the extra one char because its
// length equals 2 instead of 1.
exports.wideChars = new RegExp('(['
exports.chars.wide = new RegExp('(['
+ '\\u1100-\\u115f' // Hangul Jamo init. consonants
+ '\\u2329\\u232a'
+ '\\u2e80-\\u303e\\u3040-\\ua4cf' // CJK ... Yi
@ -550,21 +556,21 @@ exports.wideChars = new RegExp('(['
+ '])', 'g');
// All wide chars including surrogate pairs.
exports.allWideChars = new RegExp('('
exports.chars.all = new RegExp('('
// 0x20000 - 0x2fffd:
+ '[\\ud840-\\ud87f][\\udc00-\\udffd]'
+ '|'
// 0x30000 - 0x3fffd:
+ '[\\ud880-\\ud8bf][\\udc00-\\udffd]'
+ '|'
+ exports.wideChars.source.slice(1, -1)
+ exports.chars.wide.source.slice(1, -1)
+ ')', 'g');
// Regex to detect a surrogate pair.
exports.surrogateChars = /[\ud800-\udbff][\udc00-\udfff]/g;
exports.chars.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g;
// Regex to find combining characters.
exports.combiningChars = exports.combiningTable.reduce(function(out, row) {
exports.chars.combining = exports.combiningTable.reduce(function(out, row) {
var low, high, range;
if (row[0] > 0x00ffff) {
low = exports.fromCodePoint(row[0]);
@ -591,7 +597,7 @@ exports.combiningChars = exports.combiningTable.reduce(function(out, row) {
return out;
}, '[');
exports.combiningChars = new RegExp(exports.combiningChars, 'g');
exports.chars.combining = new RegExp(exports.chars.combining, 'g');
function hexify(n) {
n = n.toString(16);
@ -600,7 +606,7 @@ function hexify(n) {
}
/*
exports.combiningChars = new RegExp(
exports.chars.combining = new RegExp(
'['
+ '\\u0300-\\u036f'
+ '\\u0483-\\u0486'

View File

@ -2424,17 +2424,17 @@ Element.prototype.parseContent = function(noTags) {
if (this.screen.fullUnicode) {
// double-width chars will eat the next char after render. create a
// blank character after it so it doesn't eat the real next char.
content = content.replace(unicode.wideChars, '$1_');
content = content.replace(unicode.chars.wide, '$1_');
} else {
// no double-width: replace them with question-marks.
content = content.replace(unicode.allWideChars, '??');
content = content.replace(unicode.chars.all, '??');
// delete combining characters since they're 0-width anyway.
// NOTE: We could drop this, the non-surrogates would get changed to ? by
// the unicode filter, and surrogates changed to ? by the surrogate
// regex. however, the user might expect them to be 0-width.
content = content.replace(unicode.combiningChars, '');
content = content.replace(unicode.chars.combining, '');
// no surrogate pairs: replace them with question-marks.
content = content.replace(unicode.surrogateChars, '?');
content = content.replace(unicode.chars.surrogate, '?');
}
if (!noTags) {
@ -2769,7 +2769,7 @@ main:
}
// Pad the end of the lines if the surrogate is not a double-width char.
// var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length;
var surrogates = out[i].match(unicode.surrogateChars);
var surrogates = out[i].match(unicode.chars.surrogate);
if (surrogates && surrogates.length) {
for (var j = 0; j < surrogates.length; j++) {
var cwid = unicode.charWidth(surrogates[j], 0);