handle unicode combining characters properly.

This commit is contained in:
Christopher Jeffrey 2015-04-16 11:23:53 -07:00
parent dfb87e2037
commit d520c94c3b
5 changed files with 310 additions and 19 deletions

View File

@ -276,14 +276,14 @@ The screen on which every other node renders.
│ box1 │ box2 │
└─────────┴─────────┘
```
- __fullUnicode__ - allow for rendering of East Asian double-width characters.
also enable proper rendering of utf-16 surrogate pairs. this allows you to
- __fullUnicode__ - allow for rendering of East Asian double-width characters,
utf-16 surrogate pairs, and unicode combining characters. this allows you to
display text above the basic multilingual plane. this is behind an option
because it may affect performance slightly negatively. without this option
enabled, all double-width and surrogate pair characters will be replaced by
`??` and `?` respectively. (NOTE: libvte (e.g. gnome-terminal) cannot display
characters that are both surrogate pairs _and_ double-width properly. there
is no way for blessed to fix this unfortunately).
enabled, all double-width, surrogate pair, and combining characters will be
replaced by `??`, `?`, and nothing (i.e. removed), respectively. (NOTE: libvte (e.g. gnome-terminal)
cannot display characters that are both surrogate pairs _and_ double-width
properly. there is no way for blessed to fix this unfortunately).
##### Properties:

View File

@ -12,7 +12,8 @@
var program = require('./program')
, tput = require('./tput')
, widget = require('./widget')
, colors = require('./colors');
, colors = require('./colors')
, unicode = require('./unicode');
/**
* Blessed
@ -24,8 +25,9 @@ function blessed() {
blessed.program = blessed.Program = program;
blessed.tput = blessed.Tput = tput;
blessed.colors = colors;
blessed.widget = widget;
blessed.colors = colors;
blessed.unicode = unicode;
Object.keys(blessed.widget).forEach(function(name) {
blessed[name] = blessed.widget[name];

View File

@ -79,6 +79,33 @@
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// String.fromCodePoint
//
// Copyright Mathias Bynens <https://mathiasbynens.be/>
// https://github.com/mathiasbynens/String.fromCodePoint
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
// distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
// Local aliases for the built-ins used by exports.fromCodePoint:
// String.fromCharCode turns UTF-16 code units into a string, and Math.floor
// is used to check that a code point is an integer.
var stringFromCharCode = String.fromCharCode;
var floor = Math.floor;
exports.charWidth = function(str, i) {
var point = typeof str !== 'number'
? exports.codePointAt(str, i || 0)
@ -355,7 +382,7 @@ exports.isSurrogate = function(str, i) {
return point > 0x00ffff;
};
exports.combining = [
exports.combiningTable = [
[ 0x0300, 0x036F ], [ 0x0483, 0x0486 ], [ 0x0488, 0x0489 ],
[ 0x0591, 0x05BD ], [ 0x05BF, 0x05BF ], [ 0x05C1, 0x05C2 ],
[ 0x05C4, 0x05C5 ], [ 0x05C7, 0x05C7 ], [ 0x0600, 0x0603 ],
@ -404,7 +431,9 @@ exports.combining = [
[ 0x1D173, 0x1D182 ], [ 0x1D185, 0x1D18B ], [ 0x1D1AA, 0x1D1AD ],
[ 0x1D242, 0x1D244 ], [ 0xE0001, 0xE0001 ], [ 0xE0020, 0xE007F ],
[ 0xE0100, 0xE01EF ]
].reduce(function(out, row) {
];
exports.combining = exports.combiningTable.reduce(function(out, row) {
for (var i = row[0]; i <= row[1]; i++) {
out[i] = true;
}
@ -465,6 +494,44 @@ exports.codePointAt = function(str, position) {
return first;
};
exports.fromCodePoint = function() {
var MAX_SIZE = 0x4000;
var codeUnits = [];
var highSurrogate;
var lowSurrogate;
var index = -1;
var length = arguments.length;
if (!length) {
return '';
}
var result = '';
while (++index < length) {
var codePoint = Number(arguments[index]);
if (
!isFinite(codePoint) || // `NaN`, `+Infinity`, or `-Infinity`
codePoint < 0 || // not a valid Unicode code point
codePoint > 0x10FFFF || // not a valid Unicode code point
floor(codePoint) != codePoint // not an integer
) {
throw RangeError('Invalid code point: ' + codePoint);
}
if (codePoint <= 0xFFFF) { // BMP code point
codeUnits.push(codePoint);
} else { // Astral code point; split in surrogate halves
// http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
codePoint -= 0x10000;
highSurrogate = (codePoint >> 10) + 0xD800;
lowSurrogate = (codePoint % 0x400) + 0xDC00;
codeUnits.push(highSurrogate, lowSurrogate);
}
if (index + 1 == length || codeUnits.length > MAX_SIZE) {
result += stringFromCharCode.apply(null, codeUnits);
codeUnits.length = 0;
}
}
return result;
};
// Double width characters that are _not_ surrogate pairs.
// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
// regex anyway. This regex is used to put a blank char after wide chars to
@ -484,3 +551,188 @@ exports.wideChars = new RegExp('(['
// Regex to detect a surrogate pair.
exports.surrogate = /[\ud800-\udbff][\udc00-\udfff]/g;
// Regex to find combining characters.
exports.combiningRegex = exports.combiningTable.reduce(function(out, row) {
var low, high, range;
if (row[0] > 0x00ffff) {
low = exports.fromCodePoint(row[0]);
low = [
hexify(low.charCodeAt(0)),
hexify(low.charCodeAt(1))
];
high = exports.fromCodePoint(row[1]);
high = [
hexify(high.charCodeAt(0)),
hexify(high.charCodeAt(1))
];
range = '[\\u' + low[0] + '-' + '\\u' + high[0] + ']'
+ '[\\u' + low[1] + '-' + '\\u' + high[1] + ']';
if (!~out.indexOf('|')) out += ']';
out += '|' + range;
} else {
low = hexify(row[0]);
high = hexify(row[1]);
low = '\\u' + low;
high = '\\u' + high;
out += low + '-' + high;
}
return out;
}, '[');
exports.combiningRegex = new RegExp(exports.combiningRegex, 'g');
/**
 * Format a number as lowercase hexadecimal, left-padded with zeros to at
 * least four digits (the width of a `\uXXXX` escape). Longer values are
 * returned unpadded.
 *
 * @param {number} n - Value to format.
 * @returns {string} Hex digits, e.g. 0x300 -> '0300'.
 */
function hexify(n) {
  var hex = n.toString(16);
  var missing = 4 - hex.length;
  if (missing <= 0) {
    return hex;
  }
  return new Array(missing + 1).join('0') + hex;
}
/*
exports.combiningRegex = new RegExp(
'['
+ '\\u0300-\\u036f'
+ '\\u0483-\\u0486'
+ '\\u0488-\\u0489'
+ '\\u0591-\\u05bd'
+ '\\u05bf-\\u05bf'
+ '\\u05c1-\\u05c2'
+ '\\u05c4-\\u05c5'
+ '\\u05c7-\\u05c7'
+ '\\u0600-\\u0603'
+ '\\u0610-\\u0615'
+ '\\u064b-\\u065e'
+ '\\u0670-\\u0670'
+ '\\u06d6-\\u06e4'
+ '\\u06e7-\\u06e8'
+ '\\u06ea-\\u06ed'
+ '\\u070f-\\u070f'
+ '\\u0711-\\u0711'
+ '\\u0730-\\u074a'
+ '\\u07a6-\\u07b0'
+ '\\u07eb-\\u07f3'
+ '\\u0901-\\u0902'
+ '\\u093c-\\u093c'
+ '\\u0941-\\u0948'
+ '\\u094d-\\u094d'
+ '\\u0951-\\u0954'
+ '\\u0962-\\u0963'
+ '\\u0981-\\u0981'
+ '\\u09bc-\\u09bc'
+ '\\u09c1-\\u09c4'
+ '\\u09cd-\\u09cd'
+ '\\u09e2-\\u09e3'
+ '\\u0a01-\\u0a02'
+ '\\u0a3c-\\u0a3c'
+ '\\u0a41-\\u0a42'
+ '\\u0a47-\\u0a48'
+ '\\u0a4b-\\u0a4d'
+ '\\u0a70-\\u0a71'
+ '\\u0a81-\\u0a82'
+ '\\u0abc-\\u0abc'
+ '\\u0ac1-\\u0ac5'
+ '\\u0ac7-\\u0ac8'
+ '\\u0acd-\\u0acd'
+ '\\u0ae2-\\u0ae3'
+ '\\u0b01-\\u0b01'
+ '\\u0b3c-\\u0b3c'
+ '\\u0b3f-\\u0b3f'
+ '\\u0b41-\\u0b43'
+ '\\u0b4d-\\u0b4d'
+ '\\u0b56-\\u0b56'
+ '\\u0b82-\\u0b82'
+ '\\u0bc0-\\u0bc0'
+ '\\u0bcd-\\u0bcd'
+ '\\u0c3e-\\u0c40'
+ '\\u0c46-\\u0c48'
+ '\\u0c4a-\\u0c4d'
+ '\\u0c55-\\u0c56'
+ '\\u0cbc-\\u0cbc'
+ '\\u0cbf-\\u0cbf'
+ '\\u0cc6-\\u0cc6'
+ '\\u0ccc-\\u0ccd'
+ '\\u0ce2-\\u0ce3'
+ '\\u0d41-\\u0d43'
+ '\\u0d4d-\\u0d4d'
+ '\\u0dca-\\u0dca'
+ '\\u0dd2-\\u0dd4'
+ '\\u0dd6-\\u0dd6'
+ '\\u0e31-\\u0e31'
+ '\\u0e34-\\u0e3a'
+ '\\u0e47-\\u0e4e'
+ '\\u0eb1-\\u0eb1'
+ '\\u0eb4-\\u0eb9'
+ '\\u0ebb-\\u0ebc'
+ '\\u0ec8-\\u0ecd'
+ '\\u0f18-\\u0f19'
+ '\\u0f35-\\u0f35'
+ '\\u0f37-\\u0f37'
+ '\\u0f39-\\u0f39'
+ '\\u0f71-\\u0f7e'
+ '\\u0f80-\\u0f84'
+ '\\u0f86-\\u0f87'
+ '\\u0f90-\\u0f97'
+ '\\u0f99-\\u0fbc'
+ '\\u0fc6-\\u0fc6'
+ '\\u102d-\\u1030'
+ '\\u1032-\\u1032'
+ '\\u1036-\\u1037'
+ '\\u1039-\\u1039'
+ '\\u1058-\\u1059'
+ '\\u1160-\\u11ff'
+ '\\u135f-\\u135f'
+ '\\u1712-\\u1714'
+ '\\u1732-\\u1734'
+ '\\u1752-\\u1753'
+ '\\u1772-\\u1773'
+ '\\u17b4-\\u17b5'
+ '\\u17b7-\\u17bd'
+ '\\u17c6-\\u17c6'
+ '\\u17c9-\\u17d3'
+ '\\u17dd-\\u17dd'
+ '\\u180b-\\u180d'
+ '\\u18a9-\\u18a9'
+ '\\u1920-\\u1922'
+ '\\u1927-\\u1928'
+ '\\u1932-\\u1932'
+ '\\u1939-\\u193b'
+ '\\u1a17-\\u1a18'
+ '\\u1b00-\\u1b03'
+ '\\u1b34-\\u1b34'
+ '\\u1b36-\\u1b3a'
+ '\\u1b3c-\\u1b3c'
+ '\\u1b42-\\u1b42'
+ '\\u1b6b-\\u1b73'
+ '\\u1dc0-\\u1dca'
+ '\\u1dfe-\\u1dff'
+ '\\u200b-\\u200f'
+ '\\u202a-\\u202e'
+ '\\u2060-\\u2063'
+ '\\u206a-\\u206f'
+ '\\u20d0-\\u20ef'
+ '\\u302a-\\u302f'
+ '\\u3099-\\u309a'
+ '\\ua806-\\ua806'
+ '\\ua80b-\\ua80b'
+ '\\ua825-\\ua826'
+ '\\ufb1e-\\ufb1e'
+ '\\ufe00-\\ufe0f'
+ '\\ufe20-\\ufe23'
+ '\\ufeff-\\ufeff'
+ '\\ufff9-\\ufffb'
+ ']'
+ '|[\\ud802-\\ud802][\\ude01-\\ude03]'
+ '|[\\ud802-\\ud802][\\ude05-\\ude06]'
+ '|[\\ud802-\\ud802][\\ude0c-\\ude0f]'
+ '|[\\ud802-\\ud802][\\ude38-\\ude3a]'
+ '|[\\ud802-\\ud802][\\ude3f-\\ude3f]'
+ '|[\\ud834-\\ud834][\\udd67-\\udd69]'
+ '|[\\ud834-\\ud834][\\udd73-\\udd82]'
+ '|[\\ud834-\\ud834][\\udd85-\\udd8b]'
+ '|[\\ud834-\\ud834][\\uddaa-\\uddad]'
+ '|[\\ud834-\\ud834][\\ude42-\\ude44]'
+ '|[\\udb40-\\udb40][\\udc01-\\udc01]'
+ '|[\\udb40-\\udb40][\\udc20-\\udc7f]'
+ '|[\\udb40-\\udb40][\\udd00-\\uddef]'
, 'g');
*/

View File

@ -1214,8 +1214,6 @@ Screen.prototype.draw = function(start, end) {
} else {
o[++x][1] = ' ';
}
} else if (cwid === 0) {
ch = ' ';
}
}
}
@ -2332,8 +2330,14 @@ Element.prototype.parseContent = function(noTags) {
// blank character after it so it doesn't eat the real next char.
content = content.replace(unicode.wideChars, '$1 ');
} else {
// no double-width or surrogate pairs: replace them with question-marks.
// no double-width: replace them with question-marks.
content = content.replace(unicode.wideChars, '??');
// delete combining characters since they're 0-width anyway.
// NOTE: We could drop this, the non-surrogates would get changed to ? by
// the unicode filter, and surrogates changed to ? by the surrogate
// regex. however, the user might expect them to be 0-width.
content = content.replace(unicode.combiningRegex, '');
// no surrogate pairs: replace them with question-marks.
content = content.replace(unicode.surrogate, '?');
}
@ -2673,9 +2677,7 @@ main:
if (surrogates && surrogates.length) {
for (var j = 0; j < surrogates.length; j++) {
var cwid = unicode.charWidth(surrogates[j], 0);
if (cwid === 0) {
out[i] += ' ';
} else if (cwid === 1) {
if (cwid === 1) {
out[i] += ' ';
}
}
@ -3997,9 +3999,25 @@ Element.prototype.render = function() {
continue;
}
// Handle surrogate pairs:
// Make sure we put surrogate pair chars in one cell.
if (this.screen.fullUnicode && content[ci - 1]) {
// Handle combining chars:
// Make sure they get in the same cell and are counted as 0.
var point = unicode.codePointAt(content, ci - 1);
if (unicode.combining[point]) {
if (point > 0x00ffff) {
ch = content[ci - 1] + content[ci];
ci++;
}
if (x - 1 >= xi) {
lines[y][x - 1][1] += ch;
} else if (y - 1 >= yi) {
lines[y - 1][xl - 1][1] += ch;
}
x--;
continue;
}
// Handle surrogate pairs:
// Make sure we put surrogate pair chars in one cell.
var code = content[ci - 1].charCodeAt(0);
// if (unicode.codePointAt(content, ci - 1) > 0x00ffff) {
// if (unicode.isSurrogate(content, ci - 1) {
@ -4014,6 +4032,19 @@ Element.prototype.render = function() {
}
}
// Alternative to regex to avoiding combining chars when fullUnicode=false
// NOTE: Wouldn't matter because the surrogate regex would already remove it.
// if (!this.screen.fullUnicode) {
// var point = unicode.codePointAt(content, ci - 1);
// if (unicode.combining[point]) {
// if (point > 0x00ffff) {
// ci++;
// }
// x--;
// continue;
// }
// }
if (this.style.transparent) {
lines[y][x][0] = blend(attr, lines[y][x][0]);
if (content[ci]) lines[y][x][1] = ch;

View File

@ -31,10 +31,16 @@ var SURROGATE_SINGLE = String.fromCodePoint
? String.fromCodePoint(0x1D306)
: String.fromCharCode(0xD834, 0xDF06);
var COMBINE = String.fromCodePoint
? String.fromCodePoint(0x0300)
: String.fromCharCode(0x0300);
var COMBINE = blessed.unicode.fromCodePoint(0x10A01);
// At cols=44, the bug that is avoided by this occurs:
// || angles[line[x + 1][1]]) {
var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque'
var lorem = 'Non eram nes' + COMBINE + 'cius Brute cum quae summis ingeniis exquisitaque'
+ ' doctrina philosophi Graeco sermone tractavissent ea Latinis litteris mandaremus'
+ ' fore ut hic noster labor in varias reprehensiones incurreret nam quibusdam et'
+ ' iis quidem non admodum indoctis totum hoc displicet philosophari quidam autem'