add support for surrogate pairs. fixes #123. see #4 and slap-editor/slap#107.

This commit is contained in:
Christopher Jeffrey 2015-04-15 10:05:34 -07:00
parent 1b1775a4c2
commit ea4e142757
3 changed files with 143 additions and 62 deletions

View File

@ -264,11 +264,12 @@ The screen on which every other node renders.
matter whether the keys are locked.
- __dockBorders__ - automatically "dock" borders with other elements instead of
overlapping, depending on position (__experimental__). for example:
- __doubleWidth__ - allow for rendering of East Asian double-width characters.
this is behind an option because it may affect performance negatively.
- __doubleWidthPerfect__ - handle high code point double-width characters,
without this option, high code point double width characters just show up as
`?`. that being said, this option will slow content parsing a fair amount.
- __fullUnicode__ - allow for rendering of East Asian double-width characters
and enable proper rendering of UTF-16 surrogate pairs, which allows you to
display text above the Basic Multilingual Plane. this is behind an option
because it may have a slight negative effect on performance. without this
option enabled, all double-width characters and surrogate pairs will be
replaced by `??` and `?` respectively.
These border-overlapped elements:

View File

@ -290,6 +290,9 @@ function Screen(options) {
this.ignoreLocked = options.ignoreLocked || [];
this._unicode = this.tput.unicode || this.tput.numbers.U8 === 1;
this.fullUnicode = this.options.fullUnicode && this._unicode;
this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff;
this.renders = 0;
@ -979,7 +982,8 @@ Screen.prototype.draw = function(start, end) {
, fg
, bg
, flags
, cwid;
, cwid
, point;
var main = ''
, pre
@ -1196,25 +1200,42 @@ Screen.prototype.draw = function(start, end) {
// If we find a double-width char, eat the next character which should be
// a space due to parseContent's behavior.
if (this.options.doubleWidth && (this.tput.unicode || this.tput.numbers.U8 === 1)) {
cwid = east_asian_width.char_width(line[x][1].codePointAt(0));
if (cwid === 2) {
// Might also need:
// `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
if (x === line.length - 1 || angles[line[x + 1][1]]) {
if (this.fullUnicode) {
// If this is a surrogate pair double-width char, we can ignore it
// because parseContent already counted it as length=2.
point = line[x][1].codePointAt(0);
// if (line[x][1].length === 1) {
if (point <= 0xffff) {
cwid = east_asian_width.char_width(point);
if (cwid === 2) {
// Might also need:
// `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
if (x === line.length - 1 || angles[line[x + 1][1]]) {
ch = ' ';
o[x][1] = ' ';
} else {
o[++x][1] = ' ';
}
} else if (cwid === 0) {
ch = ' ';
o[x][1] = ' ';
} else {
o[++x][1] = ' ';
}
} else if (cwid === 0) {
// No real way to do this right now:
// ch = '';
ch = ' ';
o[x][1] = ' ';
}
}
// Alternative approach (disabled): find surrogate pairs that have been
// split across two cells and pad after each character instead of at the
// end of the line.
// XXX Doesn't work well.
// if (this.fullUnicode) {
// var code = line[x][1].charCodeAt(0);
// if (code >= 0xd800 && code <= 0xdbff) {
// var code2 = line[x + 1][1].charCodeAt(0);
// if (code2 >= 0xdc00 && code2 <= 0xdfff) {
// ch = line[x][1] + line[x + 1][1];
// line[x + 1][1] = ' ';
// }
// }
// }
// Attempt to use ACS for supported characters.
// This is not ideal, but it's how ncurses works.
// There are a lot of terminals that support ACS
@ -1228,7 +1249,9 @@ Screen.prototype.draw = function(start, end) {
// supports UTF8, but I imagine it's unlikely.
// Maybe remove !this.tput.unicode check, however,
// this seems to be the way ncurses does it.
if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS) {
if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS
// Necessary for handling unicode when not supported:
&& (this.tput.acscr[ch] || acs)) {
// Fun fact: even if this.tput.brokenACS wasn't checked here,
// the linux console would still work fine because the acs
// table would fail the check of: this.tput.acscr[ch]
@ -2321,42 +2344,39 @@ Element.prototype.parseContent = function(noTags) {
.replace(/\r\n|\r/g, '\n')
.replace(/\t/g, this.screen.tabc);
if (this.screen.options.doubleWidth
&& (this.screen.tput.unicode
|| this.screen.tput.numbers.U8 === 1)) {
// double-width chars will eat the next char after render - create a
// blank character after it so it doesn't eat the real next char
if (this.screen.fullUnicode) {
// double-width chars will eat the next char after render. create a
// blank character after it so it doesn't eat the real next char.
content = content.replace(wideChars, '$1 ');
} else {
// no double-width. replace double-width chars with question-marks.
// NOTE: could use two chars: '? ' depending on what is intended.
// if we did, we could remove the unicode checks above in this if
// statement.
content = content.replace(wideChars, '?');
// no double-width or surrogate pairs: replace them with question-marks.
content = content.replace(wideChars, '??');
content = content.replace(/[\ud800-\udbff][\udc00-\udfff]/g, '?');
}
if (this.screen.options.doubleWidthPerfect) {
var _content = content;
content = '';
for (var i = 0; i < _content.length; i++) {
var point = _content.codePointAt(i);
if ((point >= 0x20000 && point <= 0x2fffd)
|| (point >= 0x30000 && point <= 0x3fffd)) {
if (this.screen.options.doubleWidth
&& (this.screen.tput.unicode
|| this.screen.tput.numbers.U8 === 1)) {
content += _content[i] + ' ';
} else {
// NOTE: could use two chars: '? ' depending on what is intended.
// if we did, we could remove the unicode checks above in this if
// statement.
content += '?';
}
} else {
content += _content[i];
}
}
}
// XXX Because 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are encoded as
// surrogate pairs, their length is computed as `2` in javascript, which
// coincidentally helps since they are double-width. The commented-out code
// below may therefore not be necessary.
// if (this.screen.options.fullUnicode) {
// var _content = content;
// content = '';
// for (var i = 0; i < _content.length; i++) {
// var point = _content.codePointAt(i);
// if ((point >= 0x20000 && point <= 0x2fffd)
// || (point >= 0x30000 && point <= 0x3fffd)) {
// if (this.screen._unicode) {
// content += _content[i] + ' ';
// } else {
// // NOTE: could use two chars: '? ' depending on what is intended.
// // if we did, we could remove the unicode checks above in this if
// // statement.
// content += '?';
// }
// } else {
// content += _content[i];
// }
// }
// }
if (!noTags) {
content = this._parseTags(content);
@ -2677,6 +2697,30 @@ main:
: current;
}, 0);
// Find all surrogate pairs and compensate for the lack of width
// on the line by padding with trailing spaces:
if (this.screen.fullUnicode) {
for (var i = 0; i < out.length; i++) {
// NOTE: Happens at 54 cols with all chars enabled in test.
// Check to see if surrogates got split on end and beginning of 2 lines.
if (/[\ud800-\udbff]$/.exec(out[i])
&& /^[\udc00-\udfff]/.exec(out[i + 1])) {
out[i] = out[i] + out[i + 1][0];
out[i + 1] = out[i + 1].substring(1) + ' ';
}
// Pad the end of the lines if the surrogate is not a double-width char.
// var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length;
var surrogates = out[i].match(/[\ud800-\udbff][\udc00-\udfff]/g);
if (surrogates && surrogates.length) {
for (var j = 0; j < surrogates.length; j++) {
if (east_asian_width.char_width(surrogates[j].codePointAt(0)) === 1) {
out[i] += ' ';
}
}
}
}
}
return out;
};
@ -3991,6 +4035,22 @@ Element.prototype.render = function() {
continue;
}
// Handle surrogate pairs:
// Make sure we put surrogate pair chars in one cell.
if (this.screen.fullUnicode) {
var code = content[ci - 1].charCodeAt(0);
// if (content.codePointAt(ci - 1) > 0xffff) {
if (code >= 0xd800 && code <= 0xdbff) {
var code2 = (content[ci] || '').charCodeAt(0);
if (code2 >= 0xdc00 && code2 <= 0xdfff) {
ch = content[ci - 1] + content[ci];
ci++;
} else {
ch = bch;
}
}
}
if (this.style.transparent) {
lines[y][x][0] = blend(attr, lines[y][x][0]);
if (content[ci]) lines[y][x][1] = ch;
@ -8614,6 +8674,10 @@ function hsort(obj) {
});
}
// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd do not need to be in this
// regex. The regex is used to put a blank char after each wide char, which the
// wide char then eats at render time. If the wide char is a surrogate pair,
// parseContent already accounts for the extra char because the pair's length
// equals 2 instead of 1.
var wideChars = new RegExp('('
// 0x20000 - 0x2fffd:
// + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
@ -8631,9 +8695,6 @@ var wideChars = new RegExp('('
+ '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */
+ '\\uff00-\\uff60' /* Fullwidth Forms */
+ '\\uffe0-\\uffe6'
// XXX Cannot implement these in a regex. Not perfect, but the layout will
// still not be negatively affected by double-width chars in this range,
// however, the next char on the screen will be eaten.
// + '\\u20000-\\u2fffd'
// + '\\u30000-\\u3fffd'
+ ']'

View File

@ -5,12 +5,30 @@ screen = blessed.screen({
dump: __dirname + '/logs/eaw.log',
smartCSR: true,
dockBorders: true,
doubleWidth: true,
doubleWidthPerfect: true
fullUnicode: true
});
var DW = '杜';
var DW2 = String.fromCodePoint ? String.fromCodePoint(0x30000) : 'a';
// screen.options.fullUnicode = false;
// screen.fullUnicode = false;
// screen._unicode = false;
// screen.tput.unicode = false;
// screen.tput.numbers.U8 = -1;
// screen.tput.strings.enter_alt_charset_mode = false;
// var DOUBLE = '杜';
var DOUBLE = String.fromCodePoint
? String.fromCodePoint(0x675c)
: String.fromCharCode(0x675c);
// var SURROGATE_DOUBLE = '𰀀';
var SURROGATE_DOUBLE = String.fromCodePoint
? String.fromCodePoint(0x30000)
: String.fromCharCode(0xD880, 0xDC00);
// var SURROGATE_SINGLE = '𝌆';
var SURROGATE_SINGLE = String.fromCodePoint
? String.fromCodePoint(0x1D306)
: String.fromCharCode(0xD834, 0xDF06);
// At cols=44, the bug that is avoided by this occurs:
// || angles[line[x + 1][1]]) {
@ -60,8 +78,9 @@ var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque'
+ ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non'
+ ' legantur';
lorem = lorem.replace(/e/gi, DW);
lorem = lorem.replace(/a/gi, DW2);
lorem = lorem.replace(/e/gi, DOUBLE);
lorem = lorem.replace(/a/gi, SURROGATE_DOUBLE);
lorem = lorem.replace(/o/gi, SURROGATE_SINGLE);
var main = blessed.box({
parent: screen,