add support for surrogate pairs. fixes #123. see #4 and slap-editor/slap#107.

This commit is contained in:
Christopher Jeffrey 2015-04-15 10:05:34 -07:00
parent 1b1775a4c2
commit ea4e142757
3 changed files with 143 additions and 62 deletions

View File

@ -264,11 +264,12 @@ The screen on which every other node renders.
matter whether the keys are locked. matter whether the keys are locked.
- __dockBorders__ - automatically "dock" borders with other elements instead of - __dockBorders__ - automatically "dock" borders with other elements instead of
overlapping, depending on position (__experimental__). for example: overlapping, depending on position (__experimental__). for example:
- __doubleWidth__ - allow for rendering of East Asian double-width characters. - __fullUnicode__ - allow for rendering of East Asian double-width characters.
this is behind an option because it may affect performance negatively. also enable proper rendering of utf-16 surrogate pairs. this allows you to
- __doubleWidthPerfect__ - handle high code point double-width characters, display text above the basic multilingual plane. this is behind an option
without this option, high code point double width characters just show up as because it may affect performance slightly negatively. without this option
`?`. that being said, this option will slow content parsing a fair amount. enabled, all double-width and surrogate pair characters will be replaced by
`??` and `?` respectively.
These border-overlapped elements: These border-overlapped elements:

View File

@ -290,6 +290,9 @@ function Screen(options) {
this.ignoreLocked = options.ignoreLocked || []; this.ignoreLocked = options.ignoreLocked || [];
this._unicode = this.tput.unicode || this.tput.numbers.U8 === 1;
this.fullUnicode = this.options.fullUnicode && this._unicode;
this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff; this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff;
this.renders = 0; this.renders = 0;
@ -979,7 +982,8 @@ Screen.prototype.draw = function(start, end) {
, fg , fg
, bg , bg
, flags , flags
, cwid; , cwid
, point;
var main = '' var main = ''
, pre , pre
@ -1196,8 +1200,13 @@ Screen.prototype.draw = function(start, end) {
// If we find a double-width char, eat the next character which should be // If we find a double-width char, eat the next character which should be
// a space due to parseContent's behavior. // a space due to parseContent's behavior.
if (this.options.doubleWidth && (this.tput.unicode || this.tput.numbers.U8 === 1)) { if (this.fullUnicode) {
cwid = east_asian_width.char_width(line[x][1].codePointAt(0)); // If this is a surrogate pair double-width char, we can ignore it
// because parseContent already counted it as length=2.
point = line[x][1].codePointAt(0);
// if (line[x][1].length === 1) {
if (point <= 0xffff) {
cwid = east_asian_width.char_width(point);
if (cwid === 2) { if (cwid === 2) {
// Might also need: // Might also need:
// `|| line[x + 1][0] !== line[x][0]` for borderless boxes? // `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
@ -1208,12 +1217,24 @@ Screen.prototype.draw = function(start, end) {
o[++x][1] = ' '; o[++x][1] = ' ';
} }
} else if (cwid === 0) { } else if (cwid === 0) {
// No real way to do this right now:
// ch = '';
ch = ' '; ch = ' ';
o[x][1] = ' ';
} }
} }
}
// Find surrogate pairs that have been split:
// Pad after each character instead of end of line.
// XXX Doesn't work well.
// if (this.fullUnicode) {
// var code = line[x][1].charCodeAt(0);
// if (code >= 0xd800 && code <= 0xdbff) {
// var code2 = line[x + 1][1].charCodeAt(0);
// if (code2 >= 0xdc00 && code2 <= 0xdfff) {
// ch = line[x][1] + line[x + 1][1];
// line[x + 1][1] = ' ';
// }
// }
// }
// Attempt to use ACS for supported characters. // Attempt to use ACS for supported characters.
// This is not ideal, but it's how ncurses works. // This is not ideal, but it's how ncurses works.
@ -1228,7 +1249,9 @@ Screen.prototype.draw = function(start, end) {
// supports UTF8, but I imagine it's unlikely. // supports UTF8, but I imagine it's unlikely.
// Maybe remove !this.tput.unicode check, however, // Maybe remove !this.tput.unicode check, however,
// this seems to be the way ncurses does it. // this seems to be the way ncurses does it.
if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS) { if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS
// Necessary for handling unicode when not supported:
&& (this.tput.acscr[ch] || acs)) {
// Fun fact: even if this.tput.brokenACS wasn't checked here, // Fun fact: even if this.tput.brokenACS wasn't checked here,
// the linux console would still work fine because the acs // the linux console would still work fine because the acs
// table would fail the check of: this.tput.acscr[ch] // table would fail the check of: this.tput.acscr[ch]
@ -2321,42 +2344,39 @@ Element.prototype.parseContent = function(noTags) {
.replace(/\r\n|\r/g, '\n') .replace(/\r\n|\r/g, '\n')
.replace(/\t/g, this.screen.tabc); .replace(/\t/g, this.screen.tabc);
if (this.screen.options.doubleWidth if (this.screen.fullUnicode) {
&& (this.screen.tput.unicode // double-width chars will eat the next char after render. create a
|| this.screen.tput.numbers.U8 === 1)) { // blank character after it so it doesn't eat the real next char.
// double-width chars will eat the next char after render - create a
// blank character after it so it doesn't eat the real next char
content = content.replace(wideChars, '$1 '); content = content.replace(wideChars, '$1 ');
} else { } else {
// no double-width. replace double-width chars with question-marks. // no double-width or surrogate pairs: replace them with question-marks.
// NOTE: could use two chars: '? ' depending on what is intended. content = content.replace(wideChars, '??');
// if we did, we could remove the unicode checks above in this if content = content.replace(/[\ud800-\udbff][\udc00-\udfff]/g, '?');
// statement.
content = content.replace(wideChars, '?');
} }
if (this.screen.options.doubleWidthPerfect) { // XXX Because 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are surrogate pairs,
var _content = content; // the length is computed as `2` in javascript, which coincidentally helps
content = ''; // since they are double width. This may not be necessary.
for (var i = 0; i < _content.length; i++) { // if (this.screen.options.fullUnicode) {
var point = _content.codePointAt(i); // var _content = content;
if ((point >= 0x20000 && point <= 0x2fffd) // content = '';
|| (point >= 0x30000 && point <= 0x3fffd)) { // for (var i = 0; i < _content.length; i++) {
if (this.screen.options.doubleWidth // var point = _content.codePointAt(i);
&& (this.screen.tput.unicode // if ((point >= 0x20000 && point <= 0x2fffd)
|| this.screen.tput.numbers.U8 === 1)) { // || (point >= 0x30000 && point <= 0x3fffd)) {
content += _content[i] + ' '; // if (this.screen._unicode) {
} else { // content += _content[i] + ' ';
// NOTE: could use two chars: '? ' depending on what is intended. // } else {
// if we did, we could remove the unicode checks above in this if // // NOTE: could use two chars: '? ' depending on what is intended.
// statement. // // if we did, we could remove the unicode checks above in this if
content += '?'; // // statement.
} // content += '?';
} else { // }
content += _content[i]; // } else {
} // content += _content[i];
} // }
} // }
// }
if (!noTags) { if (!noTags) {
content = this._parseTags(content); content = this._parseTags(content);
@ -2677,6 +2697,30 @@ main:
: current; : current;
}, 0); }, 0);
// Find all surrogate pairs and compensate for the lack of width
// on the line by padding with trailing spaces:
if (this.screen.fullUnicode) {
for (var i = 0; i < out.length; i++) {
// NOTE: Happens at 54 cols with all chars enabled in test.
// Check to see if surrogates got split on end and beginning of 2 lines.
if (/[\ud800-\udbff]$/.exec(out[i])
&& /^[\udc00-\udfff]/.exec(out[i + 1])) {
out[i] = out[i] + out[i + 1][0];
out[i + 1] = out[i + 1].substring(1) + ' ';
}
// Pad the end of the lines if the surrogate is not a double-width char.
// var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length;
var surrogates = out[i].match(/[\ud800-\udbff][\udc00-\udfff]/g);
if (surrogates && surrogates.length) {
for (var j = 0; j < surrogates.length; j++) {
if (east_asian_width.char_width(surrogates[j].codePointAt(0)) === 1) {
out[i] += ' ';
}
}
}
}
}
return out; return out;
}; };
@ -3991,6 +4035,22 @@ Element.prototype.render = function() {
continue; continue;
} }
// Handle surrogate pairs:
// Make sure we put surrogate pair chars in one cell.
if (this.screen.fullUnicode) {
var code = content[ci - 1].charCodeAt(0);
// if (content.codePointAt(ci - 1) > 0xffff) {
if (code >= 0xd800 && code <= 0xdbff) {
var code2 = (content[ci] || '').charCodeAt(0);
if (code2 >= 0xdc00 && code2 <= 0xdfff) {
ch = content[ci - 1] + content[ci];
ci++;
} else {
ch = bch;
}
}
}
if (this.style.transparent) { if (this.style.transparent) {
lines[y][x][0] = blend(attr, lines[y][x][0]); lines[y][x][0] = blend(attr, lines[y][x][0]);
if (content[ci]) lines[y][x][1] = ch; if (content[ci]) lines[y][x][1] = ch;
@ -8614,6 +8674,10 @@ function hsort(obj) {
}); });
} }
// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
// regex anyway. This regex is used to put a blank char after wide chars to
// be eaten, however, if this is a surrogate pair, parseContent already adds
// the extra one char because its length equals 2 instead of 1.
var wideChars = new RegExp('(' var wideChars = new RegExp('('
// 0x20000 - 0x2fffd: // 0x20000 - 0x2fffd:
// + '[\\ud840-\\ud87f][\\udc00-\\udffd]' // + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
@ -8631,9 +8695,6 @@ var wideChars = new RegExp('('
+ '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */ + '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */
+ '\\uff00-\\uff60' /* Fullwidth Forms */ + '\\uff00-\\uff60' /* Fullwidth Forms */
+ '\\uffe0-\\uffe6' + '\\uffe0-\\uffe6'
// XXX Cannot implement these in a regex. Not perfect, but the layout will
// still not be negatively affected by double-width chars in this range,
// however, the next char on the screen will be eaten.
// + '\\u20000-\\u2fffd' // + '\\u20000-\\u2fffd'
// + '\\u30000-\\u3fffd' // + '\\u30000-\\u3fffd'
+ ']' + ']'

View File

@ -5,12 +5,30 @@ screen = blessed.screen({
dump: __dirname + '/logs/eaw.log', dump: __dirname + '/logs/eaw.log',
smartCSR: true, smartCSR: true,
dockBorders: true, dockBorders: true,
doubleWidth: true, fullUnicode: true
doubleWidthPerfect: true
}); });
var DW = '杜'; // screen.options.fullUnicode = false;
var DW2 = String.fromCodePoint ? String.fromCodePoint(0x30000) : 'a'; // screen.fullUnicode = false;
// screen._unicode = false;
// screen.tput.unicode = false;
// screen.tput.numbers.U8 = -1;
// screen.tput.strings.enter_alt_charset_mode = false;
// Sample characters used to exercise width handling. Each one is built from
// its code point: String.fromCodePoint is used when the runtime provides it,
// otherwise the equivalent UTF-16 code unit(s) go through String.fromCharCode.

// '杜' (U+675C): a single UTF-16 code unit.
var DOUBLE;
if (String.fromCodePoint) {
  DOUBLE = String.fromCodePoint(0x675c);
} else {
  DOUBLE = String.fromCharCode(0x675c);
}

// '𰀀' (U+30000): encoded as the surrogate pair D880 DC00.
var SURROGATE_DOUBLE;
if (String.fromCodePoint) {
  SURROGATE_DOUBLE = String.fromCodePoint(0x30000);
} else {
  SURROGATE_DOUBLE = String.fromCharCode(0xD880, 0xDC00);
}

// '𝌆' (U+1D306): encoded as the surrogate pair D834 DF06.
var SURROGATE_SINGLE;
if (String.fromCodePoint) {
  SURROGATE_SINGLE = String.fromCodePoint(0x1D306);
} else {
  SURROGATE_SINGLE = String.fromCharCode(0xD834, 0xDF06);
}
// At cols=44, the bug that is avoided by this occurs: // At cols=44, the bug that is avoided by this occurs:
// || angles[line[x + 1][1]]) { // || angles[line[x + 1][1]]) {
@ -60,8 +78,9 @@ var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque'
+ ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non' + ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non'
+ ' legantur'; + ' legantur';
lorem = lorem.replace(/e/gi, DW); lorem = lorem.replace(/e/gi, DOUBLE);
lorem = lorem.replace(/a/gi, DW2); lorem = lorem.replace(/a/gi, SURROGATE_DOUBLE);
lorem = lorem.replace(/o/gi, SURROGATE_SINGLE);
var main = blessed.box({ var main = blessed.box({
parent: screen, parent: screen,