From ea4e142757ee8a36c6b1dd07c40a8f1566b48169 Mon Sep 17 00:00:00 2001 From: Christopher Jeffrey Date: Wed, 15 Apr 2015 10:05:34 -0700 Subject: [PATCH] add support for surrogate pairs. fixes #123. see #4 and slap-editor/slap#107. --- README.md | 11 +-- lib/widget.js | 163 +++++++++++++++++++++++++++++++-------------- test/widget-eaw.js | 31 +++++++-- 3 files changed, 143 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index 5b6a4fd..8fbd044 100644 --- a/README.md +++ b/README.md @@ -264,11 +264,12 @@ The screen on which every other node renders. matter whether the keys are locked. - __dockBorders__ - automatically "dock" borders with other elements instead of overlapping, depending on position (__experimental__). for example: -- __doubleWidth__ - allow for rendering of East Asian double-width characters. - this is behind an option because it may affect performance negatively. -- __doubleWidthPerfect__ - handle high code point double-width characters, - without this option, high code point double width characters just show up as - `?`. that being said, this option will slow content parsing a fair amount. +- __fullUnicode__ - allow for rendering of East Asian double-width characters. + also enable proper rendering of utf-16 surrogate pairs. this allows you to + display text above the basic multilingual plane. this is behind an option + because it may affect performance slightly negatively. without this option + enabled, all double-width and surrogate pair characters will be replaced by + `??` and `?` respectively. These border-overlapped elements: diff --git a/lib/widget.js b/lib/widget.js index f0b2052..0e71fd3 100644 --- a/lib/widget.js +++ b/lib/widget.js @@ -290,6 +290,9 @@ function Screen(options) { this.ignoreLocked = options.ignoreLocked || []; + this._unicode = this.tput.unicode || this.tput.numbers.U8 === 1; + this.fullUnicode = this.options.fullUnicode && this._unicode; + this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff; this.renders = 0; @@ -979,7 +982,8 @@ Screen.prototype.draw = function(start, end) { , fg , bg , flags - , cwid; + , cwid + , point; var main = '' , pre @@ -1196,25 +1200,42 @@ Screen.prototype.draw = function(start, end) { // If we find a double-width char, eat the next character which should be // a space due to parseContent's behavior. - if (this.options.doubleWidth && (this.tput.unicode || this.tput.numbers.U8 === 1)) { - cwid = east_asian_width.char_width(line[x][1].codePointAt(0)); - if (cwid === 2) { - // Might also need: - // `|| line[x + 1][0] !== line[x][0]` for borderless boxes? - if (x === line.length - 1 || angles[line[x + 1][1]]) { + if (this.fullUnicode) { + // If this is a surrogate pair double-width char, we can ignore it + // because parseContent already counted it as length=2. + point = line[x][1].codePointAt(0); + // if (line[x][1].length === 1) { + if (point <= 0xffff) { + cwid = east_asian_width.char_width(point); + if (cwid === 2) { + // Might also need: + // `|| line[x + 1][0] !== line[x][0]` for borderless boxes? + if (x === line.length - 1 || angles[line[x + 1][1]]) { + ch = ' '; + o[x][1] = ' '; + } else { + o[++x][1] = ' '; + } + } else if (cwid === 0) { ch = ' '; - o[x][1] = ' '; - } else { - o[++x][1] = ' '; } - } else if (cwid === 0) { - // No real way to do this right now: - // ch = ''; - ch = ' '; - o[x][1] = ' '; } } + // Find surrogate pairs that have been split: + // Pad after each character instead of end of line. + // XXX Doesn't work well. + // if (this.fullUnicode) { + // var code = line[x][1].charCodeAt(0); + // if (code >= 0xd800 && code <= 0xdbff) { + // var code2 = line[x + 1][1].charCodeAt(0); + // if (code2 >= 0xdc00 && code2 <= 0xdfff) { + // ch = line[x][1] + line[x + 1][1]; + // line[x + 1][1] = ' '; + // } + // } + // } + // Attempt to use ACS for supported characters. // This is not ideal, but it's how ncurses works. // There are a lot of terminals that support ACS @@ -1228,7 +1249,9 @@ Screen.prototype.draw = function(start, end) { // supports UTF8, but I imagine it's unlikely. // Maybe remove !this.tput.unicode check, however, // this seems to be the way ncurses does it. - if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS) { + if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS + // Necessary for handling unicode when not supported: + && (this.tput.acscr[ch] || acs)) { // Fun fact: even if this.tput.brokenACS wasn't checked here, // the linux console would still work fine because the acs // table would fail the check of: this.tput.acscr[ch] @@ -2321,42 +2344,39 @@ Element.prototype.parseContent = function(noTags) { .replace(/\r\n|\r/g, '\n') .replace(/\t/g, this.screen.tabc); - if (this.screen.options.doubleWidth - && (this.screen.tput.unicode - || this.screen.tput.numbers.U8 === 1)) { - // double-width chars will eat the next char after render - create a - // blank character after it so it doesn't eat the real next char + if (this.screen.fullUnicode) { + // double-width chars will eat the next char after render. create a + // blank character after it so it doesn't eat the real next char. content = content.replace(wideChars, '$1 '); } else { - // no double-width. replace double-width chars with question-marks. - // NOTE: could use two chars: '? ' depending on what is intended. - // if we did, we could remove the unicode checks above in this if - // statement. - content = content.replace(wideChars, '?'); + // no double-width or surrogate pairs: replace them with question-marks. + content = content.replace(wideChars, '??'); + content = content.replace(/[\ud800-\udbff][\udc00-\udfff]/g, '?'); } - if (this.screen.options.doubleWidthPerfect) { - var _content = content; - content = ''; - for (var i = 0; i < _content.length; i++) { - var point = _content.codePointAt(i); - if ((point >= 0x20000 && point <= 0x2fffd) - || (point >= 0x30000 && point <= 0x3fffd)) { - if (this.screen.options.doubleWidth - && (this.screen.tput.unicode - || this.screen.tput.numbers.U8 === 1)) { - content += _content[i] + ' '; - } else { - // NOTE: could use two chars: '? ' depending on what is intended. - // if we did, we could remove the unicode checks above in this if - // statement. - content += '?'; - } - } else { - content += _content[i]; - } - } - } + // XXX Because 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are surrogate pairs, + // the length is computed as `2` in javascript, which coincidentally helps + // since they are double width. This may not be necessary. + // if (this.screen.options.fullUnicode) { + // var _content = content; + // content = ''; + // for (var i = 0; i < _content.length; i++) { + // var point = _content.codePointAt(i); + // if ((point >= 0x20000 && point <= 0x2fffd) + // || (point >= 0x30000 && point <= 0x3fffd)) { + // if (this.screen._unicode) { + // content += _content[i] + ' '; + // } else { + // // NOTE: could use two chars: '? ' depending on what is intended. + // // if we did, we could remove the unicode checks above in this if + // // statement. + // content += '?'; + // } + // } else { + // content += _content[i]; + // } + // } + // } if (!noTags) { content = this._parseTags(content); @@ -2677,6 +2697,30 @@ main: : current; }, 0); + // Find all surrogate pairs and compensate for the lack of width + // on the line by padding with trailing spaces: + if (this.screen.fullUnicode) { + for (var i = 0; i < out.length; i++) { + // NOTE: Happens at 54 cols with all chars enabled in test. + // Check to see if surrogates got split on end and beginning of 2 lines. + if (/[\ud800-\udbff]$/.exec(out[i]) + && /^[\udc00-\udfff]/.exec(out[i + 1])) { + out[i] = out[i] + out[i + 1][0]; + out[i + 1] = out[i + 1].substring(1) + ' '; + } + // Pad the end of the lines if the surrogate is not a double-width char. + // var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length; + var surrogates = out[i].match(/[\ud800-\udbff][\udc00-\udfff]/g); + if (surrogates && surrogates.length) { + for (var j = 0; j < surrogates.length; j++) { + if (east_asian_width.char_width(surrogates[j].codePointAt(0)) === 1) { + out[i] += ' '; + } + } + } + } + } + return out; }; @@ -3991,6 +4035,22 @@ Element.prototype.render = function() { continue; } + // Handle surrogate pairs: + // Make sure we put surrogate pair chars in one cell. + if (this.screen.fullUnicode) { + var code = content[ci - 1].charCodeAt(0); + // if (content.codePointAt(ci - 1) > 0xffff) { + if (code >= 0xd800 && code <= 0xdbff) { + var code2 = (content[ci] || '').charCodeAt(0); + if (code2 >= 0xdc00 && code2 <= 0xdfff) { + ch = content[ci - 1] + content[ci]; + ci++; + } else { + ch = bch; + } + } + } + if (this.style.transparent) { lines[y][x][0] = blend(attr, lines[y][x][0]); if (content[ci]) lines[y][x][1] = ch; @@ -8614,6 +8674,10 @@ function hsort(obj) { }); } +// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this +// regex anyway. This regex is used to put a blank char after wide chars to +// be eaten, however, if this is a surrogate pair, parseContent already adds +// the extra one char because its length equals 2 instead of 1. var wideChars = new RegExp('(' // 0x20000 - 0x2fffd: // + '[\\ud840-\\ud87f][\\udc00-\\udffd]' @@ -8631,9 +8695,6 @@ var wideChars = new RegExp('(' + '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */ + '\\uff00-\\uff60' /* Fullwidth Forms */ + '\\uffe0-\\uffe6' - // XXX Cannot implement these in a regex. Not perfect, but the layout will - // still not be negatively affected by double-width chars in this range, - // however, the next char on the screen will be eaten. // + '\\u20000-\\u2fffd' // + '\\u30000-\\u3fffd' + ']' diff --git a/test/widget-eaw.js b/test/widget-eaw.js index 3162c0b..64da180 100644 --- a/test/widget-eaw.js +++ b/test/widget-eaw.js @@ -5,12 +5,30 @@ screen = blessed.screen({ dump: __dirname + '/logs/eaw.log', smartCSR: true, dockBorders: true, - doubleWidth: true, - doubleWidthPerfect: true + fullUnicode: true }); -var DW = '杜'; -var DW2 = String.fromCodePoint ? String.fromCodePoint(0x30000) : 'a'; +// screen.options.fullUnicode = false; +// screen.fullUnicode = false; +// screen._unicode = false; +// screen.tput.unicode = false; +// screen.tput.numbers.U8 = -1; +// screen.tput.strings.enter_alt_charset_mode = false; + +// var DOUBLE = '杜'; +var DOUBLE = String.fromCodePoint + ? String.fromCodePoint(0x675c) + : String.fromCharCode(0x675c); + +// var SURROGATE_DOUBLE = '𰀀'; +var SURROGATE_DOUBLE = String.fromCodePoint + ? String.fromCodePoint(0x30000) + : String.fromCharCode(0xD880, 0xDC00); + +// var SURROGATE_SINGLE = '𝌆'; +var SURROGATE_SINGLE = String.fromCodePoint + ? String.fromCodePoint(0x1D306) + : String.fromCharCode(0xD834, 0xDF06); // At cols=44, the bug that is avoided by this occurs: // || angles[line[x + 1][1]]) { @@ -60,8 +78,9 @@ var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque' + ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non' + ' legantur'; -lorem = lorem.replace(/e/gi, DW); -lorem = lorem.replace(/a/gi, DW2); +lorem = lorem.replace(/e/gi, DOUBLE); +lorem = lorem.replace(/a/gi, SURROGATE_DOUBLE); +lorem = lorem.replace(/o/gi, SURROGATE_SINGLE); var main = blessed.box({ parent: screen,