add support for surrogate pairs. fixes #123. see #4 and slap-editor/slap#107.

2025-02-23 08:08:16 +00:00 · 2015-04-15 10:05:34 -07:00 · 2015-04-15 10:05:34 -07:00 · ea4e142757
commit ea4e142757
parent 1b1775a4c2
3 changed files with 143 additions and 62 deletions
--- a/README.md
+++ b/README.md
@ -264,11 +264,12 @@ The screen on which every other node renders.
  matter whether the keys are locked.
 - __dockBorders__ - automatically "dock" borders with other elements instead of
  overlapping, depending on position (__experimental__). for example:
- __doubleWidth__ - allow for rendering of East Asian double-width characters.
+- __fullUnicode__ - allow for rendering of East Asian double-width characters.
-  this is behind an option because it may affect performance negatively.
+  also enable proper rendering of UTF-16 surrogate pairs. this allows you to
- __doubleWidthPerfect__ - handle high code point double-width characters,
+  display text above the basic multilingual plane. this is behind an option
-  without this option, high code point double width characters just show up as
+  because it may slightly degrade performance. without this option
-  `?`. that being said, this option will slow content parsing a fair amount.
+  enabled, all double-width and surrogate pair characters will be replaced by
  `??` and `?` respectively.
 These border-overlapped elements:
--- a/lib/widget.js
+++ b/lib/widget.js
@ -290,6 +290,9 @@ function Screen(options) {
  this.ignoreLocked = options.ignoreLocked || [];
  this._unicode = this.tput.unicode || this.tput.numbers.U8 === 1;
  this.fullUnicode = this.options.fullUnicode && this._unicode;
  this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff;
  this.renders = 0;
@ -979,7 +982,8 @@ Screen.prototype.draw = function(start, end) {
    , fg
    , bg
    , flags
-    , cwid;
+    , cwid
    , point;
  var main = ''
    , pre
@ -1196,8 +1200,13 @@ Screen.prototype.draw = function(start, end) {
      // If we find a double-width char, eat the next character which should be
      // a space due to parseContent's behavior.
-      if (this.options.doubleWidth && (this.tput.unicode || this.tput.numbers.U8 === 1)) {
+      if (this.fullUnicode) {
-        cwid = east_asian_width.char_width(line[x][1].codePointAt(0));
+        // If this is a surrogate pair double-width char, we can ignore it
        // because parseContent already counted it as length=2.
        point = line[x][1].codePointAt(0);
        // if (line[x][1].length === 1) {
        if (point <= 0xffff) {
          cwid = east_asian_width.char_width(point);
          if (cwid === 2) {
            // Might also need:
            // `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
@ -1208,12 +1217,24 @@ Screen.prototype.draw = function(start, end) {
              o[++x][1] = ' ';
            }
          } else if (cwid === 0) {
          // No real way to do this right now:
          // ch = '';
            ch = ' ';
          o[x][1] = ' ';
          }
        }
      }
      // Find surrogate pairs that have been split:
      // Pad after each character instead of end of line.
      // XXX Doesn't work well.
      // if (this.fullUnicode) {
      //   var code = line[x][1].charCodeAt(0);
      //   if (code >= 0xd800 && code <= 0xdbff) {
      //     var code2 = line[x + 1][1].charCodeAt(0);
      //     if (code2 >= 0xdc00 && code2 <= 0xdfff) {
      //       ch = line[x][1] + line[x + 1][1];
      //       line[x + 1][1] = ' ';
      //     }
      //   }
      // }
      // Attempt to use ACS for supported characters.
      // This is not ideal, but it's how ncurses works.
@ -1228,7 +1249,9 @@ Screen.prototype.draw = function(start, end) {
      // supports UTF8, but I imagine it's unlikely.
      // Maybe remove !this.tput.unicode check, however,
      // this seems to be the way ncurses does it.
-      if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS) {
+      if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS
        // Necessary for handling unicode when not supported:
        && (this.tput.acscr[ch] || acs)) {
        // Fun fact: even if this.tput.brokenACS wasn't checked here,
        // the linux console would still work fine because the acs
        // table would fail the check of: this.tput.acscr[ch]
@ -2321,42 +2344,39 @@ Element.prototype.parseContent = function(noTags) {
      .replace(/\r\n|\r/g, '\n')
      .replace(/\t/g, this.screen.tabc);
-    if (this.screen.options.doubleWidth
+    if (this.screen.fullUnicode) {
-        && (this.screen.tput.unicode
+      // double-width chars will eat the next char after render. create a
-        || this.screen.tput.numbers.U8 === 1)) {
+      // blank character after it so it doesn't eat the real next char.
      // double-width chars will eat the next char after render - create a
      // blank character after it so it doesn't eat the real next char
      content = content.replace(wideChars, '$1 ');
    } else {
-      // no double-width. replace double-width chars with question-marks.
+      // no double-width or surrogate pairs: replace them with question-marks.
-      // NOTE: could use two chars: '? ' depending on what is intended.
+      content = content.replace(wideChars, '??');
-      // if we did, we could remove the unicode checks above in this if
+      content = content.replace(/[\ud800-\udbff][\udc00-\udfff]/g, '?');
      // statement.
      content = content.replace(wideChars, '?');
    }
-    if (this.screen.options.doubleWidthPerfect) {
+    // XXX Because 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are surrogate pairs,
-      var _content = content;
+    // the length is computed as `2` in javascript, which coincidentally helps
-      content = '';
+    // since they are double width. This may not be necessary.
-      for (var i = 0; i < _content.length; i++) {
+    // if (this.screen.options.fullUnicode) {
-        var point = _content.codePointAt(i);
+    //   var _content = content;
-        if ((point >= 0x20000 && point <= 0x2fffd)
+    //   content = '';
-          || (point >= 0x30000 && point <= 0x3fffd)) {
+    //   for (var i = 0; i < _content.length; i++) {
-          if (this.screen.options.doubleWidth
+    //     var point = _content.codePointAt(i);
-              && (this.screen.tput.unicode
+    //     if ((point >= 0x20000 && point <= 0x2fffd)
-              || this.screen.tput.numbers.U8 === 1)) {
+    //       || (point >= 0x30000 && point <= 0x3fffd)) {
-            content += _content[i] + ' ';
+    //       if (this.screen._unicode) {
-          } else {
+    //         content += _content[i] + ' ';
-            // NOTE: could use two chars: '? ' depending on what is intended.
+    //       } else {
-            // if we did, we could remove the unicode checks above in this if
+    //         // NOTE: could use two chars: '? ' depending on what is intended.
-            // statement.
+    //         // if we did, we could remove the unicode checks above in this if
-            content += '?';
+    //         // statement.
-          }
+    //         content += '?';
-        } else {
+    //       }
-          content += _content[i];
+    //     } else {
-        }
+    //       content += _content[i];
-      }
+    //     }
-    }
+    //   }
    // }
    if (!noTags) {
      content = this._parseTags(content);
@ -2677,6 +2697,30 @@ main:
      : current;
  }, 0);
  // Find all surrogate pairs and compensate for the lack of width
  // on the line by padding with trailing spaces:
  if (this.screen.fullUnicode) {
    for (var i = 0; i < out.length; i++) {
      // NOTE: Happens at 54 cols with all chars enabled in test.
      // Check to see if surrogates got split on end and beginning of 2 lines.
      if (/[\ud800-\udbff]$/.exec(out[i])
        && /^[\udc00-\udfff]/.exec(out[i + 1])) {
        out[i] = out[i] + out[i + 1][0];
        out[i + 1] = out[i + 1].substring(1) + ' ';
      }
      // Pad the end of the lines if the surrogate is not a double-width char.
      // var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length;
      var surrogates = out[i].match(/[\ud800-\udbff][\udc00-\udfff]/g);
      if (surrogates && surrogates.length) {
        for (var j = 0; j < surrogates.length; j++) {
          if (east_asian_width.char_width(surrogates[j].codePointAt(0)) === 1) {
            out[i] += ' ';
          }
        }
      }
    }
  }
  return out;
 };
@ -3991,6 +4035,22 @@ Element.prototype.render = function() {
        continue;
      }
      // Handle surrogate pairs:
      // Make sure we put surrogate pair chars in one cell.
      if (this.screen.fullUnicode) {
        var code = content[ci - 1].charCodeAt(0);
        // if (content.codePointAt(ci - 1) > 0xffff) {
        if (code >= 0xd800 && code <= 0xdbff) {
          var code2 = (content[ci] || '').charCodeAt(0);
          if (code2 >= 0xdc00 && code2 <= 0xdfff) {
            ch = content[ci - 1] + content[ci];
            ci++;
          } else {
            ch = bch;
          }
        }
      }
      if (this.style.transparent) {
        lines[y][x][0] = blend(attr, lines[y][x][0]);
        if (content[ci]) lines[y][x][1] = ch;
@ -8614,6 +8674,10 @@ function hsort(obj) {
  });
 }
 // NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
 // regex anyway. This regex puts a blank char after each wide char (to be
 // eaten on render). A surrogate pair needs no such blank: parseContent
 // already counts it as two chars because its length equals 2 instead of 1.
 var wideChars = new RegExp('('
  // 0x20000 - 0x2fffd:
  // + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
@ -8631,9 +8695,6 @@ var wideChars = new RegExp('('
  + '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */
  + '\\uff00-\\uff60' /* Fullwidth Forms */
  + '\\uffe0-\\uffe6'
  // XXX Cannot implement these in a regex. Not perfect, but the layout will
  // still not be negatively affected by double-width chars in this range,
  // however, the next char on the screen will be eaten.
  // + '\\u20000-\\u2fffd'
  // + '\\u30000-\\u3fffd'
  + ']'
--- a/test/widget-eaw.js
+++ b/test/widget-eaw.js
@ -5,12 +5,30 @@ screen = blessed.screen({
  dump: __dirname + '/logs/eaw.log',
  smartCSR: true,
  dockBorders: true,
-  doubleWidth: true,
+  fullUnicode: true
  doubleWidthPerfect: true
 });
-var DW = '杜';
+// screen.options.fullUnicode = false;
-var DW2 = String.fromCodePoint ? String.fromCodePoint(0x30000) : 'a';
+// screen.fullUnicode = false;
 // screen._unicode = false;
 // screen.tput.unicode = false;
 // screen.tput.numbers.U8 = -1;
 // screen.tput.strings.enter_alt_charset_mode = false;
 // var DOUBLE = '杜';
 var DOUBLE = String.fromCodePoint
  ? String.fromCodePoint(0x675c)
  : String.fromCharCode(0x675c);
 // var SURROGATE_DOUBLE = '𰀀';
 var SURROGATE_DOUBLE = String.fromCodePoint
  ? String.fromCodePoint(0x30000)
  : String.fromCharCode(0xD880, 0xDC00);
 // var SURROGATE_SINGLE = '𝌆';
 var SURROGATE_SINGLE = String.fromCodePoint
  ? String.fromCodePoint(0x1D306)
  : String.fromCharCode(0xD834, 0xDF06);
 // At cols=44, the bug that is avoided by this occurs:
 // || angles[line[x + 1][1]]) {
@ -60,8 +78,9 @@ var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque'
 + ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non'
 + ' legantur';
-lorem = lorem.replace(/e/gi, DW);
+lorem = lorem.replace(/e/gi, DOUBLE);
-lorem = lorem.replace(/a/gi, DW2);
+lorem = lorem.replace(/a/gi, SURROGATE_DOUBLE);
 lorem = lorem.replace(/o/gi, SURROGATE_SINGLE);
 var main = blessed.box({
  parent: screen,