add support for surrogate pairs. fixes #123. see #4 and slap-editor/slap#107.

2025-02-22 15:48:07 +00:00 · 2015-04-15 10:05:34 -07:00 · 2015-04-15 10:05:34 -07:00 · ea4e142757
commit ea4e142757
parent 1b1775a4c2
3 changed files with 143 additions and 62 deletions
--- a/README.md
+++ b/README.md
@ -264,11 +264,12 @@ The screen on which every other node renders.
  matter whether the keys are locked.
 - __dockBorders__ - automatically "dock" borders with other elements instead of
  overlapping, depending on position (__experimental__). for example:
- __doubleWidth__ - allow for rendering of East Asian double-width characters.
-  this is behind an option because it may affect performance negatively.
- __doubleWidthPerfect__ - handle high code point double-width characters,
-  without this option, high code point double width characters just show up as
-  `?`. that being said, this option will slow content parsing a fair amount.
+- __fullUnicode__ - allow for rendering of East Asian double-width characters.
+  also enable proper rendering of UTF-16 surrogate pairs, which allows you to
+  display text above the basic multilingual plane. this is behind an option
+  because it may have a slight negative effect on performance. without this
+  option enabled, double-width characters will be replaced by `??` and
+  surrogate pair characters by `?`.

 These border-overlapped elements:

--- a/lib/widget.js
+++ b/lib/widget.js
@ -290,6 +290,9 @@ function Screen(options) {

  this.ignoreLocked = options.ignoreLocked || [];

+  this._unicode = this.tput.unicode || this.tput.numbers.U8 === 1;
+  this.fullUnicode = this.options.fullUnicode && this._unicode;
+
  this.dattr = ((0 << 18) | (0x1ff << 9)) | 0x1ff;

  this.renders = 0;
@ -979,7 +982,8 @@ Screen.prototype.draw = function(start, end) {
    , fg
    , bg
    , flags
-    , cwid;
+    , cwid
+    , point;

  var main = ''
    , pre
@ -1196,25 +1200,42 @@ Screen.prototype.draw = function(start, end) {

      // If we find a double-width char, eat the next character which should be
      // a space due to parseContent's behavior.
-      if (this.options.doubleWidth && (this.tput.unicode || this.tput.numbers.U8 === 1)) {
-        cwid = east_asian_width.char_width(line[x][1].codePointAt(0));
-        if (cwid === 2) {
-          // Might also need:
-          // `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
-          if (x === line.length - 1 || angles[line[x + 1][1]]) {
+      if (this.fullUnicode) {
+        // If this is a surrogate pair double-width char, we can ignore it
+        // because parseContent already counted it as length=2.
+        point = line[x][1].codePointAt(0);
+        // if (line[x][1].length === 1) {
+        if (point <= 0xffff) {
+          cwid = east_asian_width.char_width(point);
+          if (cwid === 2) {
+            // Might also need:
+            // `|| line[x + 1][0] !== line[x][0]` for borderless boxes?
+            if (x === line.length - 1 || angles[line[x + 1][1]]) {
+              ch = ' ';
+              o[x][1] = ' ';
+            } else {
+              o[++x][1] = ' ';
+            }
+          } else if (cwid === 0) {
            ch = ' ';
-            o[x][1] = ' ';
-          } else {
-            o[++x][1] = ' ';
          }
-        } else if (cwid === 0) {
-          // No real way to do this right now:
-          // ch = '';
-          ch = ' ';
-          o[x][1] = ' ';
        }
      }

+      // Find surrogate pairs that have been split:
+      // Pad after each character instead of end of line.
+      // XXX Doesn't work well.
+      // if (this.fullUnicode) {
+      //   var code = line[x][1].charCodeAt(0);
+      //   if (code >= 0xd800 && code <= 0xdbff) {
+      //     var code2 = line[x + 1][1].charCodeAt(0);
+      //     if (code2 >= 0xdc00 && code2 <= 0xdfff) {
+      //       ch = line[x][1] + line[x + 1][1];
+      //       line[x + 1][1] = ' ';
+      //     }
+      //   }
+      // }
+
      // Attempt to use ACS for supported characters.
      // This is not ideal, but it's how ncurses works.
      // There are a lot of terminals that support ACS
@ -1228,7 +1249,9 @@ Screen.prototype.draw = function(start, end) {
      // supports UTF8, but I imagine it's unlikely.
      // Maybe remove !this.tput.unicode check, however,
      // this seems to be the way ncurses does it.
-      if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS) {
+      if (this.tput.strings.enter_alt_charset_mode && !this.tput.brokenACS
+        // Necessary for handling unicode when unicode is not supported:
+        && (this.tput.acscr[ch] || acs)) {
        // Fun fact: even if this.tput.brokenACS wasn't checked here,
        // the linux console would still work fine because the acs
        // table would fail the check of: this.tput.acscr[ch]
@ -2321,42 +2344,39 @@ Element.prototype.parseContent = function(noTags) {
      .replace(/\r\n|\r/g, '\n')
      .replace(/\t/g, this.screen.tabc);

-    if (this.screen.options.doubleWidth
-        && (this.screen.tput.unicode
-        || this.screen.tput.numbers.U8 === 1)) {
-      // double-width chars will eat the next char after render - create a
-      // blank character after it so it doesn't eat the real next char
+    if (this.screen.fullUnicode) {
+      // double-width chars will eat the next char after render. create a
+      // blank character after it so it doesn't eat the real next char.
      content = content.replace(wideChars, '$1 ');
    } else {
-      // no double-width. replace double-width chars with question-marks.
-      // NOTE: could use two chars: '? ' depending on what is intended.
-      // if we did, we could remove the unicode checks above in this if
-      // statement.
-      content = content.replace(wideChars, '?');
+      // no double-width or surrogate pairs: replace them with question-marks.
+      content = content.replace(wideChars, '??');
+      content = content.replace(/[\ud800-\udbff][\udc00-\udfff]/g, '?');
    }

-    if (this.screen.options.doubleWidthPerfect) {
-      var _content = content;
-      content = '';
-      for (var i = 0; i < _content.length; i++) {
-        var point = _content.codePointAt(i);
-        if ((point >= 0x20000 && point <= 0x2fffd)
-          || (point >= 0x30000 && point <= 0x3fffd)) {
-          if (this.screen.options.doubleWidth
-              && (this.screen.tput.unicode
-              || this.screen.tput.numbers.U8 === 1)) {
-            content += _content[i] + ' ';
-          } else {
-            // NOTE: could use two chars: '? ' depending on what is intended.
-            // if we did, we could remove the unicode checks above in this if
-            // statement.
-            content += '?';
-          }
-        } else {
-          content += _content[i];
-        }
-      }
-    }
+    // XXX 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are surrogate pairs, so their
+    // length is `2` in javascript, which coincidentally helps since they are
+    // double width. The commented-out pass below may therefore be unnecessary.
+    // if (this.screen.options.fullUnicode) {
+    //   var _content = content;
+    //   content = '';
+    //   for (var i = 0; i < _content.length; i++) {
+    //     var point = _content.codePointAt(i);
+    //     if ((point >= 0x20000 && point <= 0x2fffd)
+    //       || (point >= 0x30000 && point <= 0x3fffd)) {
+    //       if (this.screen._unicode) {
+    //         content += _content[i] + ' ';
+    //       } else {
+    //         // NOTE: could use two chars: '? ' depending on what is intended.
+    //         // if we did, we could remove the unicode checks above in this if
+    //         // statement.
+    //         content += '?';
+    //       }
+    //     } else {
+    //       content += _content[i];
+    //     }
+    //   }
+    // }

    if (!noTags) {
      content = this._parseTags(content);
@ -2677,6 +2697,30 @@ main:
      : current;
  }, 0);

+  // Find all surrogate pairs and compensate for the lack of width
+  // on the line by padding with trailing spaces:
+  if (this.screen.fullUnicode) {
+    for (var i = 0; i < out.length; i++) {
+      // Rejoin a surrogate pair that was split across the end of this line and
+      // the start of the next. NOTE: Happens at 54 cols with all chars in test.
+      if (/[\ud800-\udbff]$/.exec(out[i])
+        && /^[\udc00-\udfff]/.exec(out[i + 1])) {
+        out[i] = out[i] + out[i + 1][0];
+        out[i + 1] = out[i + 1].substring(1) + ' ';
+      }
+      // Pad the end of the lines if the surrogate is not a double-width char.
+      // var surrogates = out[i].length - punycode.ucs2.decode(out[i]).length;
+      var surrogates = out[i].match(/[\ud800-\udbff][\udc00-\udfff]/g);
+      if (surrogates && surrogates.length) {
+        for (var j = 0; j < surrogates.length; j++) {
+          if (east_asian_width.char_width(surrogates[j].codePointAt(0)) === 1) {
+            out[i] += ' ';
+          }
+        }
+      }
+    }
+  }
+
  return out;
 };

@ -3991,6 +4035,22 @@ Element.prototype.render = function() {
        continue;
      }

+      // Handle surrogate pairs:
+      // Make sure we put surrogate pair chars in one cell.
+      if (this.screen.fullUnicode) {
+        var code = content[ci - 1].charCodeAt(0);
+        // if (content.codePointAt(ci - 1) > 0xffff) {
+        if (code >= 0xd800 && code <= 0xdbff) {
+          var code2 = (content[ci] || '').charCodeAt(0);
+          if (code2 >= 0xdc00 && code2 <= 0xdfff) {
+            ch = content[ci - 1] + content[ci];
+            ci++;
+          } else {
+            ch = bch;
+          }
+        }
+      }
+
      if (this.style.transparent) {
        lines[y][x][0] = blend(attr, lines[y][x][0]);
        if (content[ci]) lines[y][x][1] = ch;
@ -8614,6 +8674,10 @@ function hsort(obj) {
  });
 }

+// NOTE: 0x20000 - 0x2fffd and 0x30000 - 0x3fffd are not necessary for this
+// regex anyway. This regex is used to put a blank char after wide chars to
+// be eaten; however, a surrogate pair already takes up the extra char in
+// parseContent because its length equals 2 instead of 1.
 var wideChars = new RegExp('('
  // 0x20000 - 0x2fffd:
  // + '[\\ud840-\\ud87f][\\udc00-\\udffd]'
@ -8631,9 +8695,6 @@ var wideChars = new RegExp('('
  + '\\ufe30-\\ufe6f' /* CJK Compatibility Forms */
  + '\\uff00-\\uff60' /* Fullwidth Forms */
  + '\\uffe0-\\uffe6'
-  // XXX Cannot implement these in a regex. Not perfect, but the layout will
-  // still not be negatively affected by double-width chars in this range,
-  // however, the next char on the screen will be eaten.
  // + '\\u20000-\\u2fffd'
  // + '\\u30000-\\u3fffd'
  + ']'
--- a/test/widget-eaw.js
+++ b/test/widget-eaw.js
@ -5,12 +5,30 @@ screen = blessed.screen({
  dump: __dirname + '/logs/eaw.log',
  smartCSR: true,
  dockBorders: true,
-  doubleWidth: true,
-  doubleWidthPerfect: true
+  fullUnicode: true
 });

-var DW = '杜';
-var DW2 = String.fromCodePoint ? String.fromCodePoint(0x30000) : 'a';
+// screen.options.fullUnicode = false;
+// screen.fullUnicode = false;
+// screen._unicode = false;
+// screen.tput.unicode = false;
+// screen.tput.numbers.U8 = -1;
+// screen.tput.strings.enter_alt_charset_mode = false;
+
+// var DOUBLE = '杜';
+var DOUBLE = String.fromCodePoint
+  ? String.fromCodePoint(0x675c)
+  : String.fromCharCode(0x675c);
+
+// var SURROGATE_DOUBLE = '𰀀';
+var SURROGATE_DOUBLE = String.fromCodePoint
+  ? String.fromCodePoint(0x30000)
+  : String.fromCharCode(0xD880, 0xDC00);
+
+// var SURROGATE_SINGLE = '𝌆';
+var SURROGATE_SINGLE = String.fromCodePoint
+  ? String.fromCodePoint(0x1D306)
+  : String.fromCharCode(0xD834, 0xDF06);

 // At cols=44, the bug that is avoided by this occurs:
 // || angles[line[x + 1][1]]) {
@ -60,8 +78,9 @@ var lorem = 'Non eram nescius Brute cum quae summis ingeniis exquisitaque'
 + ' isdem de rebus alia ratione compositis quid est cur nostri a nostris non'
 + ' legantur';

-lorem = lorem.replace(/e/gi, DW);
-lorem = lorem.replace(/a/gi, DW2);
+lorem = lorem.replace(/e/gi, DOUBLE);
+lorem = lorem.replace(/a/gi, SURROGATE_DOUBLE);
+lorem = lorem.replace(/o/gi, SURROGATE_SINGLE);

 var main = blessed.box({
  parent: screen,