256 lines
6.8 KiB
Go
256 lines
6.8 KiB
Go
package abispec
|
||
|
||
import (
|
||
"fmt"
|
||
"unicode/utf8"
|
||
|
||
"github.com/ethereum/go-ethereum/common/hexutil"
|
||
)
|
||
|
||
// stringToRunes splits str into runes. Well-formed UTF-8 sequences become the
// code point they encode; every byte that is not part of a well-formed
// sequence becomes a rune holding that byte's value (0x00-0xFF).
//
// utf8.DecodeRuneInString reports a decoding failure as (utf8.RuneError, 1).
// A correctly encoded U+FFFD also yields utf8.RuneError but with a size of 3,
// so the size has to be checked as well; otherwise a legitimate U+FFFD in the
// input would be broken up into its three raw bytes.
//
// An empty string yields a nil slice.
func stringToRunes(str string) []rune {
	if len(str) == 0 {
		return nil
	}
	// RuneCountInString counts each invalid byte as one rune, which is exactly
	// the number of elements produced below.
	runes := make([]rune, 0, utf8.RuneCountInString(str))
	for i := 0; i < len(str); {
		r, size := utf8.DecodeRuneInString(str[i:])
		if r == utf8.RuneError && size == 1 {
			// Invalid byte: keep its raw value instead of U+FFFD.
			r = rune(str[i])
		}
		runes = append(runes, r)
		i += size
	}
	return runes
}
|
||
|
||
// Taken from https://mths.be/punycode
|
||
func ucs2decode(str string) []rune {
|
||
var runes = stringToRunes(str)
|
||
var output []rune
|
||
var counter = 0
|
||
var length = len(runes)
|
||
var value rune
|
||
var extra rune
|
||
for counter < length {
|
||
value = runes[counter]
|
||
counter++
|
||
if value >= 0xD800 && value <= 0xDBFF && counter < length {
|
||
// high surrogate, and there is a next character
|
||
extra = runes[counter]
|
||
counter++
|
||
if (extra & 0xFC00) == 0xDC00 { // low surrogate
|
||
output = append(output, ((value&0x3FF)<<10)+(extra&0x3FF)+0x10000)
|
||
} else {
|
||
// unmatched surrogate; only append this code unit, in case the next
|
||
// code unit is the high surrogate of a surrogate pair
|
||
output = append(output, value)
|
||
counter--
|
||
}
|
||
} else {
|
||
output = append(output, value)
|
||
}
|
||
}
|
||
return output
|
||
}
|
||
|
||
// ucs2encode serialises the code points in array as UTF-8 bytes.
//
// The JavaScript original (https://mths.be/punycode) splits code points above
// 0xFFFF into a UTF-16 surrogate pair because JavaScript strings are UTF-16.
// Go strings are UTF-8, and utf8.EncodeRune writes U+FFFD for any surrogate
// value, so splitting would turn every supplementary-plane character into two
// replacement characters. Each code point is therefore encoded directly.
//
// Values that are not valid Unicode scalar values (surrogates, or anything
// above 0x10FFFF) are written as U+FFFD, which is what utf8.EncodeRune does
// for out-of-range runes. An empty input yields a nil slice.
func ucs2encode(array []rune) []byte {
	if len(array) == 0 {
		return nil
	}
	var scratch [utf8.UTFMax]byte
	// At least one byte per code point; append grows the slice as needed.
	output := make([]byte, 0, len(array))
	for _, codePoint := range array {
		n := utf8.EncodeRune(scratch[:], codePoint)
		output = append(output, scratch[:n]...)
	}
	return output
}
|
||
|
||
// appendBytes appends every element of bytes to dest and returns the
// resulting slice. As with the built-in append, the result may share dest's
// backing array when there is spare capacity.
func appendBytes(dest []byte, bytes []byte) []byte {
	return append(dest, bytes...)
}
|
||
|
||
func checkScalarValue(codePoint rune) error {
|
||
if codePoint >= 0xD800 && codePoint <= 0xDFFF {
|
||
return fmt.Errorf("lone surrogate U+%s is not a scalar value", hexutil.EncodeUint64(uint64(codePoint)))
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// stringFromCharCode returns the UTF-8 encoding of codePoint in a freshly
// allocated slice. utf8.EncodeRune substitutes U+FFFD for runes it considers
// invalid (surrogates and out-of-range values), so those inputs yield the
// three-byte encoding of the replacement character.
func stringFromCharCode(codePoint rune) []byte {
	encoded := make([]byte, utf8.UTFMax)
	return encoded[:utf8.EncodeRune(encoded, codePoint)]
}
|
||
|
||
func createByte(codePoint rune, shift uint32) []byte {
|
||
return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80)
|
||
}
|
||
|
||
func encodeCodePoint(codePoint rune) ([]byte, error) {
|
||
if (uint32(codePoint) & uint32(0xFFFFFF80)) == 0 { // 1-byte sequence
|
||
return stringFromCharCode(codePoint), nil
|
||
}
|
||
var symbol []byte
|
||
if uint32(codePoint)&0xFFFFF800 == 0 { // 2-byte sequence
|
||
symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0)
|
||
} else if (uint32(codePoint) & 0xFFFF0000) == 0 { // 3-byte sequence
|
||
err := checkScalarValue(codePoint)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0)
|
||
symbol = appendBytes(symbol, createByte(codePoint, 6))
|
||
} else if (uint32(codePoint) & 0xFFE00000) == 0 { // 4-byte sequence
|
||
symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0)
|
||
symbol = appendBytes(symbol, createByte(codePoint, 12))
|
||
symbol = appendBytes(symbol, createByte(codePoint, 6))
|
||
}
|
||
symbol = appendBytes(symbol, stringFromCharCode((codePoint&0x3F)|0x80))
|
||
return symbol, nil
|
||
}
|
||
|
||
// implementation referenced from https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
|
||
func Utf8encode(str string) (string, error) {
|
||
var codePoints = ucs2decode(str)
|
||
var length = len(codePoints)
|
||
var index = 0
|
||
var codePoint rune
|
||
var bytes []byte
|
||
for index < length {
|
||
codePoint = codePoints[index]
|
||
cps, err := encodeCodePoint(codePoint)
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
bytes = appendBytes(bytes, cps)
|
||
index++
|
||
}
|
||
return string(bytes), nil
|
||
}
|
||
|
||
// readContinuationByte consumes one element of byteArray at *pByteIndex and
// interprets its low eight bits as a UTF-8 continuation octet (10xxxxxx).
//
// On success it returns the six payload bits. It returns utf8.RuneError and
// an error when *pByteIndex is already at or past byteCount (the index is
// left untouched), or when the octet does not carry the continuation marker
// (the index has already been advanced past it).
func readContinuationByte(byteArray []rune, byteCount int, pByteIndex *int) (rune, error) {
	pos := *pByteIndex
	if pos >= byteCount {
		return utf8.RuneError, fmt.Errorf("invalid byte index")
	}

	octet := byteArray[pos] & 0xFF
	*pByteIndex = pos + 1

	if octet&0xC0 != 0x80 {
		// Not a continuation byte.
		return utf8.RuneError, fmt.Errorf("invalid continuation byte")
	}
	return octet & 0x3F, nil
}
|
||
|
||
func decodeSymbol(byteArray []rune, byteCount int, pByteIndex *int) (rune, bool, error) {
|
||
var byte1 rune
|
||
var codePoint rune
|
||
|
||
if *pByteIndex > byteCount {
|
||
return utf8.RuneError, false, fmt.Errorf("invalid byte index")
|
||
}
|
||
|
||
if *pByteIndex == byteCount {
|
||
return utf8.RuneError, false, nil
|
||
}
|
||
|
||
// Read first byte
|
||
byte1 = byteArray[*pByteIndex] & 0xFF
|
||
*pByteIndex = *pByteIndex + 1
|
||
|
||
// 1-byte sequence (no continuation bytes)
|
||
if (byte1 & 0x80) == 0 {
|
||
return byte1, true, nil
|
||
}
|
||
|
||
// 2-byte sequence
|
||
if (byte1 & 0xE0) == 0xC0 {
|
||
byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
codePoint = ((byte1 & 0x1F) << 6) | byte2
|
||
if codePoint >= 0x80 {
|
||
return codePoint, true, nil
|
||
}
|
||
return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
|
||
}
|
||
|
||
// 3-byte sequence (may include unpaired surrogates)
|
||
if (byte1 & 0xF0) == 0xE0 {
|
||
byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
byte3, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3
|
||
if codePoint >= 0x0800 {
|
||
err := checkScalarValue(codePoint)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
return codePoint, true, nil
|
||
}
|
||
return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
|
||
}
|
||
|
||
// 4-byte sequence
|
||
if (byte1 & 0xF8) == 0xF0 {
|
||
byte2, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
byte3, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
byte4, err := readContinuationByte(byteArray, byteCount, pByteIndex)
|
||
if err != nil {
|
||
return utf8.RuneError, false, err
|
||
}
|
||
codePoint = ((byte1 & 0x07) << 0x12) | (byte2 << 0x0C) |
|
||
(byte3 << 0x06) | byte4
|
||
if codePoint >= 0x010000 && codePoint <= 0x10FFFF {
|
||
return codePoint, true, nil
|
||
}
|
||
}
|
||
|
||
return utf8.RuneError, false, fmt.Errorf("invalid UTF-8 detected")
|
||
}
|
||
|
||
// implementation referenced from https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
|
||
func Utf8decode(str string) ([]byte, error) {
|
||
byteArray := ucs2decode(str)
|
||
byteCount := len(byteArray)
|
||
byteIndex := 0
|
||
var codePoints []rune
|
||
for true {
|
||
codePoint, goOn, err := decodeSymbol(byteArray, byteCount, &byteIndex)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if !goOn {
|
||
break
|
||
}
|
||
codePoints = append(codePoints, codePoint)
|
||
}
|
||
return ucs2encode(codePoints), nil
|
||
}
|