status-go/abi-spec/utf8.go

256 lines
6.8 KiB
Go
Raw Normal View History

package abispec
import (
"fmt"
"unicode/utf8"
"github.com/ethereum/go-ethereum/common/hexutil"
)
// stringToRunes decodes str into a slice of runes.
//
// Well-formed UTF-8 sequences are decoded to their code point. A byte that
// does not start a well-formed sequence is passed through as a rune carrying
// that byte's numeric value, so no input byte is ever dropped.
//
// It returns nil for an empty string.
func stringToRunes(str string) []rune {
	var runes []rune
	for len(str) > 0 {
		r, size := utf8.DecodeRuneInString(str)
		// An invalid encoding is reported as (RuneError, 1). A RuneError with
		// a larger size is a genuine, correctly encoded U+FFFD and must be
		// kept as a single rune rather than being split into its raw bytes.
		if r == utf8.RuneError && size == 1 {
			runes = append(runes, rune(str[0]))
		} else {
			runes = append(runes, r)
		}
		str = str[size:]
	}
	return runes
}
// ucs2decode converts str into a slice of code points, merging UTF-16
// surrogate pairs into a single supplementary code point.
//
// The string is first split with stringToRunes. A high surrogate
// (0xD800-0xDBFF) immediately followed by a low surrogate (0xDC00-0xDFFF) is
// combined; an unmatched high surrogate is emitted as-is and the following
// unit is re-examined on its own.
//
// Taken from https://mths.be/punycode
func ucs2decode(str string) []rune {
	units := stringToRunes(str)
	var decoded []rune
	for i := 0; i < len(units); i++ {
		unit := units[i]
		isHigh := unit >= 0xD800 && unit <= 0xDBFF
		if !isHigh || i+1 >= len(units) {
			// Ordinary unit, or a high surrogate with nothing after it.
			decoded = append(decoded, unit)
			continue
		}
		next := units[i+1]
		if next&0xFC00 != 0xDC00 {
			// Unmatched surrogate: emit only this unit and leave the next one
			// unconsumed, since it may itself start a surrogate pair.
			decoded = append(decoded, unit)
			continue
		}
		decoded = append(decoded, ((unit&0x3FF)<<10)+(next&0x3FF)+0x10000)
		i++ // the low surrogate has been consumed as part of the pair
	}
	return decoded
}
// ucs2encode serializes a slice of code points into UTF-8 bytes.
//
// The JavaScript original (https://mths.be/punycode) splits code points above
// U+FFFF into a UTF-16 surrogate pair because JavaScript strings are UTF-16.
// The output here is UTF-8, where surrogate halves cannot be encoded:
// utf8.EncodeRune substitutes U+FFFD for each half, which turned every
// supplementary character into two replacement characters. Each code point is
// therefore encoded directly instead.
//
// Values that are not valid code points (surrogate halves, out-of-range
// values) are written as U+FFFD by utf8.EncodeRune. The result is nil when
// array is empty.
func ucs2encode(array []rune) []byte {
	var output []byte
	var scratch [utf8.UTFMax]byte
	for _, codePoint := range array {
		n := utf8.EncodeRune(scratch[:], codePoint)
		output = append(output, scratch[:n]...)
	}
	return output
}
// appendBytes appends every byte of bytes to dest and returns the extended
// slice. As with the built-in append, the result may or may not share its
// backing array with dest.
func appendBytes(dest []byte, bytes []byte) []byte {
	return append(dest, bytes...)
}
// checkScalarValue reports an error when codePoint lies in the UTF-16
// surrogate range 0xD800-0xDFFF, which is not a Unicode scalar value.
// It returns nil for every other value.
func checkScalarValue(codePoint rune) error {
	if codePoint < 0xD800 || codePoint > 0xDFFF {
		return nil
	}
	hex := hexutil.EncodeUint64(uint64(codePoint))
	return fmt.Errorf("lone surrogate U+%s is not a scalar value", hex)
}
// stringFromCharCode returns the UTF-8 encoding of codePoint as produced by
// utf8.EncodeRune, which is between one and four bytes long.
func stringFromCharCode(codePoint rune) []byte {
	var scratch [utf8.UTFMax]byte
	written := utf8.EncodeRune(scratch[:], codePoint)
	return scratch[:written]
}
// createByte builds one UTF-8 continuation byte (10xxxxxx) from the six bits
// of codePoint that start at bit position shift, and returns it in the form
// produced by stringFromCharCode.
func createByte(codePoint rune, shift uint32) []byte {
	payload := (codePoint >> shift) & 0x3F
	return stringFromCharCode(payload | 0x80)
}
// encodeCodePoint produces the UTF-8 byte sequence for codePoint, with each
// resulting byte value emitted through stringFromCharCode.
//
// Code points below 0x80 are returned as a single unit. Larger values get a
// lead byte, the intermediate continuation bytes, and a final continuation
// byte holding the low six bits. In the three-byte range the value is checked
// with checkScalarValue, and that error is returned when the check fails.
func encodeCodePoint(codePoint rune) ([]byte, error) {
	bits := uint32(codePoint)
	if bits&0xFFFFFF80 == 0 { // 1-byte sequence
		return stringFromCharCode(codePoint), nil
	}

	var symbol []byte
	switch {
	case bits&0xFFFFF800 == 0: // 2-byte sequence
		symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0)
	case bits&0xFFFF0000 == 0: // 3-byte sequence
		if err := checkScalarValue(codePoint); err != nil {
			return nil, err
		}
		symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0)
		symbol = append(symbol, createByte(codePoint, 6)...)
	case bits&0xFFE00000 == 0: // 4-byte sequence
		symbol = stringFromCharCode(((codePoint >> 18) & 0x07) | 0xF0)
		symbol = append(symbol, createByte(codePoint, 12)...)
		symbol = append(symbol, createByte(codePoint, 6)...)
	}

	// Every multi-byte form ends with a continuation byte for the low six bits.
	trailer := stringFromCharCode((codePoint & 0x3F) | 0x80)
	return append(symbol, trailer...), nil
}
// Utf8encode splits str into code points with ucs2decode, encodes each one
// with encodeCodePoint, and returns the concatenated result as a string.
// If any code point cannot be encoded, it returns an empty string and that
// error.
//
// Implementation referenced from
// https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8encode(str string) (string, error) {
	var encoded []byte
	for _, cp := range ucs2decode(str) {
		symbol, err := encodeCodePoint(cp)
		if err != nil {
			return "", err
		}
		encoded = append(encoded, symbol...)
	}
	return string(encoded), nil
}
// readContinuationByte reads the unit at *pByteIndex from byteArray, advances
// *pByteIndex by one, and returns the unit's low six bits when its low eight
// bits form a UTF-8 continuation byte (10xxxxxx).
//
// It returns utf8.RuneError and an error when *pByteIndex is not below
// byteCount (the index is left unchanged) or when the unit is not a
// continuation byte (the index has already been advanced).
func readContinuationByte(byteArray []rune, byteCount int, pByteIndex *int) (rune, error) {
	pos := *pByteIndex
	if pos >= byteCount {
		return utf8.RuneError, fmt.Errorf("invalid byte index")
	}

	unit := byteArray[pos] & 0xFF
	*pByteIndex = pos + 1

	if unit&0xC0 != 0x80 {
		// The top two bits are not "10", so this is not a continuation byte.
		return utf8.RuneError, fmt.Errorf("invalid continuation byte")
	}
	return unit & 0x3F, nil
}
// decodeSymbol decodes one UTF-8 sequence from byteArray starting at
// *pByteIndex and advances *pByteIndex past every unit it consumes. Only the
// low eight bits of each unit are used.
//
// Returns:
//   - (codePoint, true, nil) when a sequence was decoded;
//   - (utf8.RuneError, false, nil) when *pByteIndex equals byteCount, i.e. the
//     input is exhausted;
//   - (utf8.RuneError, false, err) when the index is past byteCount, a
//     continuation byte is missing or malformed, the sequence is an overlong
//     two- or three-byte form, a three-byte sequence fails checkScalarValue,
//     or the lead byte / four-byte value is not valid UTF-8.
func decodeSymbol(byteArray []rune, byteCount int, pByteIndex *int) (rune, bool, error) {
	if *pByteIndex > byteCount {
		return utf8.RuneError, false, fmt.Errorf("invalid byte index")
	}
	if *pByteIndex == byteCount {
		return utf8.RuneError, false, nil
	}

	// Read the lead byte.
	lead := byteArray[*pByteIndex] & 0xFF
	*pByteIndex++

	// Classify the lead byte: payload bits it contributes and how many
	// continuation bytes must follow.
	var (
		codePoint rune
		trailing  int
	)
	switch {
	case lead&0x80 == 0: // 1-byte sequence (no continuation bytes)
		return lead, true, nil
	case lead&0xE0 == 0xC0: // 2-byte sequence
		codePoint, trailing = lead&0x1F, 1
	case lead&0xF0 == 0xE0: // 3-byte sequence (may include unpaired surrogates)
		codePoint, trailing = lead&0x0F, 2
	case lead&0xF8 == 0xF0: // 4-byte sequence
		codePoint, trailing = lead&0x07, 3
	default:
		return utf8.RuneError, false, fmt.Errorf("invalid UTF-8 detected")
	}

	// Fold in six payload bits per continuation byte, stopping at the first
	// one that is missing or malformed.
	for i := 0; i < trailing; i++ {
		next, err := readContinuationByte(byteArray, byteCount, pByteIndex)
		if err != nil {
			return utf8.RuneError, false, err
		}
		codePoint = codePoint<<6 | next
	}

	// Reject values that do not belong to the sequence length used.
	switch trailing {
	case 1:
		if codePoint < 0x80 {
			return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
		}
	case 2:
		if codePoint < 0x0800 {
			return utf8.RuneError, false, fmt.Errorf("invalid continuation byte")
		}
		if err := checkScalarValue(codePoint); err != nil {
			return utf8.RuneError, false, err
		}
	case 3:
		if codePoint < 0x010000 || codePoint > 0x10FFFF {
			return utf8.RuneError, false, fmt.Errorf("invalid UTF-8 detected")
		}
	}
	return codePoint, true, nil
}
// implementation referenced from https://github.com/mathiasbynens/utf8.js/blob/2ce09544b62f2a274dbcd249473c0986e3660849/utf8.js
func Utf8decode(str string) ([]byte, error) {
byteArray := ucs2decode(str)
byteCount := len(byteArray)
byteIndex := 0
var codePoints []rune
2023-01-13 17:12:46 +00:00
for {
codePoint, goOn, err := decodeSymbol(byteArray, byteCount, &byteIndex)
if err != nil {
return nil, err
}
if !goOn {
break
}
codePoints = append(codePoints, codePoint)
}
return ucs2encode(codePoints), nil
}