Add go-charset and chardet to vendor
This commit is contained in:
parent
2338c69d40
commit
a0938d9386
|
@ -0,0 +1,27 @@
|
|||
Copyright (c) 2014, Paul Rosania. All rights reserved.
|
||||
Portions Copyright (c) 2013, Roger Peppe. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the name of the copyright holder nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
||||
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -0,0 +1,65 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerClass("ascii", fromASCII, toASCII)
|
||||
}
|
||||
|
||||
// errorByte is the replacement byte written when a character cannot be
// represented in the target character set.
const errorByte = '?'

// translateFromASCII converts US-ASCII input to UTF-8. When the value is
// true the translator is strict and rejects bytes outside 1..127 with an
// error; when false such bytes are replaced by utf8.RuneError.
type translateFromASCII bool

// codePointError describes a byte that has no meaning in the charset being
// decoded.
type codePointError struct {
	i       int    // index of the offending byte within the data passed to Translate
	cp      rune   // value of the offending byte
	charset string // name of the charset being decoded
}

// Error implements the error interface. The index is printed in decimal and
// the offending value in U+XXXX notation.
func (e *codePointError) Error() string {
	return fmt.Sprintf("Parse error at index %d: Code point %U is undefined in %s", e.i, e.cp, e.charset)
}

// Translate implements Translator. Bytes in the range 1..127 are copied
// through unchanged. Any other byte (including NUL) either aborts the
// translation with a *codePointError (strict mode; nothing is reported as
// consumed) or is replaced by the UTF-8 encoding of utf8.RuneError.
// The eof flag is not used: every byte is translated independently.
func (strict translateFromASCII) Translate(data []byte, eof bool) (int, []byte, error) {
	buf := bytes.NewBuffer(make([]byte, 0, len(data)))
	for i, c := range data {
		if c > 0 && c < 128 {
			buf.WriteByte(c)
			continue
		}
		if strict {
			return 0, nil, &codePointError{i, rune(c), "US-ASCII"}
		}
		buf.WriteRune(utf8.RuneError)
	}
	return len(data), buf.Bytes(), nil
}
|
||||
|
||||
type translateToASCII bool
|
||||
|
||||
func (strict translateToASCII) Translate(data []byte, eof bool) (int, []byte, error) {
|
||||
buf := bytes.NewBuffer(make([]byte, 0, len(data)))
|
||||
for _, c := range data {
|
||||
if c > 0 && c < 128 {
|
||||
buf.WriteByte(c)
|
||||
} else {
|
||||
buf.WriteByte(errorByte)
|
||||
}
|
||||
}
|
||||
return len(data), buf.Bytes(), nil
|
||||
}
|
||||
|
||||
func fromASCII(arg string) (Translator, error) {
|
||||
return new(translateFromASCII), nil
|
||||
}
|
||||
|
||||
func toASCII(arg string) (Translator, error) {
|
||||
return new(translateToASCII), nil
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerClass("big5", fromBig5, nil)
|
||||
}
|
||||
|
||||
// Big5 consists of 89 fonts of 157 chars each
|
||||
const (
|
||||
big5Max = 13973
|
||||
big5Font = 157
|
||||
big5Data = "big5.dat"
|
||||
)
|
||||
|
||||
type translateFromBig5 struct {
|
||||
font int
|
||||
scratch []byte
|
||||
big5map []rune
|
||||
}
|
||||
|
||||
func (p *translateFromBig5) Translate(data []byte, eof bool) (int, []byte, error) {
|
||||
p.scratch = p.scratch[:0]
|
||||
n := 0
|
||||
for len(data) > 0 {
|
||||
c := int(data[0])
|
||||
data = data[1:]
|
||||
n++
|
||||
if p.font == -1 {
|
||||
// idle state
|
||||
if c >= 0xa1 {
|
||||
p.font = c
|
||||
continue
|
||||
}
|
||||
if c == 26 {
|
||||
c = '\n'
|
||||
}
|
||||
continue
|
||||
}
|
||||
f := p.font
|
||||
p.font = -1
|
||||
r := utf8.RuneError
|
||||
switch {
|
||||
case c >= 64 && c <= 126:
|
||||
c -= 64
|
||||
case c >= 161 && c <= 254:
|
||||
c = c - 161 + 63
|
||||
default:
|
||||
// bad big5 char
|
||||
f = 255
|
||||
}
|
||||
if f <= 254 {
|
||||
f -= 161
|
||||
ix := f*big5Font + c
|
||||
if ix < len(p.big5map) {
|
||||
r = p.big5map[ix]
|
||||
}
|
||||
if r == -1 {
|
||||
r = utf8.RuneError
|
||||
}
|
||||
}
|
||||
p.scratch = appendRune(p.scratch, r)
|
||||
}
|
||||
return n, p.scratch, nil
|
||||
}
|
||||
|
||||
type big5Key bool
|
||||
|
||||
func fromBig5(arg string) (Translator, error) {
|
||||
big5map, err := cache(big5Key(false), func() (interface{}, error) {
|
||||
data, err := readFile(big5Data)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("charset: cannot open big5 data file: %v", err)
|
||||
}
|
||||
big5map := []rune(string(data))
|
||||
if len(big5map) != big5Max {
|
||||
return nil, fmt.Errorf("charset: corrupt big5 data")
|
||||
}
|
||||
return big5map, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &translateFromBig5{big5map: big5map.([]rune), font: -1}, nil
|
||||
}
|
|
@ -0,0 +1,301 @@
|
|||
// The charset package implements translation between character sets.
|
||||
// It uses Unicode as the intermediate representation.
|
||||
// Because it can be large, the character set data is separated
|
||||
// from the charset package. It can be embedded in the Go
|
||||
// executable by importing the data package:
|
||||
//
|
||||
// import _ "github.com/paulrosania/go-charset/data"
|
||||
//
|
||||
// It can also be made available in a data directory (by setting CharsetDir).
|
||||
package charset
|
||||
|
||||
import (
	"io"
	"sort"
	"strings"
	"unicode/utf8"
)
|
||||
|
||||
// Charset holds information about a given character set.
type Charset struct {
	// Name is the canonical name of the character set.
	Name string
	// Aliases lists the known aliases.
	Aliases []string
	// Desc is a description of the character set.
	Desc string
	// NoFrom reports that it is not possible to translate from this charset.
	NoFrom bool
	// NoTo reports that it is not possible to translate to this charset.
	NoTo bool
}
|
||||
|
||||
// Translator represents a character set converter.
//
// Translate converts the given data and returns:
//   - n: the number of bytes of data that were consumed;
//   - cdata: the converted data, which may be overwritten by the next call
//     to Translate;
//   - err: any conversion error.
//
// If eof is true, data holds the final bytes of the input.
type Translator interface {
	Translate(data []byte, eof bool) (n int, cdata []byte, err error)
}
|
||||
|
||||
// A Factory can be used to make character set translators.
|
||||
type Factory interface {
|
||||
// TranslatorFrom creates a translator that will translate from the named character
|
||||
// set to UTF-8.
|
||||
TranslatorFrom(name string) (Translator, error) // Create a Translator from this character set to.
|
||||
|
||||
// TranslatorTo creates a translator that will translate from UTF-8 to the named character set.
|
||||
TranslatorTo(name string) (Translator, error) // Create a Translator To this character set.
|
||||
|
||||
// Names returns all the character set names accessibile through the factory.
|
||||
Names() []string
|
||||
|
||||
// Info returns information on the named character set. It returns nil if the
|
||||
// factory doesn't recognise the given name.
|
||||
Info(name string) *Charset
|
||||
}
|
||||
|
||||
var factories = []Factory{localFactory{}}
|
||||
|
||||
// Register registers a new Factory which will be consulted when NewReader
|
||||
// or NewWriter needs a character set translator for a given name.
|
||||
func Register(factory Factory) {
|
||||
factories = append(factories, factory)
|
||||
}
|
||||
|
||||
// NewReader returns a new Reader that translates from the named
|
||||
// character set to UTF-8 as it reads r.
|
||||
func NewReader(charset string, r io.Reader) (io.Reader, error) {
|
||||
tr, err := TranslatorFrom(charset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return NewTranslatingReader(r, tr), nil
|
||||
}
|
||||
|
||||
// NewWriter returns a new WriteCloser writing to w. It converts writes
|
||||
// of UTF-8 text into writes on w of text in the named character set.
|
||||
// The Close is necessary to flush any remaining partially translated
|
||||
// characters to the output.
|
||||
func NewWriter(charset string, w io.Writer) (io.WriteCloser, error) {
|
||||
tr, err := TranslatorTo(charset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return NewTranslatingWriter(w, tr), nil
|
||||
}
|
||||
|
||||
// Info returns information about a character set, or nil
|
||||
// if the character set is not found.
|
||||
func Info(name string) *Charset {
|
||||
for _, f := range factories {
|
||||
if info := f.Info(name); info != nil {
|
||||
return info
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Names returns the canonical names of all supported character sets, in alphabetical order.
|
||||
func Names() []string {
|
||||
// TODO eliminate duplicates
|
||||
var names []string
|
||||
for _, f := range factories {
|
||||
names = append(names, f.Names()...)
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
// TranslatorFrom returns a translator that will translate from
|
||||
// the named character set to UTF-8.
|
||||
func TranslatorFrom(charset string) (Translator, error) {
|
||||
var err error
|
||||
var tr Translator
|
||||
for _, f := range factories {
|
||||
tr, err = f.TranslatorFrom(charset)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
if tr == nil {
|
||||
return nil, err
|
||||
}
|
||||
return tr, nil
|
||||
}
|
||||
|
||||
// TranslatorTo returns a translator that will translate from UTF-8
|
||||
// to the named character set.
|
||||
func TranslatorTo(charset string) (Translator, error) {
|
||||
var err error
|
||||
var tr Translator
|
||||
for _, f := range factories {
|
||||
tr, err = f.TranslatorTo(charset)
|
||||
if err == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
if tr == nil {
|
||||
return nil, err
|
||||
}
|
||||
return tr, nil
|
||||
}
|
||||
|
||||
// normalizedChar maps 'A'..'Z' to 'a'..'z' and '_' to '-'; every other rune
// is returned unchanged.
func normalizedChar(c rune) rune {
	if 'A' <= c && c <= 'Z' {
		return c + ('a' - 'A')
	}
	if c == '_' {
		return '-'
	}
	return c
}

// NormalizedName returns s with all Roman capitals
// mapped to lower case, and '_' mapped to '-'
func NormalizedName(s string) string {
	return strings.Map(normalizedChar, s)
}
|
||||
|
||||
type translatingWriter struct {
|
||||
w io.Writer
|
||||
tr Translator
|
||||
buf []byte // unconsumed data from writer.
|
||||
}
|
||||
|
||||
// NewTranslatingWriter returns a new WriteCloser writing to w.
|
||||
// It passes the written bytes through the given Translator.
|
||||
func NewTranslatingWriter(w io.Writer, tr Translator) io.WriteCloser {
|
||||
return &translatingWriter{w: w, tr: tr}
|
||||
}
|
||||
|
||||
func (w *translatingWriter) Write(data []byte) (rn int, rerr error) {
|
||||
wdata := data
|
||||
if len(w.buf) > 0 {
|
||||
w.buf = append(w.buf, data...)
|
||||
wdata = w.buf
|
||||
}
|
||||
n, cdata, err := w.tr.Translate(wdata, false)
|
||||
if err != nil {
|
||||
// TODO
|
||||
}
|
||||
if n > 0 {
|
||||
_, err = w.w.Write(cdata)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
w.buf = w.buf[:0]
|
||||
if n < len(wdata) {
|
||||
w.buf = append(w.buf, wdata[n:]...)
|
||||
}
|
||||
return len(data), nil
|
||||
}
|
||||
|
||||
func (p *translatingWriter) Close() error {
|
||||
for {
|
||||
n, data, err := p.tr.Translate(p.buf, true)
|
||||
p.buf = p.buf[n:]
|
||||
if err != nil {
|
||||
// TODO
|
||||
}
|
||||
// If the Translator produces no data
|
||||
// at EOF, then assume that it never will.
|
||||
if len(data) == 0 {
|
||||
break
|
||||
}
|
||||
n, err = p.w.Write(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n < len(data) {
|
||||
return io.ErrShortWrite
|
||||
}
|
||||
if len(p.buf) == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
type translatingReader struct {
|
||||
r io.Reader
|
||||
tr Translator
|
||||
cdata []byte // unconsumed data from converter.
|
||||
rdata []byte // unconverted data from reader.
|
||||
err error // final error from reader.
|
||||
}
|
||||
|
||||
// NewTranslatingReader returns a new Reader that
|
||||
// translates data using the given Translator as it reads r.
|
||||
func NewTranslatingReader(r io.Reader, tr Translator) io.Reader {
|
||||
return &translatingReader{r: r, tr: tr}
|
||||
}
|
||||
|
||||
func (r *translatingReader) Read(buf []byte) (int, error) {
|
||||
for {
|
||||
if len(r.cdata) > 0 {
|
||||
n := copy(buf, r.cdata)
|
||||
r.cdata = r.cdata[n:]
|
||||
return n, nil
|
||||
}
|
||||
if r.err == nil {
|
||||
r.rdata = ensureCap(r.rdata, len(r.rdata)+len(buf))
|
||||
n, err := r.r.Read(r.rdata[len(r.rdata):cap(r.rdata)])
|
||||
// Guard against non-compliant Readers.
|
||||
if n == 0 && err == nil {
|
||||
err = io.EOF
|
||||
}
|
||||
r.rdata = r.rdata[0 : len(r.rdata)+n]
|
||||
r.err = err
|
||||
} else if len(r.rdata) == 0 {
|
||||
break
|
||||
}
|
||||
nc, cdata, cvterr := r.tr.Translate(r.rdata, r.err != nil)
|
||||
if cvterr != nil {
|
||||
// TODO
|
||||
}
|
||||
r.cdata = cdata
|
||||
|
||||
// Ensure that we consume all bytes at eof
|
||||
// if the converter refuses them.
|
||||
if nc == 0 && r.err != nil {
|
||||
nc = len(r.rdata)
|
||||
}
|
||||
|
||||
// Copy unconsumed data to the start of the rdata buffer.
|
||||
r.rdata = r.rdata[0:copy(r.rdata, r.rdata[nc:])]
|
||||
}
|
||||
return 0, r.err
|
||||
}
|
||||
|
||||
// ensureCap returns s with a capacity of at least n bytes.
// If cap(s) < n, then it returns a new copy of s with the
// required capacity. The length of the result always equals len(s).
//
// When s has no capacity at all the new capacity is exactly n; otherwise the
// existing capacity is doubled while below 1024 and grown by a quarter
// thereafter until it reaches n.
func ensureCap(s []byte, n int) []byte {
	have := cap(s)
	if have >= n {
		return s
	}
	// logic adapted from appendslice1 in runtime
	grown := n
	if have != 0 {
		grown = have
		for grown < n {
			if grown < 1024 {
				grown += grown
			} else {
				grown += grown / 4
			}
		}
	}
	bigger := make([]byte, len(s), grown)
	copy(bigger, s)
	return bigger
}
|
||||
|
||||
func appendRune(buf []byte, r rune) []byte {
|
||||
n := len(buf)
|
||||
buf = ensureCap(buf, n+utf8.UTFMax)
|
||||
nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)
|
||||
return buf[0 : n+nu]
|
||||
}
|
|
@ -0,0 +1,133 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerClass("cp", fromCodePage, toCodePage)
|
||||
}
|
||||
|
||||
type translateFromCodePage struct {
|
||||
byte2rune *[256]rune
|
||||
scratch []byte
|
||||
}
|
||||
|
||||
type cpKeyFrom string
|
||||
type cpKeyTo string
|
||||
|
||||
func (p *translateFromCodePage) Translate(data []byte, eof bool) (int, []byte, error) {
|
||||
p.scratch = ensureCap(p.scratch, len(data)*utf8.UTFMax)[:0]
|
||||
buf := p.scratch
|
||||
for _, x := range data {
|
||||
r := p.byte2rune[x]
|
||||
if r < utf8.RuneSelf {
|
||||
buf = append(buf, byte(r))
|
||||
continue
|
||||
}
|
||||
size := utf8.EncodeRune(buf[len(buf):cap(buf)], r)
|
||||
buf = buf[0 : len(buf)+size]
|
||||
}
|
||||
return len(data), buf, nil
|
||||
}
|
||||
|
||||
type toCodePageInfo struct {
|
||||
rune2byte map[rune]byte
|
||||
// same gives the number of runes at start of code page that map exactly to
|
||||
// unicode.
|
||||
same rune
|
||||
}
|
||||
|
||||
type translateToCodePage struct {
|
||||
toCodePageInfo
|
||||
scratch []byte
|
||||
}
|
||||
|
||||
func (p *translateToCodePage) Translate(data []byte, eof bool) (int, []byte, error) {
|
||||
p.scratch = ensureCap(p.scratch, len(data))
|
||||
buf := p.scratch[:0]
|
||||
|
||||
for i := 0; i < len(data); {
|
||||
r := rune(data[i])
|
||||
size := 1
|
||||
if r >= utf8.RuneSelf {
|
||||
r, size = utf8.DecodeRune(data[i:])
|
||||
if size == 1 && !eof && !utf8.FullRune(data[i:]) {
|
||||
return i, buf, nil
|
||||
}
|
||||
}
|
||||
|
||||
var b byte
|
||||
if r < p.same {
|
||||
b = byte(r)
|
||||
} else {
|
||||
var ok bool
|
||||
b, ok = p.rune2byte[r]
|
||||
if !ok {
|
||||
b = '?'
|
||||
}
|
||||
}
|
||||
buf = append(buf, b)
|
||||
i += size
|
||||
}
|
||||
return len(data), buf, nil
|
||||
}
|
||||
|
||||
func fromCodePage(arg string) (Translator, error) {
|
||||
runes, err := cache(cpKeyFrom(arg), func() (interface{}, error) {
|
||||
data, err := readFile(arg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
runes := []rune(string(data))
|
||||
if len(runes) != 256 {
|
||||
return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, len(runes))
|
||||
}
|
||||
r := new([256]rune)
|
||||
copy(r[:], runes)
|
||||
return r, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &translateFromCodePage{byte2rune: runes.(*[256]rune)}, nil
|
||||
}
|
||||
|
||||
func toCodePage(arg string) (Translator, error) {
|
||||
m, err := cache(cpKeyTo(arg), func() (interface{}, error) {
|
||||
data, err := readFile(arg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
info := toCodePageInfo{
|
||||
rune2byte: make(map[rune]byte),
|
||||
same: 256,
|
||||
}
|
||||
atStart := true
|
||||
i := rune(0)
|
||||
for _, r := range string(data) {
|
||||
if atStart {
|
||||
if r == i {
|
||||
i++
|
||||
continue
|
||||
}
|
||||
info.same = i
|
||||
atStart = false
|
||||
}
|
||||
info.rune2byte[r] = byte(i)
|
||||
i++
|
||||
}
|
||||
// TODO fix tables
|
||||
// fmt.Printf("%s, same = %d\n", arg, info.same)
|
||||
if i != 256 {
|
||||
return nil, fmt.Errorf("charset: %q has wrong rune count (%d)", arg, i)
|
||||
}
|
||||
return info, nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &translateToCodePage{toCodePageInfo: m.(toCodePageInfo)}, nil
|
||||
}
|
|
@ -0,0 +1,195 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerClass("cp932", fromCP932, nil)
|
||||
}
|
||||
|
||||
// encoding details
|
||||
// (Traditional) Shift-JIS
|
||||
//
|
||||
// 00..1f control characters
|
||||
// 20 space
|
||||
// 21..7f JIS X 0201:1976/1997 roman (see notes)
|
||||
// 80 undefined
|
||||
// 81..9f lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
|
||||
// a0 undefined
|
||||
// a1..df JIS X 0201:1976/1997 katakana
|
||||
// e0..ea lead byte of JIS X 0208-1983 or JIS X 0202:1990/1997
|
||||
// eb..ff undefined
|
||||
//
|
||||
// CP932 (windows-31J)
|
||||
//
|
||||
// this encoding scheme extends Shift-JIS in the following way
|
||||
//
|
||||
// eb..ec undefined (marked as lead bytes - see notes below)
|
||||
// ed..ee lead byte of NEC-selected IBM extended characters
|
||||
// ef undefined (marked as lead byte - see notes below)
|
||||
// f0..f9 lead byte of User defined GAIJI (see note below)
|
||||
// fa..fc lead byte of IBM extended characters
|
||||
// fd..ff undefined
|
||||
//
|
||||
//
|
||||
// Notes
|
||||
//
|
||||
// JISX 0201:1976/1997 roman
|
||||
// this is the same as ASCII but with 0x5c (ASCII code for '\')
|
||||
// representing the Yen currency symbol '¥' (U+00a5)
|
||||
// This mapping is contentious; some conversion packages implement it,
|
||||
// others do not.
|
||||
// The mapping files from The Unicode Consortium show cp932 mapping
|
||||
// plain ascii in the range 00..7f whereas shift-jis maps 0x5c ('\') to the yen
|
||||
// symbol (¥) and 0x7e ('~') to overline (¯)
|
||||
//
|
||||
// CP932 double-byte character codes:
|
||||
//
|
||||
// eb-ec, ef, f0-f9:
|
||||
// Marked as DBCS LEAD BYTEs in the unicode mapping data
|
||||
// obtained from:
|
||||
// https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT
|
||||
//
|
||||
// but there are no defined mappings for codes in this range.
|
||||
// It is not clear whether or not an implementation should
|
||||
// consume one or two bytes before emitting an error char.
|
||||
|
||||
const (
|
||||
kanaPages = 1
|
||||
kanaPageSize = 63
|
||||
kanaChar0 = 0xa1
|
||||
|
||||
cp932Pages = 45 // 81..84, 87..9f, e0..ea, ed..ee, fa..fc
|
||||
cp932PageSize = 189 // 40..fc (including 7f)
|
||||
cp932Char0 = 0x40
|
||||
)
|
||||
|
||||
type jisTables struct {
|
||||
page0 [256]rune
|
||||
dbcsoff [256]int
|
||||
cp932 []rune
|
||||
}
|
||||
|
||||
type translateFromCP932 struct {
|
||||
tables *jisTables
|
||||
scratch []byte
|
||||
}
|
||||
|
||||
func (p *translateFromCP932) Translate(data []byte, eof bool) (int, []byte, error) {
|
||||
tables := p.tables
|
||||
p.scratch = p.scratch[:0]
|
||||
n := 0
|
||||
for i := 0; i < len(data); i++ {
|
||||
b := data[i]
|
||||
r := tables.page0[b]
|
||||
if r != -1 {
|
||||
p.scratch = appendRune(p.scratch, r)
|
||||
n++
|
||||
continue
|
||||
}
|
||||
// DBCS
|
||||
i++
|
||||
if i >= len(data) {
|
||||
break
|
||||
}
|
||||
pnum := tables.dbcsoff[b]
|
||||
ix := int(data[i]) - cp932Char0
|
||||
if pnum == -1 || ix < 0 || ix >= cp932PageSize {
|
||||
r = utf8.RuneError
|
||||
} else {
|
||||
r = tables.cp932[pnum*cp932PageSize+ix]
|
||||
}
|
||||
p.scratch = appendRune(p.scratch, r)
|
||||
n += 2
|
||||
}
|
||||
return n, p.scratch, nil
|
||||
}
|
||||
|
||||
type cp932Key bool
|
||||
|
||||
func fromCP932(arg string) (Translator, error) {
|
||||
shiftJIS := arg == "shiftjis"
|
||||
tables, err := cache(cp932Key(shiftJIS), func() (interface{}, error) {
|
||||
tables := new(jisTables)
|
||||
kana, err := jisGetMap("jisx0201kana.dat", kanaPageSize, kanaPages)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tables.cp932, err = jisGetMap("cp932.dat", cp932PageSize, cp932Pages)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// jisx0201kana is mapped into 0xA1..0xDF
|
||||
for i := 0; i < kanaPageSize; i++ {
|
||||
tables.page0[i+kanaChar0] = kana[i]
|
||||
}
|
||||
|
||||
// 00..7f same as ascii in cp932
|
||||
for i := rune(0); i < 0x7f; i++ {
|
||||
tables.page0[i] = i
|
||||
}
|
||||
|
||||
if shiftJIS {
|
||||
// shift-jis uses JIS X 0201 for the ASCII range
|
||||
// this is the same as ASCII apart from
|
||||
// 0x5c ('\') maps to yen symbol (¥) and 0x7e ('~') maps to overline (¯)
|
||||
tables.page0['\\'] = '¥'
|
||||
tables.page0['~'] = '¯'
|
||||
}
|
||||
|
||||
// pre-calculate DBCS page numbers to mapping file page numbers
|
||||
// and mark codes in page0 that are DBCS lead bytes
|
||||
pnum := 0
|
||||
for i := 0x81; i <= 0x84; i++ {
|
||||
tables.page0[i] = -1
|
||||
tables.dbcsoff[i] = pnum
|
||||
pnum++
|
||||
}
|
||||
for i := 0x87; i <= 0x9f; i++ {
|
||||
tables.page0[i] = -1
|
||||
tables.dbcsoff[i] = pnum
|
||||
pnum++
|
||||
}
|
||||
for i := 0xe0; i <= 0xea; i++ {
|
||||
tables.page0[i] = -1
|
||||
tables.dbcsoff[i] = pnum
|
||||
pnum++
|
||||
}
|
||||
if shiftJIS {
|
||||
return tables, nil
|
||||
}
|
||||
// add in cp932 extensions
|
||||
for i := 0xed; i <= 0xee; i++ {
|
||||
tables.page0[i] = -1
|
||||
tables.dbcsoff[i] = pnum
|
||||
pnum++
|
||||
}
|
||||
for i := 0xfa; i <= 0xfc; i++ {
|
||||
tables.page0[i] = -1
|
||||
tables.dbcsoff[i] = pnum
|
||||
pnum++
|
||||
}
|
||||
return tables, nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &translateFromCP932{tables: tables.(*jisTables)}, nil
|
||||
}
|
||||
|
||||
func jisGetMap(name string, pgsize, npages int) ([]rune, error) {
|
||||
data, err := readFile(name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
m := []rune(string(data))
|
||||
if len(m) != pgsize*npages {
|
||||
return nil, fmt.Errorf("%q: incorrect length data", name)
|
||||
}
|
||||
return m, nil
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
|
||||
// files maps a data file name to the function that opens it. Entries are
// added by RegisterDataFile.
var files = make(map[string]func() (io.ReadCloser, error))

// RegisterDataFile registers the existence of a given data
// file with the given name that may be used by a character-set converter.
// It is intended to be used by packages that wish to embed
// data in the executable binary, and should not be
// used normally.
func RegisterDataFile(name string, open func() (io.ReadCloser, error)) {
	files[name] = open
}

// CharsetDir gives the location of the default data file directory.
// This directory will be used for files with names that have not
// been registered with RegisterDataFile.
var CharsetDir = "/usr/local/lib/go-charset/datafiles"

// readFile returns the full contents of the named data file. A file
// registered with RegisterDataFile is opened through its registered function;
// any other name is opened from CharsetDir. The source is always closed
// before readFile returns.
func readFile(name string) (data []byte, err error) {
	var r io.ReadCloser
	if open := files[name]; open != nil {
		r, err = open()
	} else {
		r, err = os.Open(filepath.Join(CharsetDir, name))
	}
	if err != nil {
		return nil, err
	}
	// Release the descriptor (or embedded reader) once the data is read.
	defer r.Close()
	return ioutil.ReadAll(r)
}
|
|
@ -0,0 +1,184 @@
|
|||
// The iconv package provides an interface to the GNU iconv character set
|
||||
// conversion library (see http://www.gnu.org/software/libiconv/).
|
||||
// It automatically registers all the character sets with the charset package,
|
||||
// so it is usually used simply for the side effects of importing it.
|
||||
// Example:
|
||||
// import (
|
||||
// "go-charset.googlecode.com/hg/charset"
|
||||
// _ "go-charset.googlecode.com/hg/charset/iconv"
|
||||
// )
|
||||
package iconv
|
||||
|
||||
//#cgo darwin LDFLAGS: -liconv
|
||||
//#include <stdlib.h>
|
||||
//#include <iconv.h>
|
||||
//#include <errno.h>
|
||||
//iconv_t iconv_open_error = (iconv_t)-1;
|
||||
//size_t iconv_error = (size_t)-1;
|
||||
import "C"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
"unicode/utf8"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
type iconvTranslator struct {
|
||||
cd C.iconv_t
|
||||
invalid rune
|
||||
scratch []byte
|
||||
}
|
||||
|
||||
// canonicalChar maps 'a'..'z' to 'A'..'Z'; every other rune is returned
// unchanged.
func canonicalChar(c rune) rune {
	if c < 'a' || c > 'z' {
		return c
	}
	return c - ('a' - 'A')
}

// canonicalName returns s with the ASCII lower-case letters converted to
// upper case.
func canonicalName(s string) string {
	return strings.Map(canonicalChar, s)
}
|
||||
|
||||
func init() {
|
||||
charset.Register(iconvFactory{})
|
||||
}
|
||||
|
||||
type iconvFactory struct {
|
||||
}
|
||||
|
||||
func (iconvFactory) TranslatorFrom(name string) (charset.Translator, error) {
|
||||
return Translator("UTF-8", name, utf8.RuneError)
|
||||
}
|
||||
|
||||
func (iconvFactory) TranslatorTo(name string) (charset.Translator, error) {
|
||||
// BUG This is wrong. The target character set may not be ASCII
|
||||
// compatible. There's no easy solution to this other than
|
||||
// removing the offending code point.
|
||||
return Translator(name, "UTF-8", '?')
|
||||
}
|
||||
|
||||
// Translator returns a Translator that translates between
|
||||
// the named character sets. When an invalid multibyte
|
||||
// character is found, the bytes in invalid are substituted instead.
|
||||
func Translator(toCharset, fromCharset string, invalid rune) (charset.Translator, error) {
|
||||
cto, cfrom := C.CString(toCharset), C.CString(fromCharset)
|
||||
cd, err := C.iconv_open(cto, cfrom)
|
||||
|
||||
C.free(unsafe.Pointer(cfrom))
|
||||
C.free(unsafe.Pointer(cto))
|
||||
|
||||
if cd == C.iconv_open_error {
|
||||
if err == syscall.EINVAL {
|
||||
return nil, errors.New("iconv: conversion not supported")
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
t := &iconvTranslator{cd: cd, invalid: invalid}
|
||||
runtime.SetFinalizer(t, func(*iconvTranslator) {
|
||||
C.iconv_close(cd)
|
||||
})
|
||||
return t, nil
|
||||
}
|
||||
|
||||
func (iconvFactory) Names() []string {
|
||||
all := aliases()
|
||||
names := make([]string, 0, len(all))
|
||||
for name, aliases := range all {
|
||||
if aliases[0] == name {
|
||||
names = append(names, name)
|
||||
}
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
func (iconvFactory) Info(name string) *charset.Charset {
|
||||
name = strings.ToLower(name)
|
||||
all := aliases()
|
||||
a, ok := all[name]
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
return &charset.Charset{
|
||||
Name: name,
|
||||
Aliases: a,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *iconvTranslator) Translate(data []byte, eof bool) (rn int, rd []byte, rerr error) {
|
||||
n := 0
|
||||
p.scratch = p.scratch[:0]
|
||||
for len(data) > 0 {
|
||||
p.scratch = ensureCap(p.scratch, len(p.scratch)+len(data)*utf8.UTFMax)
|
||||
cData := (*C.char)(unsafe.Pointer(&data[:1][0]))
|
||||
nData := C.size_t(len(data))
|
||||
|
||||
ns := len(p.scratch)
|
||||
cScratch := (*C.char)(unsafe.Pointer(&p.scratch[ns : ns+1][0]))
|
||||
nScratch := C.size_t(cap(p.scratch) - ns)
|
||||
r, err := C.iconv(p.cd, &cData, &nData, &cScratch, &nScratch)
|
||||
|
||||
p.scratch = p.scratch[0 : cap(p.scratch)-int(nScratch)]
|
||||
n += len(data) - int(nData)
|
||||
data = data[len(data)-int(nData):]
|
||||
|
||||
if r != C.iconv_error || err == nil {
|
||||
return n, p.scratch, nil
|
||||
}
|
||||
switch err := err.(syscall.Errno); err {
|
||||
case C.EILSEQ:
|
||||
// invalid multibyte sequence - skip one byte and continue
|
||||
p.scratch = appendRune(p.scratch, p.invalid)
|
||||
n++
|
||||
data = data[1:]
|
||||
case C.EINVAL:
|
||||
// incomplete multibyte sequence
|
||||
return n, p.scratch, nil
|
||||
case C.E2BIG:
|
||||
// output buffer not large enough; try again with larger buffer.
|
||||
p.scratch = ensureCap(p.scratch, cap(p.scratch)+utf8.UTFMax)
|
||||
default:
|
||||
panic(fmt.Sprintf("unexpected error code: %v", err))
|
||||
}
|
||||
}
|
||||
return n, p.scratch, nil
|
||||
}
|
||||
|
||||
// ensureCap returns s with a capacity of at least n bytes.
// If cap(s) < n, then it returns a new copy of s with the
// required capacity. The length of the result always equals len(s).
//
// When s has no capacity at all the new capacity is exactly n; otherwise the
// existing capacity is doubled while below 1024 and grown by a quarter
// thereafter until it reaches n.
func ensureCap(s []byte, n int) []byte {
	have := cap(s)
	if have >= n {
		return s
	}
	// logic adapted from appendslice1 in runtime
	grown := n
	if have != 0 {
		grown = have
		for grown < n {
			if grown < 1024 {
				grown += grown
			} else {
				grown += grown / 4
			}
		}
	}
	bigger := make([]byte, len(s), grown)
	copy(bigger, s)
	return bigger
}
|
||||
|
||||
func appendRune(buf []byte, r rune) []byte {
|
||||
n := len(buf)
|
||||
buf = ensureCap(buf, n+utf8.UTFMax)
|
||||
nu := utf8.EncodeRune(buf[n:n+utf8.UTFMax], r)
|
||||
return buf[0 : n+nu]
|
||||
}
|
80
vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go
generated
vendored
Normal file
80
vendor/github.com/paulrosania/go-charset/charset/iconv/list_query.go
generated
vendored
Normal file
|
@ -0,0 +1,80 @@
|
|||
// +build !linux
|
||||
// This file is system-dependent because not all versions
|
||||
// of iconv have the iconvlist function.
|
||||
|
||||
package iconv
|
||||
|
||||
//#cgo darwin LDFLAGS: -liconv
|
||||
//#cgo freebsd LDFLAGS: -liconv
|
||||
//#cgo windows LDFLAGS: -liconv
|
||||
//#include <stdlib.h>
|
||||
//#include <string.h>
|
||||
//#include <iconv.h>
|
||||
//#include <errno.h>
|
||||
//
|
||||
//typedef struct nameList nameList;
|
||||
//struct nameList {
|
||||
// int n;
|
||||
// char **names;
|
||||
// nameList *next;
|
||||
//};
|
||||
//
|
||||
//int
|
||||
//addNames(unsigned int n, const char *const *names, void *data) {
|
||||
// // we can't call back to Go because of the stack size issue,
|
||||
// // so copy all the names.
|
||||
// nameList *hd, *e;
|
||||
// int i;
|
||||
//
|
||||
// hd = data;
|
||||
// e = malloc(sizeof(nameList));
|
||||
// e->n = n;
|
||||
// e->names = malloc(sizeof(char*) * n);
|
||||
// for(i = 0; i < n; i++){
|
||||
// e->names[i] = strdup(names[i]);
|
||||
// }
|
||||
// e->next = hd->next;
|
||||
// hd->next = e;
|
||||
// return 0;
|
||||
//}
|
||||
//
|
||||
//nameList *
|
||||
//listNames(void) {
|
||||
// nameList hd;
|
||||
// hd.next = 0;
|
||||
// iconvlist(addNames, &hd);
|
||||
// return hd.next;
|
||||
//}
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"sync"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
var getAliasesOnce sync.Once
|
||||
var allAliases = map[string][]string{}
|
||||
|
||||
func aliases() map[string][]string {
|
||||
getAliasesOnce.Do(getAliases)
|
||||
return allAliases
|
||||
}
|
||||
|
||||
func getAliases() {
|
||||
var next *C.nameList
|
||||
for p := C.listNames(); p != nil; p = next {
|
||||
next = p.next
|
||||
aliases := make([]string, p.n)
|
||||
pnames := (*[1e9]*C.char)(unsafe.Pointer(p.names))
|
||||
for i := range aliases {
|
||||
aliases[i] = strings.ToLower(C.GoString(pnames[i]))
|
||||
C.free(unsafe.Pointer(pnames[i]))
|
||||
}
|
||||
C.free(unsafe.Pointer(p.names))
|
||||
C.free(unsafe.Pointer(p))
|
||||
for _, alias := range aliases {
|
||||
allAliases[alias] = aliases
|
||||
}
|
||||
}
|
||||
}
|
176
vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go
generated
vendored
Normal file
176
vendor/github.com/paulrosania/go-charset/charset/iconv/list_static.go
generated
vendored
Normal file
|
@ -0,0 +1,176 @@
|
|||
// +build linux
|
||||
|
||||
// We just use a list of names obtained from iconv on a platform
|
||||
// that allows iconvlist. We could invoke the iconv command,
|
||||
// but that might fail too, and it gives no information about aliases.
|
||||
|
||||
package iconv
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
func aliases() map[string][]string {
|
||||
initAliasesOnce.Do(initAliases)
|
||||
return allAliases
|
||||
}
|
||||
|
||||
var initAliasesOnce sync.Once
|
||||
var allAliases map[string][]string
|
||||
|
||||
func initAliases() {
|
||||
allAliases = make(map[string][]string)
|
||||
for _, a := range aliasData {
|
||||
for _, alias := range a {
|
||||
allAliases[alias] = a
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// aliasData lists groups of charset names; initAliases maps every name in a
// group to that whole group.
// NOTE(review): the group {" MS-Windows", "Japanese", "(cp932)"} looks like a
// fragment of a description rather than charset names, and several groups
// repeat a name (e.g. "iso-8859-6") — confirm against the original iconv
// listing before relying on those entries.
var aliasData = [][]string{
|
||||
{"437", "cp437", "ibm437", "cspc8codepage437"},
|
||||
{"850", "cp850", "ibm850", "cspc850multilingual"},
|
||||
{"852", "cp852", "ibm852", "cspcp852"},
|
||||
{"855", "cp855", "ibm855", "csibm855"},
|
||||
{"857", "cp857", "ibm857", "csibm857"},
|
||||
{"860", "cp860", "ibm860", "csibm860"},
|
||||
{"861", "cp-is", "cp861", "ibm861", "csibm861"},
|
||||
{"862", "cp862", "ibm862", "cspc862latinhebrew"},
|
||||
{"863", "cp863", "ibm863", "csibm863"},
|
||||
{"865", "cp865", "ibm865", "csibm865"},
|
||||
{"866", "cp866", "ibm866", "csibm866"},
|
||||
{"869", "cp-gr", "cp869", "ibm869", "csibm869"},
|
||||
{"ansi-x3.4-1968", "ansi-x3.4-1986", "ascii", "cp367", "ibm367", "iso-ir-6", "iso646-us", "iso-646.irv:1991", "us", "us-ascii", "csascii"},
|
||||
{"arabic", "asmo-708", "ecma-114", "iso-8859-6", "iso-ir-127", "iso8859-6", "iso-8859-6", "iso-8859-6:1987", "csisolatinarabic"},
|
||||
{"armscii-8"},
|
||||
{"atari", "atarist"},
|
||||
{"big5-2003"},
|
||||
{"big-5", "big-five", "big5", "bigfive", "cn-big5", "csbig5"},
|
||||
{"big5-hkscs:1999"},
|
||||
{"big5-hkscs:2001"},
|
||||
{"big5-hkscs", "big5-hkscs:2004", "big5hkscs"},
|
||||
{"c99"},
|
||||
{"chinese", "gb-2312-80", "iso-ir-58", "csiso58gb231280"},
|
||||
{"cn", "gb-1988-80", "iso-ir-57", "iso646-cn", "csiso57gb1988"},
|
||||
{"cn-gb", "euc-cn", "euccn", "gb2312", "csgb2312"},
|
||||
{"cn-gb-isoir165", "iso-ir-165"},
|
||||
{"cp1046"},
|
||||
{"cp1124"},
|
||||
{"cp1125"},
|
||||
{"cp1129"},
|
||||
{"cp1131"},
|
||||
{"cp1133", "ibm-cp1133"},
|
||||
{"cp1161", "ibm-1161", "ibm1161", "csibm1161"},
|
||||
{"cp1162", "ibm-1162", "ibm1162", "csibm1162"},
|
||||
{"cp1163", "ibm-1163", "ibm1163", "csibm1163"},
|
||||
{"cp1250", "ms-ee", "windows-1250"},
|
||||
{"cp1251", "ms-cyrl", "windows-1251"},
|
||||
{"cp1252", "ms-ansi", "windows-1252"},
|
||||
{"cp1253", "ms-greek", "windows-1253"},
|
||||
{"cp1254", "ms-turk", "windows-1254"},
|
||||
{"cp1255", "ms-hebr", "windows-1255"},
|
||||
{"cp1256", "ms-arab", "windows-1256"},
|
||||
{"cp1257", "winbaltrim", "windows-1257"},
|
||||
{"cp1258", "windows-1258"},
|
||||
{"cp1361", "johab"},
|
||||
{"cp154", "cyrillic-asian", "pt154", "ptcp154", "csptcp154"},
|
||||
{"cp737"},
|
||||
{"cp775", "ibm775", "cspc775baltic"},
|
||||
{"cp819", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso-8859-1", "iso-8859-1:1987", "l1", "latin1", "csisolatin1"},
|
||||
{"cp853"},
|
||||
{"cp856"},
|
||||
{"cp858"},
|
||||
{"cp864", "ibm864", "csibm864"},
|
||||
{"cp874", "windows-874"},
|
||||
{"cp922"},
|
||||
{"cp932"},
|
||||
{"cp936", "ms936", "windows-936"},
|
||||
{"cp943"},
|
||||
{"cp949", "uhc"},
|
||||
{"cp950"},
|
||||
{"cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso-8859-5", "iso-8859-5:1988", "csisolatincyrillic"},
|
||||
{"dec-hanyu"},
|
||||
{"dec-kanji"},
|
||||
{"ecma-118", "elot-928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso-8859-7", "iso-8859-7:1987", "iso-8859-7:2003", "csisolatingreek"},
|
||||
{"euc-jis-2004", "euc-jisx0213"},
|
||||
{"euc-jp", "eucjp", "extended-unix-code-packed-format-for-japanese", "cseucpkdfmtjapanese"},
|
||||
{"euc-kr", "euckr", "cseuckr"},
|
||||
{"euc-tw", "euctw", "cseuctw"},
|
||||
{"gb18030"},
|
||||
{"gbk"},
|
||||
{"georgian-academy"},
|
||||
{"georgian-ps"},
|
||||
{"hebrew", "iso-8859-8", "iso-ir-138", "iso8859-8", "iso-8859-8", "iso-8859-8:1988", "csisolatinhebrew"},
|
||||
{"hp-roman8", "r8", "roman8", "cshproman8"},
|
||||
{"hz", "hz-gb-2312"},
|
||||
{"iso-10646-ucs-2", "ucs-2", "csunicode"},
|
||||
{"iso-10646-ucs-4", "ucs-4", "csucs4"},
|
||||
{"iso-2022-cn", "csiso2022cn"},
|
||||
{"iso-2022-cn-ext"},
|
||||
{"iso-2022-jp-1"},
|
||||
{"iso-2022-jp-2004", "iso-2022-jp-3"},
|
||||
{"iso-2022-jp-2", "csiso2022jp2"},
|
||||
{"iso-2022-jp", "csiso2022jp"},
|
||||
{"iso-2022-kr", "csiso2022kr"},
|
||||
{"iso-8859-10", "iso-ir-157", "iso8859-10", "iso-8859-10", "iso-8859-10:1992", "l6", "latin6", "csisolatin6"},
|
||||
{"iso-8859-11", "iso8859-11", "iso-8859-11"},
|
||||
{"iso-8859-13", "iso-ir-179", "iso8859-13", "iso-8859-13", "l7", "latin7"},
|
||||
{"iso-8859-14", "iso-celtic", "iso-ir-199", "iso8859-14", "iso-8859-14", "iso-8859-14:1998", "l8", "latin8"},
|
||||
{"iso-8859-15", "iso-ir-203", "iso8859-15", "iso-8859-15", "iso-8859-15:1998", "latin-9"},
|
||||
{"iso-8859-16", "iso-ir-226", "iso8859-16", "iso-8859-16", "iso-8859-16:2001", "l10", "latin10"},
|
||||
{"iso-8859-2", "iso-ir-101", "iso8859-2", "iso-8859-2", "iso-8859-2:1987", "l2", "latin2", "csisolatin2"},
|
||||
{"iso-8859-3", "iso-ir-109", "iso8859-3", "iso-8859-3", "iso-8859-3:1988", "l3", "latin3", "csisolatin3"},
|
||||
{"iso-8859-4", "iso-ir-110", "iso8859-4", "iso-8859-4", "iso-8859-4:1988", "l4", "latin4", "csisolatin4"},
|
||||
{"iso-8859-9", "iso-ir-148", "iso8859-9", "iso-8859-9", "iso-8859-9:1989", "l5", "latin5", "csisolatin5"},
|
||||
{"iso-ir-149", "korean", "ksc-5601", "ks-c-5601-1987", "ks-c-5601-1989", "csksc56011987"},
|
||||
{"iso-ir-14", "iso646-jp", "jis-c6220-1969-ro", "jp", "csiso14jisc6220ro"},
|
||||
{"iso-ir-159", "jis-x0212", "jis-x0212-1990", "jis-x0212.1990-0", "x0212", "csiso159jisx02121990"},
|
||||
{"iso-ir-166", "tis-620", "tis620", "tis620-0", "tis620.2529-1", "tis620.2533-0", "tis620.2533-1"},
|
||||
{"iso-ir-230", "tds565"},
|
||||
{"iso-ir-87", "jis0208", "jis-c6226-1983", "jis-x0208", "jis-x0208-1983", "jis-x0208-1990", "x0208", "csiso87jisx0208"},
|
||||
{"java"},
|
||||
{"jisx0201-1976", "jis-x0201", "x0201", "cshalfwidthkatakana"},
|
||||
{"koi8-r", "cskoi8r"},
|
||||
{"koi8-ru"},
|
||||
{"koi8-t"},
|
||||
{"koi8-u"},
|
||||
{"kz-1048", "rk1048", "strk1048-2002", "cskz1048"},
|
||||
{"macarabic"},
|
||||
{"maccentraleurope"},
|
||||
{"maccroatian"},
|
||||
{"maccyrillic"},
|
||||
{"macgreek"},
|
||||
{"machebrew"},
|
||||
{"maciceland"},
|
||||
{"mac", "macintosh", "macroman", "csmacintosh"},
|
||||
{"macromania"},
|
||||
{"macthai"},
|
||||
{"macturkish"},
|
||||
{"macukraine"},
|
||||
{"ms-kanji", "shift-jis", "shift-jis", "sjis", "csshiftjis"},
|
||||
{" MS-Windows", "Japanese", "(cp932)"},
|
||||
{"mulelao-1"},
|
||||
{"nextstep"},
|
||||
{"riscos-latin1"},
|
||||
{"shift-jis-2004", "shift-jisx0213"},
|
||||
{"tcvn", "tcvn-5712", "tcvn5712-1", "tcvn5712-1:1993"},
|
||||
{"ucs-2be", "unicode-1-1", "unicodebig", "csunicode11"},
|
||||
{"ucs-2-internal"},
|
||||
{"ucs-2le", "unicodelittle"},
|
||||
{"ucs-2-swapped"},
|
||||
{"ucs-4be"},
|
||||
{"ucs-4-internal"},
|
||||
{"ucs-4le"},
|
||||
{"ucs-4-swapped"},
|
||||
{"unicode-1-1-utf-7", "utf-7", "csunicode11utf7"},
|
||||
{"utf-16"},
|
||||
{"utf-16be"},
|
||||
{"utf-16le"},
|
||||
{"utf-32"},
|
||||
{"utf-32be"},
|
||||
{"utf-32le"},
|
||||
{"utf-8"},
|
||||
{"utf-8-mac", "utf8-mac"},
|
||||
{"viscii", "viscii1.1-1", "csviscii"},
|
||||
{"windows-31j", "cp932"},
|
||||
}
|
|
@ -0,0 +1,162 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var (
|
||||
readLocalCharsetsOnce sync.Once
|
||||
localCharsets = make(map[string]*localCharset)
|
||||
)
|
||||
|
||||
type localCharset struct {
|
||||
Charset
|
||||
arg string
|
||||
*class
|
||||
}
|
||||
|
||||
// A class of character sets.
|
||||
// Each class can be instantiated with an argument specified in the config file.
|
||||
// Many character sets can use a single class.
|
||||
type class struct {
|
||||
from, to func(arg string) (Translator, error)
|
||||
}
|
||||
|
||||
// The set of classes, indexed by class name.
|
||||
var classes = make(map[string]*class)
|
||||
|
||||
func registerClass(charset string, from, to func(arg string) (Translator, error)) {
|
||||
classes[charset] = &class{from, to}
|
||||
}
|
||||
|
||||
type localFactory struct{}
|
||||
|
||||
func (f localFactory) TranslatorFrom(name string) (Translator, error) {
|
||||
f.init()
|
||||
name = NormalizedName(name)
|
||||
cs := localCharsets[name]
|
||||
if cs == nil {
|
||||
return nil, fmt.Errorf("character set %q not found", name)
|
||||
}
|
||||
if cs.from == nil {
|
||||
return nil, fmt.Errorf("cannot translate from %q", name)
|
||||
}
|
||||
return cs.from(cs.arg)
|
||||
}
|
||||
|
||||
func (f localFactory) TranslatorTo(name string) (Translator, error) {
|
||||
f.init()
|
||||
name = NormalizedName(name)
|
||||
cs := localCharsets[name]
|
||||
if cs == nil {
|
||||
return nil, fmt.Errorf("character set %q not found", name)
|
||||
}
|
||||
if cs.to == nil {
|
||||
return nil, fmt.Errorf("cannot translate to %q", name)
|
||||
}
|
||||
return cs.to(cs.arg)
|
||||
}
|
||||
|
||||
func (f localFactory) Names() []string {
|
||||
f.init()
|
||||
var names []string
|
||||
for name, cs := range localCharsets {
|
||||
// add names only for non-aliases.
|
||||
if localCharsets[cs.Name] == cs {
|
||||
names = append(names, name)
|
||||
}
|
||||
}
|
||||
return names
|
||||
}
|
||||
|
||||
func (f localFactory) Info(name string) *Charset {
|
||||
f.init()
|
||||
lcs := localCharsets[NormalizedName(name)]
|
||||
if lcs == nil {
|
||||
return nil
|
||||
}
|
||||
// copy the charset info so that callers can't mess with it.
|
||||
cs := lcs.Charset
|
||||
return &cs
|
||||
}
|
||||
|
||||
func (f localFactory) init() {
|
||||
readLocalCharsetsOnce.Do(readLocalCharsets)
|
||||
}
|
||||
|
||||
// charsetEntry is the data structure for one entry in the JSON config file.
// The entry's key in the file is the character set's name.
type charsetEntry struct {
	// Aliases lists alternative names for the character set.
	Aliases []string
	// Desc is a human-readable description.
	Desc string
	// Class is the name of an entry in classes.
	Class string
	// Arg is the argument used to instantiate the class.
	Arg string
}
|
||||
|
||||
// readCharsets reads the JSON config file.
|
||||
// It's done once only, when first needed.
|
||||
func readLocalCharsets() {
|
||||
csdata, err := readFile("charsets.json")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "charset: cannot open \"charsets.json\": %v\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
var entries map[string]charsetEntry
|
||||
err = json.Unmarshal(csdata, &entries)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "charset: cannot decode config file: %v\n", err)
|
||||
}
|
||||
for name, e := range entries {
|
||||
class := classes[e.Class]
|
||||
if class == nil {
|
||||
continue
|
||||
}
|
||||
name = NormalizedName(name)
|
||||
for i, a := range e.Aliases {
|
||||
e.Aliases[i] = NormalizedName(a)
|
||||
}
|
||||
cs := &localCharset{
|
||||
Charset: Charset{
|
||||
Name: name,
|
||||
Aliases: e.Aliases,
|
||||
Desc: e.Desc,
|
||||
NoFrom: class.from == nil,
|
||||
NoTo: class.to == nil,
|
||||
},
|
||||
arg: e.Arg,
|
||||
class: class,
|
||||
}
|
||||
localCharsets[cs.Name] = cs
|
||||
for _, a := range cs.Aliases {
|
||||
localCharsets[a] = cs
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// A general cache store that local character set translators
// can use for persistent storage of data.
var (
	// cacheMutex serializes all access to cacheStore.
	cacheMutex sync.Mutex
	// cacheStore holds the values produced so far, by key.
	cacheStore = make(map[interface{}]interface{})
)

// cache returns the value stored under key, calling f to produce and store
// it when no non-nil value is present. Errors from f are returned and
// nothing is stored. The lock is held while f runs.
func cache(key interface{}, f func() (interface{}, error)) (interface{}, error) {
	cacheMutex.Lock()
	defer cacheMutex.Unlock()

	if hit := cacheStore[key]; hit != nil {
		return hit, nil
	}
	val, err := f()
	if err != nil {
		return nil, err
	}
	cacheStore[key] = val
	return val, nil
}
|
|
@ -0,0 +1,110 @@
|
|||
package charset
|
||||
|
||||
import (
	"encoding/binary"
	"errors"
	"unicode/utf16"
	"unicode/utf8"
)
|
||||
|
||||
func init() {
|
||||
registerClass("utf16", fromUTF16, toUTF16)
|
||||
}
|
||||
|
||||
// translateFromUTF16 converts UTF-16 input to UTF-8.
type translateFromUTF16 struct {
	// first is true until the byte order has been determined from the
	// start of the input.
	first bool
	// endian is the byte order of the input; nil means "not yet known".
	endian binary.ByteOrder
	// scratch is the output buffer, reused between calls.
	scratch []byte
}

// Translate converts the UTF-16 code units in data to UTF-8. It returns the
// number of input bytes consumed and the converted bytes, which are only
// valid until the next call.
//
// When the byte order is unknown, a leading byte order mark selects and is
// skipped; otherwise guessEndian decides. A trailing odd byte is never
// consumed. Surrogate pairs are combined into a single code point; when data
// ends with a high surrogate and eof is false, that unit is left unconsumed
// so it can be paired with the next chunk. Unpaired surrogates are converted
// to U+FFFD.
func (p *translateFromUTF16) Translate(data []byte, eof bool) (int, []byte, error) {
	data = data[:len(data)&^1] // round down to an even number of bytes.
	if len(data) < 2 {
		return 0, nil, nil
	}
	n := 0
	if p.first && p.endian == nil {
		switch binary.BigEndian.Uint16(data) {
		case 0xfeff:
			p.endian = binary.BigEndian
			data = data[2:]
			n += 2
		case 0xfffe:
			p.endian = binary.LittleEndian
			data = data[2:]
			n += 2
		default:
			p.endian = guessEndian(data)
		}
		p.first = false
	}

	p.scratch = p.scratch[:0]
	var enc [utf8.UTFMax]byte
	for len(data) > 0 {
		r := rune(p.endian.Uint16(data))
		size := 2
		if utf16.IsSurrogate(r) {
			switch {
			case len(data) >= 4:
				// DecodeRune yields U+FFFD for an invalid pair; in that
				// case only the first unit is consumed, as a lone surrogate.
				if full := utf16.DecodeRune(r, rune(p.endian.Uint16(data[2:]))); full != utf8.RuneError {
					r, size = full, 4
				}
			case !eof && r < 0xdc00:
				// High surrogate at the end of the chunk: wait for its
				// partner in the next call.
				return n, p.scratch, nil
			}
		}
		// EncodeRune writes U+FFFD for a surrogate that was left unpaired.
		w := utf8.EncodeRune(enc[:], r)
		p.scratch = append(p.scratch, enc[:w]...)
		data = data[size:]
		n += size
	}
	return n, p.scratch, nil
}

// guessEndian chooses a byte order for input without a byte order mark.
// No detection is implemented yet; it always answers little-endian.
func guessEndian(data []byte) binary.ByteOrder {
	// XXX TODO
	return binary.LittleEndian
}
|
||||
|
||||
// translateToUTF16 converts UTF-8 input to UTF-16.
type translateToUTF16 struct {
	// first requests a byte order mark at the start of the output.
	first bool
	// endian is the byte order of the output; nil selects big-endian.
	endian binary.ByteOrder
	// scratch is the output buffer, reused between calls.
	scratch []byte
}

// Translate converts the UTF-8 text in data to UTF-16. It returns the number
// of input bytes consumed and the converted bytes, which are only valid
// until the next call.
//
// An incomplete rune at the end of data is left unconsumed unless eof is
// true. Code points above U+FFFF are written as surrogate pairs, and invalid
// UTF-8 is written as U+FFFD.
func (p *translateToUTF16) Translate(data []byte, eof bool) (int, []byte, error) {
	if p.endian == nil {
		// Calling a method on a nil ByteOrder would panic; fall back to
		// big-endian when no byte order was configured.
		p.endian = binary.BigEndian
	}
	// Every input byte produces at most two output bytes (a four-byte rune
	// becomes a four-byte surrogate pair), plus two bytes for the BOM.
	if need := (len(data) + 1) * 2; cap(p.scratch) < need {
		p.scratch = make([]byte, 0, need)
	}
	p.scratch = p.scratch[:0]
	put := func(u uint16) {
		end := len(p.scratch)
		p.scratch = p.scratch[:end+2]
		p.endian.PutUint16(p.scratch[end:], u)
	}
	if p.first {
		put(0xfeff)
		p.first = false
	}
	n := 0
	for len(data) > 0 {
		if !utf8.FullRune(data) && !eof {
			break
		}
		r, size := utf8.DecodeRune(data)
		if r > 0xffff {
			hi, lo := utf16.EncodeRune(r)
			put(uint16(hi))
			put(uint16(lo))
		} else {
			put(uint16(r))
		}
		data = data[size:]
		n += size
	}
	return n, p.scratch, nil
}
|
||||
|
||||
// getEndian interprets a utf16 class argument: "le" and "be" select the
// corresponding byte order, and the empty string yields a nil byte order
// with no error. Any other argument is an error.
func getEndian(arg string) (binary.ByteOrder, error) {
	if arg == "" {
		return nil, nil
	}
	if arg == "le" {
		return binary.LittleEndian, nil
	}
	if arg == "be" {
		return binary.BigEndian, nil
	}
	return nil, errors.New("charset: unknown utf16 endianness")
}
|
||||
|
||||
func fromUTF16(arg string) (Translator, error) {
|
||||
endian, err := getEndian(arg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &translateFromUTF16{first: true, endian: endian}, nil
|
||||
}
|
||||
|
||||
func toUTF16(arg string) (Translator, error) {
|
||||
endian, err := getEndian(arg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &translateToUTF16{first: false, endian: endian}, nil
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package charset
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func init() {
|
||||
registerClass("utf8", toUTF8, toUTF8)
|
||||
}
|
||||
|
||||
// translateToUTF8 copies UTF-8 input to its output, replacing invalid bytes
// with the Unicode replacement character.
type translateToUTF8 struct {
	// scratch is the output buffer, reused between calls.
	scratch []byte
}

// errorBytes is the UTF-8 encoding of utf8.RuneError.
var errorBytes = []byte(string(utf8.RuneError))

// errorRuneLen is the length of errorBytes, the most output a single input
// byte can produce.
const errorRuneLen = len(string(utf8.RuneError))

// Translate copies data, substituting U+FFFD for every byte that is not part
// of a valid UTF-8 sequence. It returns the number of input bytes consumed
// and the resulting bytes, which are only valid until the next call. When
// eof is false and data ends in the middle of a rune, the incomplete bytes
// are left unconsumed so they can be completed by the next call.
func (p *translateToUTF8) Translate(data []byte, eof bool) (int, []byte, error) {
	// Reserve the worst case up front so the appends below never reallocate.
	if need := len(data) * errorRuneLen; cap(p.scratch) < need {
		p.scratch = make([]byte, 0, need)
	}
	buf := p.scratch[:0]
	for i := 0; i < len(data); {
		// fast path for ASCII
		if b := data[i]; b < utf8.RuneSelf {
			buf = append(buf, b)
			i++
			continue
		}
		_, size := utf8.DecodeRune(data[i:])
		if size == 1 {
			// A one-byte decode of a non-ASCII byte is always an error.
			// The remaining bytes, not the start of the buffer, decide
			// whether this is merely a rune cut off by the chunk boundary.
			if !eof && !utf8.FullRune(data[i:]) {
				return i, buf, nil
			}
			buf = append(buf, errorBytes...)
		} else {
			buf = append(buf, data[i:i+size]...)
		}
		i += size
	}
	return len(data), buf, nil
}
|
||||
|
||||
func toUTF8(arg string) (Translator, error) {
|
||||
return new(translateToUTF8), nil
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"flag"
|
||||
"fmt"
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
_ "github.com/paulrosania/go-charset/charset/iconv"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var listFlag = flag.Bool("l", false, "list available character sets")
|
||||
var verboseFlag = flag.Bool("v", false, "list more information")
|
||||
var fromCharset = flag.String("f", "utf-8", "translate from this character set")
|
||||
var toCharset = flag.String("t", "utf-8", "translate to this character set")
|
||||
|
||||
func main() {
|
||||
flag.Usage = func() {
|
||||
fmt.Fprintf(os.Stderr, "usage: tcs [-l] [-v] [charset]\n")
|
||||
fmt.Fprintf(os.Stderr, "\ttcs [-f charset] [-t charset] [file]\n")
|
||||
}
|
||||
flag.Parse()
|
||||
if *listFlag {
|
||||
cs := ""
|
||||
switch flag.NArg() {
|
||||
case 1:
|
||||
cs = flag.Arg(0)
|
||||
case 0:
|
||||
default:
|
||||
flag.Usage()
|
||||
}
|
||||
listCharsets(*verboseFlag, cs)
|
||||
return
|
||||
}
|
||||
var f *os.File
|
||||
switch flag.NArg() {
|
||||
case 0:
|
||||
f = os.Stdin
|
||||
case 1:
|
||||
var err error
|
||||
f, err = os.Open(flag.Arg(0))
|
||||
if err != nil {
|
||||
fatalf("cannot open %q: %v", err)
|
||||
}
|
||||
}
|
||||
r, err := charset.NewReader(*fromCharset, f)
|
||||
if err != nil {
|
||||
fatalf("cannot translate from %q: %v", *fromCharset, err)
|
||||
}
|
||||
w, err := charset.NewWriter(*toCharset, os.Stdout)
|
||||
if err != nil {
|
||||
fatalf("cannot translate to %q: ", err)
|
||||
}
|
||||
_, err = io.Copy(w, r)
|
||||
if err != nil {
|
||||
fatalf("%v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func listCharsets(verbose bool, csname string) {
|
||||
var buf bytes.Buffer
|
||||
if !verbose {
|
||||
if csname != "" {
|
||||
cs := charset.Info(csname)
|
||||
if cs == nil {
|
||||
fatalf("no such charset %q", csname)
|
||||
}
|
||||
fmt.Fprintf(&buf, "%s %s\n", cs.Name, strings.Join(cs.Aliases, " "))
|
||||
} else {
|
||||
fmt.Fprintf(&buf, "%v\n", strings.Join(charset.Names(), " "))
|
||||
}
|
||||
} else {
|
||||
var charsets []*charset.Charset
|
||||
if csname != "" {
|
||||
cs := charset.Info(csname)
|
||||
if cs == nil {
|
||||
fatalf("no such charset %q", csname)
|
||||
}
|
||||
charsets = []*charset.Charset{cs}
|
||||
} else {
|
||||
for _, name := range charset.Names() {
|
||||
if cs := charset.Info(name); cs != nil {
|
||||
charsets = append(charsets, cs)
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, cs := range charsets {
|
||||
fmt.Fprintf(&buf, "%s %s\n", cs.Name, strings.Join(cs.Aliases, " "))
|
||||
if cs.Desc != "" {
|
||||
fmt.Fprintf(&buf, "\t%s\n", cs.Desc)
|
||||
}
|
||||
}
|
||||
}
|
||||
os.Stdout.Write(buf.Bytes())
|
||||
}
|
||||
|
||||
// fatalf prints the formatted message followed by a newline on standard
// error and terminates the program with exit status 2.
func fatalf(f string, a ...interface{}) {
	fmt.Fprintf(os.Stderr, "%s\n", fmt.Sprintf(f, a...))
	os.Exit(2)
}
|
File diff suppressed because one or more lines are too long
18
vendor/github.com/paulrosania/go-charset/data/data_charsets.json.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_charsets.json.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("charsets.json", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("{\n\"8bit\": {\n\t\"Desc\": \"raw 8-bit data\",\n\t\"Class\": \"8bit\",\n\t\"Comment\": \"special class for raw 8bit data that has been converted to utf-8\"\n},\n\"big5\": {\n\t\"Desc\": \"Big 5 (HKU)\",\n\t\"Class\": \"big5\",\n\t\"Comment\": \"Traditional Chinese\"\n},\n\"euc-jp\": {\n\t\"Aliases\":[\"x-euc-jp\"],\n\t\"Desc\": \"Japanese Extended UNIX Code\",\n\t\"Class\": \"euc-jp\"\n},\n\"gb2312\": {\n\t\"Aliases\":[\"iso-ir-58\", \"chinese\", \"gb_2312-80\"],\n\t\"Desc\": \"Chinese mixed one byte\",\n\t\"Class\": \"gb2312\"\n},\n\"ibm437\": {\n\t\"Aliases\":[\"437\", \"cp437\"],\n\t\"Desc\": \"IBM PC: CP 437\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"ibm437.cp\",\n\t\"Comment\": \"originally from jhelling@cs.ruu.nl (Jeroen Hellingman)\"\n},\n\"ibm850\": {\n\t\"Aliases\":[\"850\", \"cp850\"],\n\t\"Desc\": \"IBM PS/2: CP 850\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"ibm850.cp\",\n\t\"Comment\": \"originally from jhelling@cs.ruu.nl (Jeroen Hellingman)\"\n},\n\"ibm866\": {\n\t\"Aliases\":[\"cp866\", \"866\"],\n\t\"Desc\": \"Russian MS-DOS CP 866\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"ibm866.cp\"\n},\n\"iso-8859-1\": {\n\t\"Aliases\":[\"iso-ir-100\", \"ibm819\", \"l1\", \"iso8859-1\", \"iso-latin-1\", \"iso_8859-1:1987\", \"cp819\", \"iso_8859-1\", \"iso8859_1\", \"latin1\"],\n\t\"Desc\": \"Latin-1\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-1.cp\"\n},\n\"iso-8859-10\": {\n\t\"Aliases\":[\"iso_8859-10:1992\", \"l6\", \"iso-ir-157\", \"latin6\"],\n\t\"Desc\": \"Latin-6\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-10.cp\",\n\t\"Comment\": \"originally from dkuug.dk:i18n/charmaps/ISO_8859-10:1993\"\n},\n\"iso-8859-15\": {\n\t\"Aliases\":[\"l9-iso-8859-15\", \"latin9\"],\n\t\"Desc\": \"Latin-9\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-15.cp\"\n},\n\"iso-8859-2\": {\n\t\"Aliases\":[\"iso-ir-101\", \"iso_8859-2:1987\", \"l2\", \"iso_8859-2\", \"latin2\"],\n\t\"Desc\": \"Latin-2\",\n\t\"Class\": \"cp\",\n\t\"Arg\": 
\"iso-8859-2.cp\"\n},\n\"iso-8859-3\": {\n\t\"Aliases\":[\"iso-ir-109\", \"l3\", \"iso_8859-3:1988\", \"iso_8859-3\", \"latin3\"],\n\t\"Desc\": \"Latin-3\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-3.cp\"\n},\n\"iso-8859-4\": {\n\t\"Aliases\":[\"iso-ir-110\", \"iso_8859-4:1988\", \"l4\", \"iso_8859-4\", \"latin4\"],\n\t\"Desc\": \"Latin-4\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-4.cp\"\n},\n\"iso-8859-5\": {\n\t\"Aliases\":[\"cyrillic\", \"iso_8859-5\", \"iso-ir-144\", \"iso_8859-5:1988\"],\n\t\"Desc\": \"Part 5 (Cyrillic)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-5.cp\"\n},\n\"iso-8859-6\": {\n\t\"Aliases\":[\"ecma-114\", \"iso_8859-6:1987\", \"arabic\", \"iso_8859-6\", \"asmo-708\", \"iso-ir-127\"],\n\t\"Desc\": \"Part 6 (Arabic)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-6.cp\"\n},\n\"iso-8859-7\": {\n\t\"Aliases\":[\"greek8\", \"elot_928\", \"ecma-118\", \"greek\", \"iso_8859-7\", \"iso_8859-7:1987\", \"iso-ir-126\"],\n\t\"Desc\": \"Part 7 (Greek)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-7.cp\"\n},\n\"iso-8859-8\": {\n\t\"Aliases\":[\"iso_8859-8:1988\", \"hebrew\", \"iso_8859-8\", \"iso-ir-138\"],\n\t\"Desc\": \"Part 8 (Hebrew)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-8.cp\"\n},\n\"iso-8859-9\": {\n\t\"Aliases\":[\"l5\", \"iso_8859-9:1989\", \"iso_8859-9\", \"iso-ir-148\", \"latin5\"],\n\t\"Desc\": \"Latin-5\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"iso-8859-9.cp\"\n},\n\"koi8-r\": {\n\t\"Desc\": \"KOI8-R (RFC1489)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"koi8-r.cp\"\n},\n\"shift_jis\": {\n\t\"Aliases\":[\"sjis\", \"ms_kanji\", \"x-sjis\"],\n\t\"Desc\": \"Shift-JIS Japanese\",\n\t\"Class\": \"cp932\",\n\t\"Arg\": \"shiftjis\"\n},\n\"us-ascii\": {\n\t\"Aliases\":[\"ascii\"],\n\t\"Desc\": \"US-ASCII (RFC20)\",\n\t\"Class\": \"ascii\"\n},\n\"utf-16\": {\n\t\"Aliases\":[\"utf16\"],\n\t\"Desc\": \"Unicode UTF-16\",\n\t\"Class\": \"utf16\"\n},\n\"utf-16be\": {\n\t\"Aliases\":[\"utf16be\"],\n\t\"Desc\": \"Unicode UTF-16 big 
endian\",\n\t\"Class\": \"utf16\",\n\t\"Arg\": \"be\"\n},\n\"utf-16le\": {\n\t\"Aliases\":[\"utf16le\"],\n\t\"Desc\": \"Unicode UTF-16 little endian\",\n\t\"Class\": \"utf16\",\n\t\"Arg\": \"le\"\n},\n\"utf-8\": {\n\t\"Aliases\":[\"utf8\"],\n\t\"Desc\": \"Unicode UTF-8\",\n\t\"Class\": \"utf8\"\n},\n\"windows-1250\": {\n\t\"Desc\": \"MS Windows CP 1250 (Central Europe)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"windows-1250.cp\"\n},\n\"windows-1251\": {\n\t\"Desc\": \"MS Windows CP 1251 (Cyrillic)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"windows-1251.cp\"\n},\n\"windows-1252\": {\n\t\"Desc\": \"MS Windows CP 1252 (Latin 1)\",\n\t\"Class\": \"cp\",\n\t\"Arg\": \"windows-1252.cp\"\n},\n\"windows-31j\": {\n\t\"Aliases\":[\"cp932\"],\n\t\"Desc\": \"MS-Windows Japanese (cp932)\",\n\t\"Class\": \"cp932\",\n\t\"Arg\": \"cp932\"\n}\n}\n")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("ibm437.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007fÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜ¢£¥₧ƒáíóúñѪº¿⌐¬½¼¡«»░▒▓│┤╡╢╖╕╣║╗╝╜╛┐└┴┬├─┼╞╟╚╔╩╦╠═╬╧╨╤╥╙╘╒╓╫╪┘┌█▄▌▐▀αßΓπΣσµτΦΘΩδ∞∅∈∩≡±≥≤⌠⌡÷≈°•·√ⁿ²∎\u00a0")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("ibm850.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007fÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜø£Ø׃áíóúñѪº¿®¬½¼¡«»░▒▓│┤ÁÂÀ©╣║╗╝¢¥┐└┴┬├─┼ãÃ╚╔╩╦╠═╬¤ðÐÊËÈıÍÎÏ┘┌█▄¦Ì▀ÓßÔÒõÕµþÞÚÛÙýݯ´\u00ad±‗¾¶§÷¸°¨·¹³²∎\u00a0")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("ibm866.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007fАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмноп<D0BE><D0BF><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>рстуфхцчшщъыьэюяЁё<D081><D191><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-1.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-1.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-1.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0¡¢£¤¥¦§¨©ª«¬\u00ad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-10.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-10.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-10.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0ĄĒĢĪĨĶ§ĻĐŠŦŽ\u00adŪŊ°ąēģīĩķ·ļĐšŧž—ūŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎÏÐŅŌÓÔÕÖŨØŲÚÛÜÝÞßāáâãäåæįčéęëėíîïðņōóôõöũøųúûüýþĸ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-15.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-15.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-15.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0¡¢£€¥Š§š©ª«¬\u00ad®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-2.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-2.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-2.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0Ą˘Ł¤ĽŚ§¨ŠŞŤŹ\u00adŽŻ°ą˛ł´ľśˇ¸šşťź˝žżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-3.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-3.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-3.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0Ħ˘£¤<C2A3>Ĥ§¨İŞĞĴ\u00ad<61>Ż°ħ²³´µĥ·¸ışğĵ½<C4B5>żÀÁÂ<C381>ÄĊĈÇÈÉÊËÌÍÎÏ<C38E>ÑÒÓÔĠÖ×ĜÙÚÛÜŬŜßàáâ<C3A1>äċĉçèéêëìíîï<C3AE>ñòóôġö÷ĝùúûüŭŝ˙")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-4.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-4.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-4.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0ĄĸŖ¤ĨĻ§¨ŠĒĢŦ\u00adŽ¯°ą˛ŗ´ĩļˇ¸šēģŧŊžŋĀÁÂÃÄÅÆĮČÉĘËĖÍÎĪĐŅŌĶÔÕÖ×ØŲÚÛÜŨŪßāáâãäåæįčéęëėíîīđņōķôõö÷øųúûüũū˙")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-5.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-5.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-5.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0ЁЂЃЄЅІЇЈЉЊЋЌ\u00adЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-6.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-6.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-6.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0<61><30><EFBFBD>¤<EFBFBD><C2A4><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>،\u00ad<61><64><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>؛<EFBFBD><D89B><EFBFBD>؟<EFBFBD>ءآأؤإئابةتثجحخدذرزسشصضطظعغ<D8B9><D8BA><EFBFBD><EFBFBD><EFBFBD>ـفقكلمنهوىيًٌٍَُِّْ<D991><D992><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-7.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-7.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-7.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0‘’£<E28099><C2A3>¦§¨©<C2A8>«¬\u00ad<61>―°±²³΄΅Ά·ΈΉΊ»Ό½ΎΏΐΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ<CEA0>ΣΤΥΦΧΨΩΪΫάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋόύώ<CF8D>")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-8.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-8.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-8.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0<61>¢£¤¥¦§¨©×«¬\u00ad®‾°±²³´µ¶·¸¹÷»¼½¾<C2BD><C2BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>‗אבגדהוזחטיךכלםמןנסעףפץצקרשת<D7A9><D7AA><EFBFBD><EFBFBD><EFBFBD>")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-9.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_iso-8859-9.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("iso-8859-9.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f\u00a0¡¢£¤¥¦§¨©ª«¬\u00ad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏĞÑÒÓÔÕÖ×ØÙÚÛÜİŞßàáâãäåæçèéêëìíîïğñòóôõö÷øùúûüışÿ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_jisx0201kana.dat.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_jisx0201kana.dat.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("jisx0201kana.dat", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("koi8-r.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f─│┌┐└┘├┤┬┴┼▀▄█▌▐░▒▓⌠■∙√≈≤≥\u00a0⌡°²·÷═║╒ё╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡Ё╢╣╤╥╦╧╨╩╪╫╬©юабцдефгхийклмнопярстужвьызшэщчъЮАБЦДЕФГХИЙКЛМНОПЯРСТУЖВЬЫЗШЭЩЧЪ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1250.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1250.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("windows-1250.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f€<66>‚<EFBFBD>„…†‡<E280A0>‰Š‹ŚŤŽŹ<C5BD>‘’“”•–—<E28093>™š›śťžź\u00a0ˇ˘Ł¤Ą¦§¨©Ş«¬\u00ad®Ż°±˛ł´µ¶·¸ąş»Ľ˝ľżŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢßŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1251.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1251.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("windows-1251.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007fЂЃ‚ѓ„…†‡<E280A0>‰Љ‹ЊЌЋЏђ‘’“”•–—<E28093>™љ›њќћџ\u00a0ЎўЈ¤Ґ¦§Ё©Є«¬\u00ad®Ї°±Ііґµ¶·ё№є»јЅѕїАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1252.cp.go
generated
vendored
Normal file
18
vendor/github.com/paulrosania/go-charset/data/data_windows-1252.cp.go
generated
vendored
Normal file
|
@ -0,0 +1,18 @@
|
|||
// This file is automatically generated by generate-charset-data.
|
||||
// Do not hand-edit.
|
||||
|
||||
package data
|
||||
|
||||
import (
|
||||
"github.com/paulrosania/go-charset/charset"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func init() {
|
||||
charset.RegisterDataFile("windows-1252.cp", func() (io.ReadCloser, error) {
|
||||
r := strings.NewReader("\x00\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\u007f€<66>‚ƒ„…†‡ˆ‰Š‹Œ<E280B9>Ž<EFBFBD><C5BD>‘’“”•–—˜™š›œ<E280BA>žŸ\u00a0¡¢£¤¥¦§¨©ª«¬\u00ad®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ")
|
||||
return ioutil.NopCloser(r), nil
|
||||
})
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
// The data package embeds all the charset
|
||||
// data files as Go data. It registers the data with the charset
|
||||
// package as a side effect of its import. To use:
|
||||
//
|
||||
// import _ "github.com/paulrosania/go-charset/data"
|
||||
package data
|
|
@ -0,0 +1,97 @@
|
|||
// +build ignore
|
||||
|
||||
// go run generate.go && go fmt
|
||||
|
||||
// The generate-charset-data command generates the Go source code
|
||||
// for github.com/paulrosania/go-charset/data from the data files
|
||||
// found in github.com/paulrosania/go-charset/datafiles.
|
||||
// It should be run in the go-charset data directory: it reads ../datafiles
// and writes the generated data_*.go files to the current directory.
|
||||
// The resulting Go files will need gofmt'ing.
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// info is the value handed to tmpl for one charset data file.
type info struct {
	// Path is the location of the data file. The template feeds it to the
	// "basename" and "read" template functions.
	Path string
}
|
||||
|
||||
// tfuncs holds the helper functions exposed to the templates in this file:
// "basename" yields the last element of a path and "read" loads the whole
// content of the file at a path.
var tfuncs = template.FuncMap{
	"basename": filepath.Base,
	"read":     ioutil.ReadFile,
}
|
||||
|
||||
// tmpl renders the Go source of one generated data file. It is executed with
// a value providing a Path field: the base name of Path becomes the name the
// data is registered under, and the file content is embedded as a quoted
// string literal (both via printf "%q").
var tmpl = template.Must(template.New("").Funcs(tfuncs).Parse(`
// This file is automatically generated by generate-charset-data.
// Do not hand-edit.

package data
import (
"github.com/paulrosania/go-charset/charset"
"io"
"io/ioutil"
"strings"
)

func init() {
charset.RegisterDataFile({{basename .Path | printf "%q"}}, func() (io.ReadCloser, error) {
r := strings.NewReader({{read .Path | printf "%q"}})
return ioutil.NopCloser(r), nil
})
}
`))
|
||||
|
||||
// docTmpl renders the package documentation file of the generated package.
// It expects a value with a Package field and uses the base name of that
// field as the package name.
//
// The usage example in the rendered comment points at the data package
// itself (".../go-charset/data"): importing the repository root would not
// pull in the generated registrations.
//
// NOTE(review): nothing visible in this file executes docTmpl, and info has
// no Package field - confirm the intended caller before wiring it up.
var docTmpl = template.Must(template.New("").Funcs(tfuncs).Parse(`
// This file is automatically generated by generate-charset-data.
// Do not hand-edit.

// The {{basename .Package}} package embeds all the charset
// data files as Go data. It registers the data with the charset
// package as a side effect of its import. To use:
//
// import _ "github.com/paulrosania/go-charset/data"
package {{basename .Package}}
`))
|
||||
|
||||
func main() {
|
||||
dataDir := filepath.Join("..", "datafiles")
|
||||
d, err := os.Open(dataDir)
|
||||
if err != nil {
|
||||
fatalf("%v", err)
|
||||
}
|
||||
names, err := d.Readdirnames(0)
|
||||
if err != nil {
|
||||
fatalf("cannot read datafiles dir: %v", err)
|
||||
}
|
||||
for _, name := range names {
|
||||
writeFile("data_"+name+".go", tmpl, info{
|
||||
Path: filepath.Join(dataDir, name),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func writeFile(name string, t *template.Template, data interface{}) {
|
||||
w, err := os.Create(name)
|
||||
if err != nil {
|
||||
fatalf("cannot create output file: %v", err)
|
||||
}
|
||||
defer w.Close()
|
||||
err = t.Execute(w, data)
|
||||
if err != nil {
|
||||
fatalf("template execute %q: %v", name, err)
|
||||
}
|
||||
}
|
||||
|
||||
// fatalf prints the formatted message followed by a newline on standard
// error and terminates the program with exit status 2.
func fatalf(f string, a ...interface{}) {
	msg := fmt.Sprintf(f, a...)
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(2)
}
|
|
@ -0,0 +1,102 @@
|
|||
package chardet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
)
|
||||
|
||||
// recognizer2022 recognizes one charset by looking for the escape sequences
// it uses.
type recognizer2022 struct {
	// charset is the name reported in the recognizer output.
	charset string
	// escapes lists the byte sequences looked for directly after an ESC
	// (0x1B) byte; the ESC byte itself is not part of the entries.
	escapes [][]byte
}
|
||||
|
||||
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
return recognizerOutput{
|
||||
Charset: r.charset,
|
||||
Confidence: r.matchConfidence(input.input),
|
||||
}
|
||||
}
|
||||
|
||||
func (r *recognizer2022) matchConfidence(input []byte) int {
|
||||
var hits, misses, shifts int
|
||||
input:
|
||||
for i := 0; i < len(input); i++ {
|
||||
c := input[i]
|
||||
if c == 0x1B {
|
||||
for _, esc := range r.escapes {
|
||||
if bytes.HasPrefix(input[i+1:], esc) {
|
||||
hits++
|
||||
i += len(esc)
|
||||
continue input
|
||||
}
|
||||
}
|
||||
misses++
|
||||
} else if c == 0x0E || c == 0x0F {
|
||||
shifts++
|
||||
}
|
||||
}
|
||||
if hits == 0 {
|
||||
return 0
|
||||
}
|
||||
quality := (100*hits - 100*misses) / (hits + misses)
|
||||
if hits+shifts < 5 {
|
||||
quality -= (5 - (hits + shifts)) * 10
|
||||
}
|
||||
if quality < 0 {
|
||||
quality = 0
|
||||
}
|
||||
return quality
|
||||
}
|
||||
|
||||
// escapeSequences_2022JP lists the escape sequences used to recognize
// ISO-2022-JP. Each entry is the part that follows the ESC (0x1B) byte.
var escapeSequences_2022JP = [][]byte{
	{0x24, 0x28, 0x43}, // KS X 1001:1992
	{0x24, 0x28, 0x44}, // JIS X 212-1990
	{0x24, 0x40},       // JIS C 6226-1978
	{0x24, 0x41},       // GB 2312-80
	{0x24, 0x42},       // JIS X 208-1983
	{0x26, 0x40},       // JIS X 208 1990, 1997
	{0x28, 0x42},       // ASCII
	{0x28, 0x48},       // JIS-Roman
	{0x28, 0x49},       // Half-width katakana
	{0x28, 0x4a},       // JIS-Roman
	{0x2e, 0x41},       // ISO 8859-1
	{0x2e, 0x46},       // ISO 8859-7
}
|
||||
|
||||
// escapeSequences_2022KR lists the escape sequences used to recognize
// ISO-2022-KR. Each entry is the part that follows the ESC (0x1B) byte.
var escapeSequences_2022KR = [][]byte{
	{0x24, 0x29, 0x43},
}
|
||||
|
||||
// escapeSequences_2022CN lists the escape sequences used to recognize
// ISO-2022-CN. Each entry is the part that follows the ESC (0x1B) byte.
var escapeSequences_2022CN = [][]byte{
	{0x24, 0x29, 0x41}, // GB 2312-80
	{0x24, 0x29, 0x47}, // CNS 11643-1992 Plane 1
	{0x24, 0x2A, 0x48}, // CNS 11643-1992 Plane 2
	{0x24, 0x29, 0x45}, // ISO-IR-165
	{0x24, 0x2B, 0x49}, // CNS 11643-1992 Plane 3
	{0x24, 0x2B, 0x4A}, // CNS 11643-1992 Plane 4
	{0x24, 0x2B, 0x4B}, // CNS 11643-1992 Plane 5
	{0x24, 0x2B, 0x4C}, // CNS 11643-1992 Plane 6
	{0x24, 0x2B, 0x4D}, // CNS 11643-1992 Plane 7
	{0x4e},             // SS2
	{0x4f},             // SS3
}
|
||||
|
||||
func newRecognizer_2022JP() *recognizer2022 {
|
||||
return &recognizer2022{
|
||||
"ISO-2022-JP",
|
||||
escapeSequences_2022JP,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_2022KR() *recognizer2022 {
|
||||
return &recognizer2022{
|
||||
"ISO-2022-KR",
|
||||
escapeSequences_2022KR,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_2022CN() *recognizer2022 {
|
||||
return &recognizer2022{
|
||||
"ISO-2022-CN",
|
||||
escapeSequences_2022CN,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
Copyright (c) 2012 chardet Authors
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
of the Software, and to permit persons to whom the Software is furnished to do
|
||||
so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Partial of the Software is derived from ICU project. See icu-license.html for
|
||||
license of the derivative portions.
|
|
@ -0,0 +1,136 @@
|
|||
// Package chardet ports character set detection from ICU.
|
||||
package chardet
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// Result contains all the information that charset detector gives.
type Result struct {
	// IANA name of the detected charset.
	Charset string
	// IANA name of the detected language. It may be empty for some charsets.
	Language string
	// Confidence of the Result. Scale from 1 to 100. The bigger, the more confident.
	// DetectAll only returns Results whose Confidence is greater than zero.
	Confidence int
}
|
||||
|
||||
// Detector implements charset detection.
type Detector struct {
	// recognizers are the charset recognizers run by DetectAll.
	recognizers []recognizer
	// stripTag is handed to newRecognizerInput; NewHtmlDetector sets it to
	// true and NewTextDetector to false.
	stripTag bool
}
|
||||
|
||||
// List of charset recognizers. The same list is used by the Detectors
// created by NewTextDetector and NewHtmlDetector.
var recognizers = []recognizer{
	newRecognizer_utf8(),
	newRecognizer_utf16be(),
	newRecognizer_utf16le(),
	newRecognizer_utf32be(),
	newRecognizer_utf32le(),
	newRecognizer_8859_1_en(),
	newRecognizer_8859_1_da(),
	newRecognizer_8859_1_de(),
	newRecognizer_8859_1_es(),
	newRecognizer_8859_1_fr(),
	newRecognizer_8859_1_it(),
	newRecognizer_8859_1_nl(),
	newRecognizer_8859_1_no(),
	newRecognizer_8859_1_pt(),
	newRecognizer_8859_1_sv(),
	newRecognizer_8859_2_cs(),
	newRecognizer_8859_2_hu(),
	newRecognizer_8859_2_pl(),
	newRecognizer_8859_2_ro(),
	newRecognizer_8859_5_ru(),
	newRecognizer_8859_6_ar(),
	newRecognizer_8859_7_el(),
	newRecognizer_8859_8_I_he(),
	newRecognizer_8859_8_he(),
	newRecognizer_windows_1251(),
	newRecognizer_windows_1256(),
	newRecognizer_KOI8_R(),
	newRecognizer_8859_9_tr(),

	newRecognizer_sjis(),
	newRecognizer_gb_18030(),
	newRecognizer_euc_jp(),
	newRecognizer_euc_kr(),
	newRecognizer_big5(),

	newRecognizer_2022JP(),
	newRecognizer_2022KR(),
	newRecognizer_2022CN(),

	newRecognizer_IBM424_he_rtl(),
	newRecognizer_IBM424_he_ltr(),
	newRecognizer_IBM420_ar_rtl(),
	newRecognizer_IBM420_ar_ltr(),
}
|
||||
|
||||
// NewTextDetector creates a Detector for plain text.
|
||||
func NewTextDetector() *Detector {
|
||||
return &Detector{recognizers, false}
|
||||
}
|
||||
|
||||
// NewHtmlDetector creates a Detector for Html.
|
||||
func NewHtmlDetector() *Detector {
|
||||
return &Detector{recognizers, true}
|
||||
}
|
||||
|
||||
var (
	// NotDetectedError is returned by DetectAll, and therefore by
	// DetectBest, when no recognizer reports a Confidence above zero.
	NotDetectedError = errors.New("Charset not detected.")
)
|
||||
|
||||
// DetectBest returns the Result with highest Confidence.
|
||||
func (d *Detector) DetectBest(b []byte) (r *Result, err error) {
|
||||
var all []Result
|
||||
if all, err = d.DetectAll(b); err == nil {
|
||||
r = &all[0]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// DetectAll returns all Results which have non-zero Confidence. The Results are sorted by Confidence in descending order.
|
||||
func (d *Detector) DetectAll(b []byte) ([]Result, error) {
|
||||
input := newRecognizerInput(b, d.stripTag)
|
||||
outputChan := make(chan recognizerOutput)
|
||||
for _, r := range d.recognizers {
|
||||
go matchHelper(r, input, outputChan)
|
||||
}
|
||||
outputs := make([]recognizerOutput, 0, len(d.recognizers))
|
||||
for i := 0; i < len(d.recognizers); i++ {
|
||||
o := <-outputChan
|
||||
if o.Confidence > 0 {
|
||||
outputs = append(outputs, o)
|
||||
}
|
||||
}
|
||||
if len(outputs) == 0 {
|
||||
return nil, NotDetectedError
|
||||
}
|
||||
|
||||
sort.Sort(recognizerOutputs(outputs))
|
||||
dedupOutputs := make([]Result, 0, len(outputs))
|
||||
foundCharsets := make(map[string]struct{}, len(outputs))
|
||||
for _, o := range outputs {
|
||||
if _, found := foundCharsets[o.Charset]; !found {
|
||||
dedupOutputs = append(dedupOutputs, Result(o))
|
||||
foundCharsets[o.Charset] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(dedupOutputs) == 0 {
|
||||
return nil, NotDetectedError
|
||||
}
|
||||
return dedupOutputs, nil
|
||||
}
|
||||
|
||||
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
|
||||
outputChan <- r.Match(input)
|
||||
}
|
||||
|
||||
type recognizerOutputs []recognizerOutput
|
||||
|
||||
func (r recognizerOutputs) Len() int { return len(r) }
|
||||
func (r recognizerOutputs) Less(i, j int) bool { return r[i].Confidence > r[j].Confidence }
|
||||
func (r recognizerOutputs) Swap(i, j int) { r[i], r[j] = r[j], r[i] }
|
|
@ -0,0 +1,345 @@
|
|||
package chardet
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"math"
|
||||
)
|
||||
|
||||
// recognizerMultiByte recognizes a multi-byte charset by decoding the input
// character by character and scoring the outcome.
type recognizerMultiByte struct {
	// charset and language are copied into the recognizer output.
	charset  string
	language string
	// decoder splits the raw input into characters.
	decoder charDecoder
	// commonChars, when non-nil, lists character codes whose occurrences
	// raise the confidence. It is looked up with binarySearch.
	commonChars []uint16
}
|
||||
|
||||
// charDecoder decodes a byte sequence one character at a time.
type charDecoder interface {
	// DecodeOneChar decodes the character at the start of the given bytes
	// and returns its code c, the bytes that follow it in remain, and a
	// non-nil err if no valid character could be decoded.
	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
|
||||
|
||||
func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
return recognizerOutput{
|
||||
Charset: r.charset,
|
||||
Language: r.language,
|
||||
Confidence: r.matchConfidence(input),
|
||||
}
|
||||
}
|
||||
|
||||
func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
|
||||
raw := input.raw
|
||||
var c uint16
|
||||
var err error
|
||||
var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int
|
||||
for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) {
|
||||
totalCharCount++
|
||||
if err != nil {
|
||||
badCharCount++
|
||||
} else if c <= 0xFF {
|
||||
singleByteCharCount++
|
||||
} else {
|
||||
doubleByteCharCount++
|
||||
if r.commonChars != nil && binarySearch(r.commonChars, c) {
|
||||
commonCharCount++
|
||||
}
|
||||
}
|
||||
if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
if doubleByteCharCount <= 10 && badCharCount == 0 {
|
||||
if doubleByteCharCount == 0 && totalCharCount < 10 {
|
||||
return 0
|
||||
} else {
|
||||
return 10
|
||||
}
|
||||
}
|
||||
|
||||
if doubleByteCharCount < 20*badCharCount {
|
||||
return 0
|
||||
}
|
||||
if r.commonChars == nil {
|
||||
confidence := 30 + doubleByteCharCount - 20*badCharCount
|
||||
if confidence > 100 {
|
||||
confidence = 100
|
||||
}
|
||||
return confidence
|
||||
}
|
||||
maxVal := math.Log(float64(doubleByteCharCount) / 4)
|
||||
scaleFactor := 90 / maxVal
|
||||
confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
|
||||
if confidence > 100 {
|
||||
confidence = 100
|
||||
}
|
||||
if confidence < 0 {
|
||||
confidence = 0
|
||||
}
|
||||
return confidence
|
||||
}
|
||||
|
||||
// binarySearch reports whether c occurs in l. It bisects the slice, so l is
// expected to be sorted in ascending order.
func binarySearch(l []uint16, c uint16) bool {
	lo, hi := 0, len(l)-1
	for lo <= hi {
		mid := (lo + hi) / 2
		switch probe := l[mid]; {
		case probe == c:
			return true
		case c < probe:
			hi = mid - 1
		default:
			lo = mid + 1
		}
	}
	return false
}
|
||||
|
||||
// eobError is returned by the decoders in this file when the input ends
// before a character could be read.
var eobError = errors.New("End of input buffer")

// badCharError is returned by the decoders in this file when the bytes read
// do not form a valid character.
var badCharError = errors.New("Decode a bad char")
|
||||
|
||||
type charDecoder_sjis struct {
|
||||
}
|
||||
|
||||
func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
|
||||
if len(input) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
first := input[0]
|
||||
c = uint16(first)
|
||||
remain = input[1:]
|
||||
if first <= 0x7F || (first > 0xA0 && first <= 0xDF) {
|
||||
return
|
||||
}
|
||||
if len(remain) == 0 {
|
||||
return c, remain, badCharError
|
||||
}
|
||||
second := remain[0]
|
||||
remain = remain[1:]
|
||||
c = c<<8 | uint16(second)
|
||||
if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) {
|
||||
} else {
|
||||
err = badCharError
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// commonChars_sjis holds double-byte character values (lead<<8 | trail) that
// are passed to the Shift_JIS recognizer as its common-character table.
// The entries are listed in ascending order.
var commonChars_sjis = []uint16{
	0x8140, 0x8141, 0x8142, 0x8145, 0x815b,
	0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab,
	0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4,
	0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8,
	0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	0x8343, 0x834e, 0x834f, 0x8358, 0x835e,
	0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96,
	0x93fa, 0x95aa,
}
|
||||
|
||||
func newRecognizer_sjis() *recognizerMultiByte {
|
||||
return &recognizerMultiByte{
|
||||
"Shift_JIS",
|
||||
"ja",
|
||||
charDecoder_sjis{},
|
||||
commonChars_sjis,
|
||||
}
|
||||
}
|
||||
|
||||
type charDecoder_euc struct {
|
||||
}
|
||||
|
||||
func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
|
||||
if len(input) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
first := input[0]
|
||||
remain = input[1:]
|
||||
c = uint16(first)
|
||||
if first <= 0x8D {
|
||||
return uint16(first), remain, nil
|
||||
}
|
||||
if len(remain) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
second := remain[0]
|
||||
remain = remain[1:]
|
||||
c = c<<8 | uint16(second)
|
||||
if first >= 0xA1 && first <= 0xFE {
|
||||
if second < 0xA1 {
|
||||
err = badCharError
|
||||
}
|
||||
return
|
||||
}
|
||||
if first == 0x8E {
|
||||
if second < 0xA1 {
|
||||
err = badCharError
|
||||
}
|
||||
return
|
||||
}
|
||||
if first == 0x8F {
|
||||
if len(remain) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
third := remain[0]
|
||||
remain = remain[1:]
|
||||
c = c<<0 | uint16(third)
|
||||
if third < 0xa1 {
|
||||
err = badCharError
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// commonChars_euc_jp holds double-byte character values (lead<<8 | trail)
// that are passed to the EUC-JP recognizer as its common-character table.
// The entries are listed in ascending order.
var commonChars_euc_jp = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc,
	0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab,
	0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd,
	0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca,
	0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8,
	0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4,
	0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8,
	0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5,
	0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec,
	0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc,
	0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
|
||||
|
||||
// commonChars_euc_kr holds double-byte character values (lead<<8 | trail)
// that are passed to the EUC-KR recognizer as its common-character table.
// The entries are listed in ascending order.
var commonChars_euc_kr = []uint16{
	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4,
	0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2,
	0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7,
	0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb,
	0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1,
	0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3,
	0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0,
	0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd,
	0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4,
	0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf,
	0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
|
||||
|
||||
func newRecognizer_euc_jp() *recognizerMultiByte {
|
||||
return &recognizerMultiByte{
|
||||
"EUC-JP",
|
||||
"ja",
|
||||
charDecoder_euc{},
|
||||
commonChars_euc_jp,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_euc_kr() *recognizerMultiByte {
|
||||
return &recognizerMultiByte{
|
||||
"EUC-KR",
|
||||
"ko",
|
||||
charDecoder_euc{},
|
||||
commonChars_euc_kr,
|
||||
}
|
||||
}
|
||||
|
||||
type charDecoder_big5 struct {
|
||||
}
|
||||
|
||||
func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
|
||||
if len(input) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
first := input[0]
|
||||
remain = input[1:]
|
||||
c = uint16(first)
|
||||
if first <= 0x7F || first == 0xFF {
|
||||
return
|
||||
}
|
||||
if len(remain) == 0 {
|
||||
return c, nil, eobError
|
||||
}
|
||||
second := remain[0]
|
||||
remain = remain[1:]
|
||||
c = c<<8 | uint16(second)
|
||||
if second < 0x40 || second == 0x7F || second == 0xFF {
|
||||
err = badCharError
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// commonChars_big5 holds double-byte character values (lead<<8 | trail) that
// are passed to the Big5 recognizer as its common-character table.
// The entries are listed in ascending order.
var commonChars_big5 = []uint16{
	0xa140, 0xa141, 0xa142, 0xa143, 0xa147,
	0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	0xa447, 0xa448, 0xa451, 0xa454, 0xa457,
	0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1,
	0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657,
	0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6,
	0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3,
	0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc,
	0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3,
	0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671,
	0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440,
	0xc45f,
}
|
||||
|
||||
func newRecognizer_big5() *recognizerMultiByte {
|
||||
return &recognizerMultiByte{
|
||||
"Big5",
|
||||
"zh",
|
||||
charDecoder_big5{},
|
||||
commonChars_big5,
|
||||
}
|
||||
}
|
||||
|
||||
type charDecoder_gb_18030 struct {
|
||||
}
|
||||
|
||||
func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
|
||||
if len(input) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
first := input[0]
|
||||
remain = input[1:]
|
||||
c = uint16(first)
|
||||
if first <= 0x80 {
|
||||
return
|
||||
}
|
||||
if len(remain) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
second := remain[0]
|
||||
remain = remain[1:]
|
||||
c = c<<8 | uint16(second)
|
||||
if first >= 0x81 && first <= 0xFE {
|
||||
if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
|
||||
return
|
||||
}
|
||||
|
||||
if second >= 0x30 && second <= 0x39 {
|
||||
if len(remain) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
third := remain[0]
|
||||
remain = remain[1:]
|
||||
if third >= 0x81 && third <= 0xFE {
|
||||
if len(remain) == 0 {
|
||||
return 0, nil, eobError
|
||||
}
|
||||
fourth := remain[0]
|
||||
remain = remain[1:]
|
||||
if fourth >= 0x30 && fourth <= 0x39 {
|
||||
c = c<<16 | uint16(third)<<8 | uint16(fourth)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
err = badCharError
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// commonChars_gb_18030 holds double-byte character values (lead<<8 | trail)
// that are passed to the GB-18030 recognizer as its common-character table.
// The entries are listed in ascending order.
var commonChars_gb_18030 = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0,
	0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb,
	0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2,
	0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8,
	0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe,
	0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db,
	0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7,
	0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec,
	0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5,
	0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0,
	0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
|
||||
|
||||
func newRecognizer_gb_18030() *recognizerMultiByte {
|
||||
return &recognizerMultiByte{
|
||||
"GB-18030",
|
||||
"zh",
|
||||
charDecoder_gb_18030{},
|
||||
commonChars_gb_18030,
|
||||
}
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package chardet
|
||||
|
||||
type recognizer interface {
|
||||
Match(*recognizerInput) recognizerOutput
|
||||
}
|
||||
|
||||
// recognizerOutput is the value returned by recognizer.Match; it has the same
// underlying type as Result.
type recognizerOutput Result
|
||||
|
||||
// recognizerInput bundles the data handed to every recognizer's Match method.
// It is built once per detection by newRecognizerInput.
type recognizerInput struct {
	// raw is the caller's original data; input is the (possibly
	// tag-stripped, length-limited) copy produced by mayStripInput.
	raw, input []byte
	// tagStripped is the second result of mayStripInput: true when input
	// had markup removed.
	tagStripped bool
	// byteStats[b] is the number of occurrences of byte b in input.
	byteStats []int
	// hasC1Bytes is true when input contains a byte in 0x80..0x9F.
	hasC1Bytes bool
}
|
||||
|
||||
func newRecognizerInput(raw []byte, stripTag bool) *recognizerInput {
|
||||
input, stripped := mayStripInput(raw, stripTag)
|
||||
byteStats := computeByteStats(input)
|
||||
return &recognizerInput{
|
||||
raw: raw,
|
||||
input: input,
|
||||
tagStripped: stripped,
|
||||
byteStats: byteStats,
|
||||
hasC1Bytes: computeHasC1Bytes(byteStats),
|
||||
}
|
||||
}
|
||||
|
||||
// mayStripInput returns the bytes the recognizers should look at, limited to
// 8192 bytes, and whether markup was stripped from them.
//
// When stripTag is true, everything between '<' and '>' (inclusive) is
// dropped. The stripped text is discarded in favour of a plain copy of the
// first 8192 bytes of raw, and stripped is false, when any of these holds:
//   - fewer than 5 '<' were seen (this is always the case when stripTag is
//     false);
//   - more than a fifth of them (integer division) were opened while already
//     inside a tag;
//   - fewer than 100 bytes survived stripping although raw is longer than
//     600 bytes.
//
// The returned slice never aliases raw.
func mayStripInput(raw []byte, stripTag bool) (out []byte, stripped bool) {
	const inputBufferSize = 8192

	var (
		tagsOpened, tagsBroken int32
		insideTag              bool
	)
	out = make([]byte, 0, inputBufferSize)
	stripped = stripTag

	if stripTag {
		for i := 0; i < len(raw); i++ {
			b := raw[i]
			if b == '<' {
				if insideTag {
					tagsBroken++
				}
				insideTag = true
				tagsOpened++
			}
			if !insideTag {
				out = append(out, b)
				if len(out) >= inputBufferSize {
					break
				}
			}
			if b == '>' {
				insideTag = false
			}
		}
	}

	notEnoughTags := tagsOpened < 5
	tooManyBroken := tagsOpened/5 < tagsBroken
	almostAllMarkup := len(out) < 100 && len(raw) > 600
	if notEnoughTags || tooManyBroken || almostAllMarkup {
		n := len(raw)
		if n > inputBufferSize {
			n = inputBufferSize
		}
		out = make([]byte, n)
		copy(out, raw[:n])
		stripped = false
	}
	return out, stripped
}
|
||||
|
||||
// computeByteStats returns a 256-entry histogram of input: element b holds
// the number of times byte value b occurs.
func computeByteStats(input []byte) []int {
	histogram := make([]int, 256)
	for i := 0; i < len(input); i++ {
		histogram[input[i]]++
	}
	return histogram
}
|
||||
|
||||
// computeHasC1Bytes reports whether the histogram records at least one byte
// in the range 0x80..0x9F. byteStats must cover at least indices 0..0x9F.
func computeHasC1Bytes(byteStats []int) bool {
	c1Counts := byteStats[0x80 : 0x9F+1]
	for i := 0; i < len(c1Counts); i++ {
		if c1Counts[i] > 0 {
			return true
		}
	}
	return false
}
|
|
@ -0,0 +1,882 @@
|
|||
package chardet
|
||||
|
||||
// recognizerSingleByte is the recognizer for the single-byte charset family.
// One value describes one (charset, language) pair.
type recognizerSingleByte struct {
	// charset is the name reported by Match.
	charset string
	// hasC1ByteCharset, when non-empty, is reported instead of charset if
	// the input contains bytes in the C1 range.
	hasC1ByteCharset string
	// language is the language code reported by Match.
	language string
	// charMap translates every input byte before it enters the n-gram
	// state; a zero entry drops the byte.
	charMap *[256]byte
	// ngram is the table of n-grams counted as hits for this language.
	ngram *[64]uint32
}
|
||||
|
||||
func (r *recognizerSingleByte) Match(input *recognizerInput) recognizerOutput {
|
||||
var charset string = r.charset
|
||||
if input.hasC1Bytes && len(r.hasC1ByteCharset) > 0 {
|
||||
charset = r.hasC1ByteCharset
|
||||
}
|
||||
return recognizerOutput{
|
||||
Charset: charset,
|
||||
Language: r.language,
|
||||
Confidence: r.parseNgram(input.input),
|
||||
}
|
||||
}
|
||||
|
||||
// ngramState accumulates a sliding window of the last three bytes fed to it
// and counts how many of the resulting n-grams appear in table.
type ngramState struct {
	// ngram is the current window, most recent byte in the low 8 bits.
	ngram uint32
	// ignoreSpace is true when the previous byte was a space, so that a run
	// of spaces contributes only once.
	ignoreSpace bool
	// ngramCount is the number of n-grams looked up; ngramHit is how many
	// of them were found in table.
	ngramCount, ngramHit uint32
	// table is the sorted (ascending) set of n-grams that count as hits.
	table *[64]uint32
}

// newNgramState returns an empty state that looks n-grams up in table.
func newNgramState(table *[64]uint32) *ngramState {
	return &ngramState{table: table}
}

// AddByte shifts b into the n-gram window and records one lookup, unless b is
// a space (0x20) that directly follows another space, in which case it is
// skipped.
func (s *ngramState) AddByte(b byte) {
	const ngramMask = 0xFFFFFF
	isSpace := b == 0x20
	if !(isSpace && s.ignoreSpace) {
		s.ngram = ((s.ngram << 8) | uint32(b)) & ngramMask
		s.ngramCount++
		if s.lookup() {
			s.ngramHit++
		}
	}
	// The flag depends only on the byte just seen. (An earlier assignment
	// inside the branch above was always overwritten here and is gone.)
	s.ignoreSpace = isSpace
}

// HitRate returns the fraction of looked-up n-grams that were found in the
// table, or 0 when nothing has been looked up yet.
func (s *ngramState) HitRate() float32 {
	if s.ngramCount == 0 {
		return 0
	}
	return float32(s.ngramHit) / float32(s.ngramCount)
}

// lookup reports whether the current n-gram is present in the table. It is a
// binary search over the 64 sorted entries, probing with steps of
// 32, 16, 8, 4, 2 and 1.
func (s *ngramState) lookup() bool {
	index := 0
	for step := 32; step >= 1; step >>= 1 {
		if s.table[index+step] <= s.ngram {
			index += step
		}
	}
	// Only possible when index is still 0 and the n-gram sorts before the
	// first entry.
	if s.table[index] > s.ngram {
		index--
	}
	return index >= 0 && s.table[index] == s.ngram
}
|
||||
|
||||
func (r *recognizerSingleByte) parseNgram(input []byte) int {
|
||||
state := newNgramState(r.ngram)
|
||||
for _, inChar := range input {
|
||||
c := r.charMap[inChar]
|
||||
if c != 0 {
|
||||
state.AddByte(c)
|
||||
}
|
||||
}
|
||||
state.AddByte(0x20)
|
||||
rate := state.HitRate()
|
||||
if rate > 0.33 {
|
||||
return 98
|
||||
}
|
||||
return int(rate * 300)
|
||||
}
|
||||
|
||||
// charMap_8859_1 is the byte translation table used by the ISO-8859-1
// recognizers before n-gram matching. Index = input byte, value = byte fed to
// the n-gram state: ASCII letters are folded to lower case, upper-case
// accented letters are folded onto their lower-case row, most other bytes
// become a space (0x20), and a 0x00 entry (only 0x27) drops the byte.
var charMap_8859_1 = [256]byte{
	// 0x00-0x3F
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x40-0x7F
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x80-0xBF
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0xC0-0xFF
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
}
|
||||
|
||||
// ngrams_8859_1_en is the three-byte n-gram table for English text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_en = [64]uint32{
	0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E,
	0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
	0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465,
	0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
	0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20,
	0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
	0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169,
	0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
}
|
||||
|
||||
// ngrams_8859_1_da is the three-byte n-gram table for Danish text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_da = [64]uint32{
	0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920,
	0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
	0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574,
	0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
	0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67,
	0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
	0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065,
	0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
}
|
||||
|
||||
// ngrams_8859_1_de is the three-byte n-gram table for German text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_de = [64]uint32{
	0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765,
	0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
	0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E,
	0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
	0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65,
	0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
	0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368,
	0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
}
|
||||
|
||||
// ngrams_8859_1_es is the three-byte n-gram table for Spanish text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_es = [64]uint32{
	0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E,
	0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
	0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369,
	0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
	0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320,
	0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
	0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573,
	0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
}
|
||||
|
||||
// ngrams_8859_1_fr is the three-byte n-gram table for French text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_fr = [64]uint32{
	0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61,
	0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
	0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520,
	0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
	0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420,
	0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
	0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064,
	0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
}
|
||||
|
||||
// ngrams_8859_1_it is the three-byte n-gram table for Italian text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_it = [64]uint32{
	0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E,
	0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
	0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064,
	0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
	0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20,
	0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
	0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572,
	0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
}
|
||||
|
||||
// ngrams_8859_1_nl is the three-byte n-gram table for Dutch text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_nl = [64]uint32{
	0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765,
	0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
	0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220,
	0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
	0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520,
	0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
	0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368,
	0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
}
|
||||
|
||||
// ngrams_8859_1_no is the three-byte n-gram table for Norwegian text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_no = [64]uint32{
	0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920,
	0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
	0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574,
	0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
	0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520,
	0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
	0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465,
	0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
}
|
||||
|
||||
// ngrams_8859_1_pt is the three-byte n-gram table for Portuguese text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_pt = [64]uint32{
	0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61,
	0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
	0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20,
	0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
	0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120,
	0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
	0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064,
	0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
}
|
||||
|
||||
// ngrams_8859_1_sv is the three-byte n-gram table for Swedish text in
// ISO-8859-1 (bytes as produced by charMap_8859_1). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_1_sv = [64]uint32{
	0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E,
	0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
	0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474,
	0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
	0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564,
	0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
	0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073,
	0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
}
|
||||
|
||||
func newRecognizer_8859_1(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-1",
|
||||
hasC1ByteCharset: "windows-1252",
|
||||
language: language,
|
||||
charMap: &charMap_8859_1,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_1_en() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("en", &ngrams_8859_1_en)
|
||||
}
|
||||
func newRecognizer_8859_1_da() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("da", &ngrams_8859_1_da)
|
||||
}
|
||||
func newRecognizer_8859_1_de() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("de", &ngrams_8859_1_de)
|
||||
}
|
||||
func newRecognizer_8859_1_es() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("es", &ngrams_8859_1_es)
|
||||
}
|
||||
func newRecognizer_8859_1_fr() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("fr", &ngrams_8859_1_fr)
|
||||
}
|
||||
func newRecognizer_8859_1_it() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("it", &ngrams_8859_1_it)
|
||||
}
|
||||
func newRecognizer_8859_1_nl() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("nl", &ngrams_8859_1_nl)
|
||||
}
|
||||
func newRecognizer_8859_1_no() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("no", &ngrams_8859_1_no)
|
||||
}
|
||||
func newRecognizer_8859_1_pt() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("pt", &ngrams_8859_1_pt)
|
||||
}
|
||||
func newRecognizer_8859_1_sv() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("sv", &ngrams_8859_1_sv)
|
||||
}
|
||||
|
||||
// charMap_8859_2 is the byte translation table for ISO-8859-2 text that
// newRecognizer_8859_2 installs. Index = input byte, value = byte fed to the
// n-gram state: ASCII letters are folded to lower case, upper-case letters in
// 0xA0-0xDF are folded onto their lower-case counterparts, most other bytes
// become a space (0x20), and a 0x00 entry (only 0x27) drops the byte.
var charMap_8859_2 = [256]byte{
	// 0x00-0x3F
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x40-0x7F
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x80-0xBF
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
	0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
	// 0xC0-0xFF
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
}
|
||||
|
||||
// ngrams_8859_2_cs is the three-byte n-gram table for Czech text in
// ISO-8859-2. It must stay sorted in ascending order because
// ngramState.lookup binary-searches it.
var ngrams_8859_2_cs = [64]uint32{
	0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64,
	0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
	0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073,
	0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
	0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E,
	0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
	0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20,
	0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
}
|
||||
|
||||
// ngrams_8859_2_hu is the three-byte n-gram table for Hungarian text in
// ISO-8859-2. It must stay sorted in ascending order because
// ngramState.lookup binary-searches it.
var ngrams_8859_2_hu = [64]uint32{
	0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F,
	0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
	0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073,
	0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
	0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920,
	0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
	0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74,
	0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
}
|
||||
|
||||
// ngrams_8859_2_pl is the three-byte n-gram table for Polish text in
// ISO-8859-2. It must stay sorted in ascending order because
// ngramState.lookup binary-searches it.
var ngrams_8859_2_pl = [64]uint32{
	0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61,
	0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
	0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79,
	0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
	0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920,
	0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
	0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69,
	0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
}
|
||||
|
||||
// ngrams_8859_2_ro is the three-byte n-gram table for Romanian text in
// ISO-8859-2. It must stay sorted in ascending order because
// ngramState.lookup binary-searches it.
var ngrams_8859_2_ro = [64]uint32{
	0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469,
	0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
	0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172,
	0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
	0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063,
	0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
	0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520,
	0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
}
|
||||
|
||||
func newRecognizer_8859_2(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-2",
|
||||
hasC1ByteCharset: "windows-1250",
|
||||
language: language,
|
||||
charMap: &charMap_8859_2,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_2_cs() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("cs", &ngrams_8859_2_cs)
|
||||
}
|
||||
func newRecognizer_8859_2_hu() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("hu", &ngrams_8859_2_hu)
|
||||
}
|
||||
func newRecognizer_8859_2_pl() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("pl", &ngrams_8859_2_pl)
|
||||
}
|
||||
func newRecognizer_8859_2_ro() *recognizerSingleByte {
|
||||
return newRecognizer_8859_1("ro", &ngrams_8859_2_ro)
|
||||
}
|
||||
|
||||
// charMap_8859_5 is the byte translation table used by the ISO-8859-5
// recognizer before n-gram matching. Index = input byte, value = byte fed to
// the n-gram state: ASCII letters are folded to lower case, bytes 0xA1-0xAF
// map onto 0xF1-0xFF and 0xB0-0xCF onto 0xD0-0xEF, most other bytes become a
// space (0x20), and a 0x00 entry (only 0x27) drops the byte.
var charMap_8859_5 = [256]byte{
	// 0x00-0x3F
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x40-0x7F
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
	// 0x80-0xBF
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
	0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
	0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
	// 0xC0-0xFF
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
	0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
	0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
}
|
||||
|
||||
// ngrams_8859_5_ru is the three-byte n-gram table for Russian text in
// ISO-8859-5 (bytes as produced by charMap_8859_5). It must stay sorted in
// ascending order because ngramState.lookup binary-searches it.
var ngrams_8859_5_ru = [64]uint32{
	0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0,
	0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
	0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2,
	0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
	0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF,
	0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
	0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2,
	0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
}
|
||||
|
||||
func newRecognizer_8859_5(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-5",
|
||||
language: language,
|
||||
charMap: &charMap_8859_5,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_5_ru() *recognizerSingleByte {
|
||||
return newRecognizer_8859_5("ru", &ngrams_8859_5_ru)
|
||||
}
|
||||
|
||||
var charMap_8859_6 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
|
||||
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
|
||||
0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
}
|
||||
|
||||
var ngrams_8859_6_ar = [64]uint32{
|
||||
0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
|
||||
0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
|
||||
0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
|
||||
0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
|
||||
}
|
||||
|
||||
func newRecognizer_8859_6(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-6",
|
||||
language: language,
|
||||
charMap: &charMap_8859_6,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_6_ar() *recognizerSingleByte {
|
||||
return newRecognizer_8859_6("ar", &ngrams_8859_6_ar)
|
||||
}
|
||||
|
||||
var charMap_8859_7 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
|
||||
0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
|
||||
0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
|
||||
}
|
||||
|
||||
var ngrams_8859_7_el = [64]uint32{
|
||||
0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
|
||||
0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
|
||||
0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
|
||||
0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
|
||||
}
|
||||
|
||||
func newRecognizer_8859_7(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-7",
|
||||
hasC1ByteCharset: "windows-1253",
|
||||
language: language,
|
||||
charMap: &charMap_8859_7,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_7_el() *recognizerSingleByte {
|
||||
return newRecognizer_8859_7("el", &ngrams_8859_7_el)
|
||||
}
|
||||
|
||||
var charMap_8859_8 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
||||
0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
}
|
||||
|
||||
var ngrams_8859_8_I_he = [64]uint32{
|
||||
0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
|
||||
0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
|
||||
0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
|
||||
0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
|
||||
}
|
||||
|
||||
var ngrams_8859_8_he = [64]uint32{
|
||||
0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
|
||||
0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
|
||||
0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
|
||||
0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
|
||||
}
|
||||
|
||||
func newRecognizer_8859_8(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-8",
|
||||
hasC1ByteCharset: "windows-1255",
|
||||
language: language,
|
||||
charMap: &charMap_8859_8,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_8_I_he() *recognizerSingleByte {
|
||||
r := newRecognizer_8859_8("he", &ngrams_8859_8_I_he)
|
||||
r.charset = "ISO-8859-8-I"
|
||||
return r
|
||||
}
|
||||
|
||||
func newRecognizer_8859_8_he() *recognizerSingleByte {
|
||||
return newRecognizer_8859_8("he", &ngrams_8859_8_he)
|
||||
}
|
||||
|
||||
var charMap_8859_9 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
|
||||
0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
}
|
||||
|
||||
var ngrams_8859_9_tr = [64]uint32{
|
||||
0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
|
||||
0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
|
||||
0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
|
||||
0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
|
||||
}
|
||||
|
||||
func newRecognizer_8859_9(language string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "ISO-8859-9",
|
||||
hasC1ByteCharset: "windows-1254",
|
||||
language: language,
|
||||
charMap: &charMap_8859_9,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_8859_9_tr() *recognizerSingleByte {
|
||||
return newRecognizer_8859_9("tr", &ngrams_8859_9_tr)
|
||||
}
|
||||
|
||||
var charMap_windows_1256 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
|
||||
0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
|
||||
0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
|
||||
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
|
||||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
|
||||
0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
|
||||
}
|
||||
|
||||
var ngrams_windows_1256 = [64]uint32{
|
||||
0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
|
||||
0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
|
||||
0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
|
||||
0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
|
||||
}
|
||||
|
||||
func newRecognizer_windows_1256() *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "windows-1256",
|
||||
language: "ar",
|
||||
charMap: &charMap_windows_1256,
|
||||
ngram: &ngrams_windows_1256,
|
||||
}
|
||||
}
|
||||
|
||||
var charMap_windows_1251 = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
|
||||
0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
|
||||
0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
|
||||
0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
|
||||
0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
|
||||
0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
|
||||
0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
|
||||
0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
|
||||
0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
|
||||
}
|
||||
|
||||
var ngrams_windows_1251 = [64]uint32{
|
||||
0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
|
||||
0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
|
||||
0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
|
||||
0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
|
||||
}
|
||||
|
||||
func newRecognizer_windows_1251() *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "windows-1251",
|
||||
language: "ar",
|
||||
charMap: &charMap_windows_1251,
|
||||
ngram: &ngrams_windows_1251,
|
||||
}
|
||||
}
|
||||
|
||||
var charMap_KOI8_R = [256]byte{
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
|
||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
|
||||
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
|
||||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
|
||||
0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
|
||||
0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
|
||||
0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
}
|
||||
|
||||
var ngrams_KOI8_R = [64]uint32{
|
||||
0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
|
||||
0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
|
||||
0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
|
||||
0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
|
||||
}
|
||||
|
||||
func newRecognizer_KOI8_R() *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: "KOI8-R",
|
||||
language: "ru",
|
||||
charMap: &charMap_KOI8_R,
|
||||
ngram: &ngrams_KOI8_R,
|
||||
}
|
||||
}
|
||||
|
||||
var charMap_IBM424_he = [256]byte{
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
|
||||
/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
}
|
||||
|
||||
var ngrams_IBM424_he_rtl = [64]uint32{
|
||||
0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
|
||||
0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
|
||||
0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
|
||||
0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
|
||||
}
|
||||
|
||||
var ngrams_IBM424_he_ltr = [64]uint32{
|
||||
0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
|
||||
0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
|
||||
0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
|
||||
0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
|
||||
}
|
||||
|
||||
func newRecognizer_IBM424_he(charset string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: charset,
|
||||
language: "he",
|
||||
charMap: &charMap_IBM424_he,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_IBM424_he_rtl() *recognizerSingleByte {
|
||||
return newRecognizer_IBM424_he("IBM424_rtl", &ngrams_IBM424_he_rtl)
|
||||
}
|
||||
|
||||
func newRecognizer_IBM424_he_ltr() *recognizerSingleByte {
|
||||
return newRecognizer_IBM424_he("IBM424_ltr", &ngrams_IBM424_he_ltr)
|
||||
}
|
||||
|
||||
var charMap_IBM420_ar = [256]byte{
|
||||
/* -0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -A -B -C -D -E -F */
|
||||
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
|
||||
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
|
||||
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
|
||||
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
|
||||
/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
|
||||
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
|
||||
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
|
||||
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
|
||||
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
|
||||
}
|
||||
|
||||
var ngrams_IBM420_ar_rtl = [64]uint32{
|
||||
0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
|
||||
0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
|
||||
0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
|
||||
0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
|
||||
}
|
||||
|
||||
var ngrams_IBM420_ar_ltr = [64]uint32{
|
||||
0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
|
||||
0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
|
||||
0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
|
||||
0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156,
|
||||
}
|
||||
|
||||
func newRecognizer_IBM420_ar(charset string, ngram *[64]uint32) *recognizerSingleByte {
|
||||
return &recognizerSingleByte{
|
||||
charset: charset,
|
||||
language: "ar",
|
||||
charMap: &charMap_IBM420_ar,
|
||||
ngram: ngram,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_IBM420_ar_rtl() *recognizerSingleByte {
|
||||
return newRecognizer_IBM420_ar("IBM420_rtl", &ngrams_IBM420_ar_rtl)
|
||||
}
|
||||
|
||||
func newRecognizer_IBM420_ar_ltr() *recognizerSingleByte {
|
||||
return newRecognizer_IBM420_ar("IBM420_ltr", &ngrams_IBM420_ar_ltr)
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
package chardet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
)
|
||||
|
||||
// Byte order marks checked at the start of the raw input by the UTF-16 and
// UTF-32 recognizers.

// utf16beBom is the two-byte prefix matched by the UTF-16BE recognizer.
var utf16beBom = []byte{0xfe, 0xff}

// utf16leBom is the two-byte prefix matched by the UTF-16LE recognizer. It is
// also the first half of utf32leBom.
var utf16leBom = []byte{0xff, 0xfe}

// utf32beBom is the four-byte prefix matched by the UTF-32BE recognizer.
var utf32beBom = []byte{0x00, 0x00, 0xfe, 0xff}

// utf32leBom is the four-byte prefix matched by the UTF-32LE recognizer.
var utf32leBom = []byte{0xff, 0xfe, 0x00, 0x00}
|
||||
|
||||
// recognizerUtf16be is the stateless recognizer for UTF-16BE input.
type recognizerUtf16be struct{}
|
||||
|
||||
func newRecognizer_utf16be() *recognizerUtf16be {
|
||||
return &recognizerUtf16be{}
|
||||
}
|
||||
|
||||
func (*recognizerUtf16be) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
output = recognizerOutput{
|
||||
Charset: "UTF-16BE",
|
||||
}
|
||||
if bytes.HasPrefix(input.raw, utf16beBom) {
|
||||
output.Confidence = 100
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// recognizerUtf16le is the stateless recognizer for UTF-16LE input.
type recognizerUtf16le struct{}
|
||||
|
||||
func newRecognizer_utf16le() *recognizerUtf16le {
|
||||
return &recognizerUtf16le{}
|
||||
}
|
||||
|
||||
func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
output = recognizerOutput{
|
||||
Charset: "UTF-16LE",
|
||||
}
|
||||
if bytes.HasPrefix(input.raw, utf16leBom) && !bytes.HasPrefix(input.raw, utf32leBom) {
|
||||
output.Confidence = 100
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// recognizerUtf32 recognizes one byte order of UTF-32.
//
// Field order matters: the constructors in this file build the struct with
// unkeyed composite literals.
type recognizerUtf32 struct {
	// name is the charset name reported by Match.
	name string
	// bom is the byte order mark Match looks for at the start of the input.
	bom []byte
	// decodeChar turns the first four bytes of its argument into one value.
	decodeChar func(input []byte) uint32
}
|
||||
|
||||
// decodeUtf32be assembles the first four bytes of input into a uint32, most
// significant byte first. It panics if input holds fewer than four bytes.
func decodeUtf32be(input []byte) uint32 {
	var c uint32
	for _, b := range input[:4:len(input)] {
		c = c<<8 | uint32(b)
	}
	return c
}
|
||||
|
||||
// decodeUtf32le assembles the first four bytes of input into a uint32, least
// significant byte first. It panics if input holds fewer than four bytes.
func decodeUtf32le(input []byte) uint32 {
	var c uint32
	// Walk from the most significant byte (index 3) down to index 0.
	for i := 3; i >= 0; i-- {
		c = c<<8 | uint32(input[i])
	}
	return c
}
|
||||
|
||||
func newRecognizer_utf32be() *recognizerUtf32 {
|
||||
return &recognizerUtf32{
|
||||
"UTF-32BE",
|
||||
utf32beBom,
|
||||
decodeUtf32be,
|
||||
}
|
||||
}
|
||||
|
||||
func newRecognizer_utf32le() *recognizerUtf32 {
|
||||
return &recognizerUtf32{
|
||||
"UTF-32LE",
|
||||
utf32leBom,
|
||||
decodeUtf32le,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
output = recognizerOutput{
|
||||
Charset: r.name,
|
||||
}
|
||||
hasBom := bytes.HasPrefix(input.raw, r.bom)
|
||||
var numValid, numInvalid uint32
|
||||
for b := input.raw; len(b) >= 4; b = b[4:] {
|
||||
if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
|
||||
numInvalid++
|
||||
} else {
|
||||
numValid++
|
||||
}
|
||||
}
|
||||
if hasBom && numInvalid == 0 {
|
||||
output.Confidence = 100
|
||||
} else if hasBom && numValid > numInvalid*10 {
|
||||
output.Confidence = 80
|
||||
} else if numValid > 3 && numInvalid == 0 {
|
||||
output.Confidence = 100
|
||||
} else if numValid > 0 && numInvalid == 0 {
|
||||
output.Confidence = 80
|
||||
} else if numValid > numInvalid*10 {
|
||||
output.Confidence = 25
|
||||
}
|
||||
return
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
package chardet
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
)
|
||||
|
||||
// utf8Bom is the three-byte sequence the UTF-8 recognizer looks for at the
// start of the input.
var utf8Bom = []byte{0xEF, 0xBB, 0xBF}
|
||||
|
||||
// recognizerUtf8 is the UTF-8 recognizer. It carries no state.
type recognizerUtf8 struct{}
|
||||
|
||||
func newRecognizer_utf8() *recognizerUtf8 {
|
||||
return &recognizerUtf8{}
|
||||
}
|
||||
|
||||
func (*recognizerUtf8) Match(input *recognizerInput) (output recognizerOutput) {
|
||||
output = recognizerOutput{
|
||||
Charset: "UTF-8",
|
||||
}
|
||||
hasBom := bytes.HasPrefix(input.raw, utf8Bom)
|
||||
inputLen := len(input.raw)
|
||||
var numValid, numInvalid uint32
|
||||
var trailBytes uint8
|
||||
for i := 0; i < inputLen; i++ {
|
||||
c := input.raw[i]
|
||||
if c&0x80 == 0 {
|
||||
continue
|
||||
}
|
||||
if c&0xE0 == 0xC0 {
|
||||
trailBytes = 1
|
||||
} else if c&0xF0 == 0xE0 {
|
||||
trailBytes = 2
|
||||
} else if c&0xF8 == 0xF0 {
|
||||
trailBytes = 3
|
||||
} else {
|
||||
numInvalid++
|
||||
if numInvalid > 5 {
|
||||
break
|
||||
}
|
||||
trailBytes = 0
|
||||
}
|
||||
|
||||
for i++; i < inputLen; i++ {
|
||||
c = input.raw[i]
|
||||
if c&0xC0 != 0x80 {
|
||||
numInvalid++
|
||||
break
|
||||
}
|
||||
if trailBytes--; trailBytes == 0 {
|
||||
numValid++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hasBom && numInvalid == 0 {
|
||||
output.Confidence = 100
|
||||
} else if hasBom && numValid > numInvalid*10 {
|
||||
output.Confidence = 80
|
||||
} else if numValid > 3 && numInvalid == 0 {
|
||||
output.Confidence = 100
|
||||
} else if numValid > 0 && numInvalid == 0 {
|
||||
output.Confidence = 80
|
||||
} else if numValid == 0 && numInvalid == 0 {
|
||||
// Plain ASCII
|
||||
output.Confidence = 10
|
||||
} else if numValid > numInvalid*10 {
|
||||
output.Confidence = 25
|
||||
}
|
||||
return
|
||||
}
|
|
@ -400,6 +400,14 @@
|
|||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/paulrosania/go-charset",
|
||||
"repository": "https://github.com/paulrosania/go-charset",
|
||||
"vcs": "git",
|
||||
"revision": "621bb39fcc835dce592e682f5073025d0169587b",
|
||||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/pborman/uuid",
|
||||
"repository": "https://github.com/pborman/uuid",
|
||||
|
@ -416,6 +424,14 @@
|
|||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/saintfish/chardet",
|
||||
"repository": "https://github.com/saintfish/chardet",
|
||||
"vcs": "git",
|
||||
"revision": "3af4cd4741ca4f3eb0c407c034571a6fb0ea529c",
|
||||
"branch": "master",
|
||||
"notests": true
|
||||
},
|
||||
{
|
||||
"importpath": "github.com/sorcix/irc",
|
||||
"repository": "https://github.com/sorcix/irc",
|
||||
|
|
Loading…
Reference in New Issue