route/vendor/github.com/apparentlymart/go-textseg/textseg/make_tables.go

308 lines
7.7 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// Modified by Martin Atkins to serve the needs of package textseg.
// +build ignore
package main
import (
"bufio"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"sort"
"strconv"
"strings"
"unicode"
)
var url = flag.String("url",
"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
"URL of Unicode database directory")
var verbose = flag.Bool("verbose",
false,
"write data to stdout as it is parsed")
var localFiles = flag.Bool("local",
false,
"data files have been copied to the current directory; for debugging only")
var outputFile = flag.String("output",
"",
"output file for generated tables; default stdout")
var output *bufio.Writer
func main() {
flag.Parse()
setupOutput()
graphemePropertyRanges := make(map[string]*unicode.RangeTable)
loadUnicodeData("GraphemeBreakProperty.txt", graphemePropertyRanges)
wordPropertyRanges := make(map[string]*unicode.RangeTable)
loadUnicodeData("WordBreakProperty.txt", wordPropertyRanges)
sentencePropertyRanges := make(map[string]*unicode.RangeTable)
loadUnicodeData("SentenceBreakProperty.txt", sentencePropertyRanges)
fmt.Fprintf(output, fileHeader, *url)
generateTables("Grapheme", graphemePropertyRanges)
generateTables("Word", wordPropertyRanges)
generateTables("Sentence", sentencePropertyRanges)
flushOutput()
}
// WordBreakProperty.txt has the form:
// 05F0..05F2 ; Hebrew_Letter # Lo [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
// FB1D ; Hebrew_Letter # Lo HEBREW LETTER YOD WITH HIRIQ
func openReader(file string) (input io.ReadCloser) {
if *localFiles {
f, err := os.Open(file)
if err != nil {
log.Fatal(err)
}
input = f
} else {
path := *url + file
resp, err := http.Get(path)
if err != nil {
log.Fatal(err)
}
if resp.StatusCode != 200 {
log.Fatal("bad GET status for "+file, resp.Status)
}
input = resp.Body
}
return
}
func loadUnicodeData(filename string, propertyRanges map[string]*unicode.RangeTable) {
f := openReader(filename)
defer f.Close()
bufioReader := bufio.NewReader(f)
line, err := bufioReader.ReadString('\n')
for err == nil {
parseLine(line, propertyRanges)
line, err = bufioReader.ReadString('\n')
}
// if the err was EOF still need to process last value
if err == io.EOF {
parseLine(line, propertyRanges)
}
}
const comment = "#"
const sep = ";"
const rnge = ".."
func parseLine(line string, propertyRanges map[string]*unicode.RangeTable) {
if strings.HasPrefix(line, comment) {
return
}
line = strings.TrimSpace(line)
if len(line) == 0 {
return
}
commentStart := strings.Index(line, comment)
if commentStart > 0 {
line = line[0:commentStart]
}
pieces := strings.Split(line, sep)
if len(pieces) != 2 {
log.Printf("unexpected %d pieces in %s", len(pieces), line)
return
}
propertyName := strings.TrimSpace(pieces[1])
rangeTable, ok := propertyRanges[propertyName]
if !ok {
rangeTable = &unicode.RangeTable{
LatinOffset: 0,
}
propertyRanges[propertyName] = rangeTable
}
codepointRange := strings.TrimSpace(pieces[0])
rngeIndex := strings.Index(codepointRange, rnge)
if rngeIndex < 0 {
// single codepoint, not range
codepointInt, err := strconv.ParseUint(codepointRange, 16, 64)
if err != nil {
log.Printf("error parsing int: %v", err)
return
}
if codepointInt < 0x10000 {
r16 := unicode.Range16{
Lo: uint16(codepointInt),
Hi: uint16(codepointInt),
Stride: 1,
}
addR16ToTable(rangeTable, r16)
} else {
r32 := unicode.Range32{
Lo: uint32(codepointInt),
Hi: uint32(codepointInt),
Stride: 1,
}
addR32ToTable(rangeTable, r32)
}
} else {
rngeStart := codepointRange[0:rngeIndex]
rngeEnd := codepointRange[rngeIndex+2:]
rngeStartInt, err := strconv.ParseUint(rngeStart, 16, 64)
if err != nil {
log.Printf("error parsing int: %v", err)
return
}
rngeEndInt, err := strconv.ParseUint(rngeEnd, 16, 64)
if err != nil {
log.Printf("error parsing int: %v", err)
return
}
if rngeStartInt < 0x10000 && rngeEndInt < 0x10000 {
r16 := unicode.Range16{
Lo: uint16(rngeStartInt),
Hi: uint16(rngeEndInt),
Stride: 1,
}
addR16ToTable(rangeTable, r16)
} else if rngeStartInt >= 0x10000 && rngeEndInt >= 0x10000 {
r32 := unicode.Range32{
Lo: uint32(rngeStartInt),
Hi: uint32(rngeEndInt),
Stride: 1,
}
addR32ToTable(rangeTable, r32)
} else {
log.Printf("unexpected range")
}
}
}
func addR16ToTable(r *unicode.RangeTable, r16 unicode.Range16) {
if r.R16 == nil {
r.R16 = make([]unicode.Range16, 0, 1)
}
r.R16 = append(r.R16, r16)
if r16.Hi <= unicode.MaxLatin1 {
r.LatinOffset++
}
}
func addR32ToTable(r *unicode.RangeTable, r32 unicode.Range32) {
if r.R32 == nil {
r.R32 = make([]unicode.Range32, 0, 1)
}
r.R32 = append(r.R32, r32)
}
func generateTables(prefix string, propertyRanges map[string]*unicode.RangeTable) {
prNames := make([]string, 0, len(propertyRanges))
for k := range propertyRanges {
prNames = append(prNames, k)
}
sort.Strings(prNames)
for _, key := range prNames {
rt := propertyRanges[key]
fmt.Fprintf(output, "var _%s%s = %s\n", prefix, key, generateRangeTable(rt))
}
fmt.Fprintf(output, "type _%sRuneRange unicode.RangeTable\n", prefix)
fmt.Fprintf(output, "func _%sRuneType(r rune) *_%sRuneRange {\n", prefix, prefix)
fmt.Fprintf(output, "\tswitch {\n")
for _, key := range prNames {
fmt.Fprintf(output, "\tcase unicode.Is(_%s%s, r):\n\t\treturn (*_%sRuneRange)(_%s%s)\n", prefix, key, prefix, prefix, key)
}
fmt.Fprintf(output, "\tdefault:\n\t\treturn nil\n")
fmt.Fprintf(output, "\t}\n")
fmt.Fprintf(output, "}\n")
fmt.Fprintf(output, "func (rng *_%sRuneRange) String() string {\n", prefix)
fmt.Fprintf(output, "\tswitch (*unicode.RangeTable)(rng) {\n")
for _, key := range prNames {
fmt.Fprintf(output, "\tcase _%s%s:\n\t\treturn %q\n", prefix, key, key)
}
fmt.Fprintf(output, "\tdefault:\n\t\treturn \"Other\"\n")
fmt.Fprintf(output, "\t}\n")
fmt.Fprintf(output, "}\n")
}
func generateRangeTable(rt *unicode.RangeTable) string {
rv := "&unicode.RangeTable{\n"
if rt.R16 != nil {
rv += "\tR16: []unicode.Range16{\n"
for _, r16 := range rt.R16 {
rv += fmt.Sprintf("\t\t%#v,\n", r16)
}
rv += "\t},\n"
}
if rt.R32 != nil {
rv += "\tR32: []unicode.Range32{\n"
for _, r32 := range rt.R32 {
rv += fmt.Sprintf("\t\t%#v,\n", r32)
}
rv += "\t},\n"
}
rv += fmt.Sprintf("\t\tLatinOffset: %d,\n", rt.LatinOffset)
rv += "}\n"
return rv
}
const fileHeader = `// Generated by running
// maketables --url=%s
// DO NOT EDIT
package textseg
import(
"unicode"
)
`
func setupOutput() {
output = bufio.NewWriter(startGofmt())
}
// startGofmt connects output to a gofmt process if -output is set.
func startGofmt() io.Writer {
if *outputFile == "" {
return os.Stdout
}
stdout, err := os.Create(*outputFile)
if err != nil {
log.Fatal(err)
}
// Pipe output to gofmt.
gofmt := exec.Command("gofmt")
fd, err := gofmt.StdinPipe()
if err != nil {
log.Fatal(err)
}
gofmt.Stdout = stdout
gofmt.Stderr = os.Stderr
err = gofmt.Start()
if err != nil {
log.Fatal(err)
}
return fd
}
func flushOutput() {
err := output.Flush()
if err != nil {
log.Fatal(err)
}
}