220 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Go
		
	
	
	
			
		
		
	
	
			220 lines
		
	
	
		
			5.4 KiB
		
	
	
	
		
			Go
		
	
	
	
//  Copyright (c) 2015 Couchbase, Inc.
 | 
						||
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
 | 
						||
//  except in compliance with the License. You may obtain a copy of the License at
 | 
						||
//    http://www.apache.org/licenses/LICENSE-2.0
 | 
						||
//  Unless required by applicable law or agreed to in writing, software distributed under the
 | 
						||
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 | 
						||
//  either express or implied. See the License for the specific language governing permissions
 | 
						||
//  and limitations under the License.
 | 
						||
 | 
						||
// +build ignore
 | 
						||
 | 
						||
package main
 | 
						||
 | 
						||
import (
 | 
						||
	"bufio"
 | 
						||
	"bytes"
 | 
						||
	"flag"
 | 
						||
	"fmt"
 | 
						||
	"io"
 | 
						||
	"log"
 | 
						||
	"net/http"
 | 
						||
	"os"
 | 
						||
	"os/exec"
 | 
						||
	"strconv"
 | 
						||
	"strings"
 | 
						||
	"unicode"
 | 
						||
)
 | 
						||
 | 
						||
// Command-line flags and the shared writer that all generated text goes to.
var (
	// url is the base location of the Unicode data files; openReader appends
	// the file name to it directly, so it must end in a slash.
	url = flag.String(
		"url",
		"http://www.unicode.org/Public/"+unicode.Version+"/ucd/auxiliary/",
		"URL of Unicode database directory",
	)

	// verbose is registered as -verbose.
	verbose = flag.Bool(
		"verbose",
		false,
		"write data to stdout as it is parsed",
	)

	// localFiles makes openReader open the data files from disk instead of
	// fetching them over HTTP.
	localFiles = flag.Bool(
		"local",
		false,
		"data files have been copied to the current directory; for debugging only",
	)

	// outputFile names the destination file; empty means stdout.
	outputFile = flag.String(
		"output",
		"",
		"output file for generated tables; default stdout",
	)

	// output is the buffered destination for generated code. It is assigned
	// by setupOutput and flushed by flushOutput.
	output *bufio.Writer
)
 | 
						||
 | 
						||
func main() {
 | 
						||
	flag.Parse()
 | 
						||
	setupOutput()
 | 
						||
 | 
						||
	graphemeTests := make([]test, 0)
 | 
						||
	graphemeComments := make([]string, 0)
 | 
						||
	graphemeTests, graphemeComments = loadUnicodeData("GraphemeBreakTest.txt", graphemeTests, graphemeComments)
 | 
						||
	wordTests := make([]test, 0)
 | 
						||
	wordComments := make([]string, 0)
 | 
						||
	wordTests, wordComments = loadUnicodeData("WordBreakTest.txt", wordTests, wordComments)
 | 
						||
	sentenceTests := make([]test, 0)
 | 
						||
	sentenceComments := make([]string, 0)
 | 
						||
	sentenceTests, sentenceComments = loadUnicodeData("SentenceBreakTest.txt", sentenceTests, sentenceComments)
 | 
						||
 | 
						||
	fmt.Fprintf(output, fileHeader, *url)
 | 
						||
	generateTestTables("Grapheme", graphemeTests, graphemeComments)
 | 
						||
	generateTestTables("Word", wordTests, wordComments)
 | 
						||
	generateTestTables("Sentence", sentenceTests, sentenceComments)
 | 
						||
 | 
						||
	flushOutput()
 | 
						||
}
 | 
						||
 | 
						||
// WordBreakProperty.txt has the form:
 | 
						||
// 05F0..05F2    ; Hebrew_Letter # Lo   [3] HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW LIGATURE YIDDISH DOUBLE YOD
 | 
						||
// FB1D          ; Hebrew_Letter # Lo       HEBREW LETTER YOD WITH HIRIQ
 | 
						||
func openReader(file string) (input io.ReadCloser) {
 | 
						||
	if *localFiles {
 | 
						||
		f, err := os.Open(file)
 | 
						||
		if err != nil {
 | 
						||
			log.Fatal(err)
 | 
						||
		}
 | 
						||
		input = f
 | 
						||
	} else {
 | 
						||
		path := *url + file
 | 
						||
		resp, err := http.Get(path)
 | 
						||
		if err != nil {
 | 
						||
			log.Fatal(err)
 | 
						||
		}
 | 
						||
		if resp.StatusCode != 200 {
 | 
						||
			log.Fatal("bad GET status for "+file, resp.Status)
 | 
						||
		}
 | 
						||
		input = resp.Body
 | 
						||
	}
 | 
						||
	return
 | 
						||
}
 | 
						||
 | 
						||
func loadUnicodeData(filename string, tests []test, comments []string) ([]test, []string) {
 | 
						||
	f := openReader(filename)
 | 
						||
	defer f.Close()
 | 
						||
	bufioReader := bufio.NewReader(f)
 | 
						||
	line, err := bufioReader.ReadString('\n')
 | 
						||
	for err == nil {
 | 
						||
		tests, comments = parseLine(line, tests, comments)
 | 
						||
		line, err = bufioReader.ReadString('\n')
 | 
						||
	}
 | 
						||
	// if the err was EOF still need to process last value
 | 
						||
	if err == io.EOF {
 | 
						||
		tests, comments = parseLine(line, tests, comments)
 | 
						||
	}
 | 
						||
	return tests, comments
 | 
						||
}
 | 
						||
 | 
						||
// Markers recognised by parseLine in the break test data files.
const (
	comment = "#" // starts a comment that runs to the end of the line
	brk     = "÷" // separates one expected segment from the next
	nbrk    = "×" // separates code points inside a single segment
)

// test is one parsed test case: the expected segments in order, each stored
// as the UTF-8 encoding of its code points.
type test [][]byte
 | 
						||
 | 
						||
func parseLine(line string, tests []test, comments []string) ([]test, []string) {
 | 
						||
	if strings.HasPrefix(line, comment) {
 | 
						||
		return tests, comments
 | 
						||
	}
 | 
						||
	line = strings.TrimSpace(line)
 | 
						||
	if len(line) == 0 {
 | 
						||
		return tests, comments
 | 
						||
	}
 | 
						||
	commentStart := strings.Index(line, comment)
 | 
						||
	comment := strings.TrimSpace(line[commentStart+1:])
 | 
						||
	if commentStart > 0 {
 | 
						||
		line = line[0:commentStart]
 | 
						||
	}
 | 
						||
	pieces := strings.Split(line, brk)
 | 
						||
	t := make(test, 0)
 | 
						||
	for _, piece := range pieces {
 | 
						||
		piece = strings.TrimSpace(piece)
 | 
						||
		if len(piece) > 0 {
 | 
						||
			codePoints := strings.Split(piece, nbrk)
 | 
						||
			word := ""
 | 
						||
			for _, codePoint := range codePoints {
 | 
						||
				codePoint = strings.TrimSpace(codePoint)
 | 
						||
				r, err := strconv.ParseInt(codePoint, 16, 64)
 | 
						||
				if err != nil {
 | 
						||
					log.Printf("err: %v for '%s'", err, string(r))
 | 
						||
					return tests, comments
 | 
						||
				}
 | 
						||
 | 
						||
				word += string(r)
 | 
						||
			}
 | 
						||
			t = append(t, []byte(word))
 | 
						||
		}
 | 
						||
	}
 | 
						||
	tests = append(tests, t)
 | 
						||
	comments = append(comments, comment)
 | 
						||
	return tests, comments
 | 
						||
}
 | 
						||
 | 
						||
func generateTestTables(prefix string, tests []test, comments []string) {
 | 
						||
	fmt.Fprintf(output, testHeader, prefix)
 | 
						||
	for i, t := range tests {
 | 
						||
		fmt.Fprintf(output, "\t\t{\n")
 | 
						||
		fmt.Fprintf(output, "\t\t\tinput: %#v,\n", bytes.Join(t, []byte{}))
 | 
						||
		fmt.Fprintf(output, "\t\t\toutput: %s,\n", generateTest(t))
 | 
						||
		fmt.Fprintf(output, "\t\t\tcomment: `%s`,\n", comments[i])
 | 
						||
		fmt.Fprintf(output, "\t\t},\n")
 | 
						||
	}
 | 
						||
	fmt.Fprintf(output, "}\n")
 | 
						||
}
 | 
						||
 | 
						||
func generateTest(t test) string {
 | 
						||
	rv := "[][]byte{"
 | 
						||
	for _, te := range t {
 | 
						||
		rv += fmt.Sprintf("%#v,", te)
 | 
						||
	}
 | 
						||
	rv += "}"
 | 
						||
	return rv
 | 
						||
}
 | 
						||
 | 
						||
// fileHeader is the preamble written at the top of the generated file. Its
// single %s verb receives the data URL the tables were built from.
const fileHeader = `// Generated by running
//      maketesttables --url=%s
// DO NOT EDIT

package segment
`
 | 
						||
 | 
						||
// testHeader opens the declaration of one generated test table. Its single %s
// verb receives the table prefix, producing a variable named
// unicode<prefix>Tests.
const testHeader = `var unicode%sTests = []struct {
		input  []byte
		output [][]byte
		comment string
	}{
`
 | 
						||
 | 
						||
func setupOutput() {
 | 
						||
	output = bufio.NewWriter(startGofmt())
 | 
						||
}
 | 
						||
 | 
						||
// startGofmt connects output to a gofmt process if -output is set.
 | 
						||
func startGofmt() io.Writer {
 | 
						||
	if *outputFile == "" {
 | 
						||
		return os.Stdout
 | 
						||
	}
 | 
						||
	stdout, err := os.Create(*outputFile)
 | 
						||
	if err != nil {
 | 
						||
		log.Fatal(err)
 | 
						||
	}
 | 
						||
	// Pipe output to gofmt.
 | 
						||
	gofmt := exec.Command("gofmt")
 | 
						||
	fd, err := gofmt.StdinPipe()
 | 
						||
	if err != nil {
 | 
						||
		log.Fatal(err)
 | 
						||
	}
 | 
						||
	gofmt.Stdout = stdout
 | 
						||
	gofmt.Stderr = os.Stderr
 | 
						||
	err = gofmt.Start()
 | 
						||
	if err != nil {
 | 
						||
		log.Fatal(err)
 | 
						||
	}
 | 
						||
	return fd
 | 
						||
}
 | 
						||
 | 
						||
func flushOutput() {
 | 
						||
	err := output.Flush()
 | 
						||
	if err != nil {
 | 
						||
		log.Fatal(err)
 | 
						||
	}
 | 
						||
}
 |