route/vendor/github.com/ulikunitz/xz/internal/randtxt/probs.go

186 lines
4.6 KiB
Go

// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package randtxt supports the generation of random text using a
// trigram model for the English language.
package randtxt
import (
"math"
"math/rand"
"sort"
)
// ngram stores an entry from the language model.
type ngram struct {
s string
lgP float64
lgQ float64
}
// ngrams represents a slice of ngram values and is used to represent a
// language model.
type ngrams []ngram
func (s ngrams) Len() int { return len(s) }
func (s ngrams) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s ngrams) Less(i, j int) bool { return s[i].s < s[j].s }
// Sorts the language model in the sequence of their ngrams.
func (s ngrams) Sort() { sort.Sort(s) }
// Search is looking for an ngram or the position where it would be
// inserted.
func (s ngrams) Search(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// prob represents a string, usually an ngram, and a probability value.
type prob struct {
s string
p float64
}
// probs is a slice of prob values that can be sorted and searched.
type probs []prob
func (s probs) Len() int { return len(s) }
func (s probs) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s probs) Less(i, j int) bool { return s[i].s < s[j].s }
// SortByNgram sorts the probs slice by ngram, field s.
func (s probs) SortByNgram() { sort.Sort(s) }
// SortsByProb sorts the probs slice by probability, field p.
func (s probs) SortByProb() { sort.Sort(byProb{s}) }
// SearchNgram searches for an ngram or the position where it would be
// inserted.
func (s probs) SearchNgram(g string) int {
return sort.Search(len(s), func(k int) bool { return s[k].s >= g })
}
// SearchProb searches ngrams for a specific probability or where it
// would be inserted.
func (s probs) SearchProb(p float64) int {
return sort.Search(len(s), func(k int) bool { return s[k].p >= p })
}
// byProb is used to sort probs slice by probability, field p.
type byProb struct {
probs
}
func (s byProb) Less(i, j int) bool {
return s.probs[i].p < s.probs[j].p
}
// cdf can be used to setup a cumulative distribution function
// represented by a probs slice. We should have returned an actual
// function.
func cdf(n int, p func(i int) prob) probs {
prs := make(probs, n)
sum := 0.0
for i := range prs {
pr := p(i)
sum += pr.p
prs[i] = pr
}
q := 1.0 / sum
x := 0.0
for i, pr := range prs {
x += pr.p * q
if x > 1.0 {
x = 1.0
}
prs[i].p = x
}
if !sort.IsSorted(byProb{prs}) {
panic("cdf not sorted")
}
return prs
}
// pCDFOfLM converts a language model into a cumulative distribution
// function represented by probs.
func pCDFOfLM(lm ngrams) probs {
return cdf(len(lm), func(i int) prob {
return prob{lm[i].s, math.Exp2(lm[i].lgP)}
})
}
// cCDF converts a ngrams slice into a cumulative distribution function
// using the conditional probability lgQ.
func cCDF(s ngrams) probs {
return cdf(len(s), func(i int) prob {
return prob{s[i].s, math.Exp2(s[i].lgQ)}
})
}
// comap contains a map of conditional distribution function for the
// last character.
type comap map[string]probs
// comapOfLM converts a language model in a map of conditional
// distribution functions.
func comapOfLM(lm ngrams) comap {
if !sort.IsSorted(lm) {
panic("lm is not sorted")
}
m := make(comap, 26*26)
for i := 0; i < len(lm); {
j := i
g := lm[i].s
g2 := g[:2]
z := g2 + "Z"
i = lm.Search(z)
if i >= len(lm) || lm[i].s != z {
panic("unexpected search result")
}
i++
m[g2] = cCDF(lm[j:i])
}
return m
}
// trigram returns the trigram with prefix g2 using a probability value
// in the range [0.0,1.0).
func (c comap) trigram(g2 string, p float64) string {
prs := c[g2]
i := prs.SearchProb(p)
return prs[i].s
}
var (
// CDF for normal probabilities
pcdf = pCDFOfLM(englm3)
// map of two letter conditionals
cmap = comapOfLM(englm3)
)
// Reader generates a stream of text of uppercase letters with trigrams
// distributed according to a language model of the English language.
type Reader struct {
rnd *rand.Rand
g3 string
}
// NewReader creates a new reader. The argument src must create a uniformly
// distributed stream of random values.
func NewReader(src rand.Source) *Reader {
rnd := rand.New(src)
i := pcdf.SearchProb(rnd.Float64())
return &Reader{rnd, pcdf[i].s}
}
// Read reads random text. The Read function will always return len(p)
// bytes and will never return an error.
func (r *Reader) Read(p []byte) (n int, err error) {
for i := range p {
r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
p[i] = r.g3[2]
}
return len(p), nil
}