186 lines
4.6 KiB
Go
186 lines
4.6 KiB
Go
// Copyright 2014-2017 Ulrich Kunitz. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Package randtxt supports the generation of random text using a
|
|
// trigram model for the English language.
|
|
package randtxt
|
|
|
|
import (
|
|
"math"
|
|
"math/rand"
|
|
"sort"
|
|
)
|
|
|
|
// ngram is one entry of the language model: the n-gram text s together
// with two values, lgP and lgQ, that are base-2 logarithms of
// probabilities (they are turned back into probabilities with math.Exp2
// when distribution functions are built).
type ngram struct {
	s        string
	lgP, lgQ float64
}

// ngrams is a language model held as a slice of ngram entries. It
// implements sort.Interface, ordering the entries by their n-gram text.
type ngrams []ngram

// Len reports the number of entries in the model.
func (lm ngrams) Len() int {
	return len(lm)
}

// Swap exchanges the entries at positions i and j.
func (lm ngrams) Swap(i, j int) {
	lm[j], lm[i] = lm[i], lm[j]
}

// Less reports whether the n-gram at position i sorts before the n-gram
// at position j.
func (lm ngrams) Less(i, j int) bool {
	a, b := lm[i].s, lm[j].s
	return a < b
}

// Sort orders the language model by n-gram text.
func (lm ngrams) Sort() {
	sort.Sort(lm)
}

// Search returns the index of the first entry whose n-gram is not less
// than g, which is either the position of g or the position where g
// would have to be inserted. The model must already be sorted.
func (lm ngrams) Search(g string) int {
	notBefore := func(k int) bool { return lm[k].s >= g }
	return sort.Search(len(lm), notBefore)
}
|
|
|
|
// prob pairs a string, usually an n-gram, with a probability value.
type prob struct {
	s string
	p float64
}

// probs is a slice of prob values that can be sorted and searched. Its
// own sort.Interface methods order the slice by the string field s; the
// byProb wrapper orders it by the probability field p instead.
type probs []prob

// Len reports the number of entries in the slice.
func (ps probs) Len() int {
	return len(ps)
}

// Swap exchanges the entries at positions i and j.
func (ps probs) Swap(i, j int) {
	ps[j], ps[i] = ps[i], ps[j]
}

// Less reports whether the string at position i sorts before the string
// at position j.
func (ps probs) Less(i, j int) bool {
	a, b := ps[i].s, ps[j].s
	return a < b
}

// SortByNgram sorts the probs slice by n-gram, field s.
func (ps probs) SortByNgram() {
	sort.Sort(ps)
}

// SortByProb sorts the probs slice by probability, field p.
func (ps probs) SortByProb() {
	sort.Sort(byProb{probs: ps})
}

// SearchNgram returns the index of the first entry whose string is not
// less than g: the position of g or the position where it would be
// inserted. The slice must be sorted by n-gram.
func (ps probs) SearchNgram(g string) int {
	notBefore := func(k int) bool { return ps[k].s >= g }
	return sort.Search(len(ps), notBefore)
}

// SearchProb returns the index of the first entry whose probability is
// at least p: the position of p or the position where it would be
// inserted. The result is len(ps) if every entry is smaller than p. The
// slice must be sorted by probability.
func (ps probs) SearchProb(p float64) int {
	atLeast := func(k int) bool { return ps[k].p >= p }
	return sort.Search(len(ps), atLeast)
}

// byProb is used to sort a probs slice by probability, field p. It
// embeds probs to inherit Len and Swap and replaces only Less.
type byProb struct {
	probs
}

// Less reports whether the probability at position i is smaller than the
// probability at position j.
func (b byProb) Less(i, j int) bool {
	pi, pj := b.probs[i].p, b.probs[j].p
	return pi < pj
}
|
|
|
|
// cdf can be used to setup a cumulative distribution function
|
|
// represented by a probs slice. We should have returned an actual
|
|
// function.
|
|
func cdf(n int, p func(i int) prob) probs {
|
|
prs := make(probs, n)
|
|
sum := 0.0
|
|
for i := range prs {
|
|
pr := p(i)
|
|
sum += pr.p
|
|
prs[i] = pr
|
|
}
|
|
q := 1.0 / sum
|
|
x := 0.0
|
|
for i, pr := range prs {
|
|
x += pr.p * q
|
|
if x > 1.0 {
|
|
x = 1.0
|
|
}
|
|
prs[i].p = x
|
|
}
|
|
if !sort.IsSorted(byProb{prs}) {
|
|
panic("cdf not sorted")
|
|
}
|
|
return prs
|
|
}
|
|
|
|
// pCDFOfLM converts a language model into a cumulative distribution
|
|
// function represented by probs.
|
|
func pCDFOfLM(lm ngrams) probs {
|
|
return cdf(len(lm), func(i int) prob {
|
|
return prob{lm[i].s, math.Exp2(lm[i].lgP)}
|
|
})
|
|
}
|
|
|
|
// cCDF converts a ngrams slice into a cumulative distribution function
|
|
// using the conditional probability lgQ.
|
|
func cCDF(s ngrams) probs {
|
|
return cdf(len(s), func(i int) prob {
|
|
return prob{s[i].s, math.Exp2(s[i].lgQ)}
|
|
})
|
|
}
|
|
|
|
// comap contains a map of conditional distribution function for the
|
|
// last character.
|
|
type comap map[string]probs
|
|
|
|
// comapOfLM converts a language model in a map of conditional
|
|
// distribution functions.
|
|
func comapOfLM(lm ngrams) comap {
|
|
if !sort.IsSorted(lm) {
|
|
panic("lm is not sorted")
|
|
}
|
|
m := make(comap, 26*26)
|
|
for i := 0; i < len(lm); {
|
|
j := i
|
|
g := lm[i].s
|
|
g2 := g[:2]
|
|
z := g2 + "Z"
|
|
i = lm.Search(z)
|
|
if i >= len(lm) || lm[i].s != z {
|
|
panic("unexpected search result")
|
|
}
|
|
i++
|
|
m[g2] = cCDF(lm[j:i])
|
|
}
|
|
return m
|
|
}
|
|
|
|
// trigram returns the trigram with prefix g2 using a probability value
|
|
// in the range [0.0,1.0).
|
|
func (c comap) trigram(g2 string, p float64) string {
|
|
prs := c[g2]
|
|
i := prs.SearchProb(p)
|
|
return prs[i].s
|
|
}
|
|
|
|
// Lookup tables computed once, during package initialization, from
// englm3, which is declared outside this part of the file (presumably
// the English trigram language model; confirm at its declaration).
var (
|
|
// pcdf is the result of pCDFOfLM for englm3: the cumulative
// distribution function built from the lgP value of every entry.
|
|
pcdf = pCDFOfLM(englm3)
|
|
// cmap is the result of comapOfLM for englm3: a map from each
// two-letter prefix to its conditional distribution function.
|
|
cmap = comapOfLM(englm3)
|
|
)
|
|
|
|
// Reader generates a stream of text of uppercase letters with trigrams
|
|
// distributed according to a language model of the English language.
|
|
type Reader struct {
|
|
rnd *rand.Rand
|
|
g3 string
|
|
}
|
|
|
|
// NewReader creates a new reader. The argument src must create a uniformly
|
|
// distributed stream of random values.
|
|
func NewReader(src rand.Source) *Reader {
|
|
rnd := rand.New(src)
|
|
i := pcdf.SearchProb(rnd.Float64())
|
|
return &Reader{rnd, pcdf[i].s}
|
|
}
|
|
|
|
// Read reads random text. The Read function will always return len(p)
|
|
// bytes and will never return an error.
|
|
func (r *Reader) Read(p []byte) (n int, err error) {
|
|
for i := range p {
|
|
r.g3 = cmap.trigram(r.g3[1:], r.rnd.Float64())
|
|
p[i] = r.g3[2]
|
|
}
|
|
return len(p), nil
|
|
}
|