techknowlogick 2021-02-28 18:08:33 -05:00 committed by GitHub
parent 030646eea4
commit 47f6a4ec3f
947 changed files with 26119 additions and 7062 deletions


@ -33,6 +33,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"unsafe"
@ -122,6 +123,8 @@ func (b *Bucket) Do(k string, f func(mc *memcached.Client, vb uint16) error) (er
}
func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, deadline bool) (err error) {
var lastError error
if SlowServerCallWarningThreshold > 0 {
defer slowLog(time.Now(), "call to Do(%q)", k)
}
@ -131,7 +134,7 @@ func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, de
for i := 0; i < maxTries; i++ {
conn, pool, err := b.getConnectionToVBucket(vb)
if err != nil {
if isConnError(err) && backOff(i, maxTries, backOffDuration, true) {
if (err == errNoPool || isConnError(err)) && backOff(i, maxTries, backOffDuration, true) {
b.Refresh()
continue
}
@ -143,13 +146,13 @@ func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, de
} else {
conn.SetDeadline(noDeadline)
}
err = f(conn, uint16(vb))
lastError = f(conn, uint16(vb))
var retry bool
discard := isOutOfBoundsError(err)
retry := false
discard := isOutOfBoundsError(lastError) || IsReadTimeOutError(lastError)
// MB-30967 / MB-31001 implement back off for transient errors
if resp, ok := err.(*gomemcached.MCResponse); ok {
if resp, ok := lastError.(*gomemcached.MCResponse); ok {
switch resp.Status {
case gomemcached.NOT_MY_VBUCKET:
b.Refresh()
@ -162,12 +165,10 @@ func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, de
retry = true
case gomemcached.ENOMEM:
fallthrough
case gomemcached.TMPFAIL:
case gomemcached.TMPFAIL, gomemcached.EBUSY:
retry = backOff(i, maxTries, backOffDuration, true)
default:
retry = false
}
} else if err != nil && isConnError(err) && backOff(i, maxTries, backOffDuration, true) {
} else if lastError != nil && isConnError(lastError) && backOff(i, maxTries, backOffDuration, true) {
retry = true
}
@ -178,11 +179,11 @@ func (b *Bucket) Do2(k string, f func(mc *memcached.Client, vb uint16) error, de
}
if !retry {
return err
return lastError
}
}
return fmt.Errorf("unable to complete action after %v attemps", maxTries)
return fmt.Errorf("unable to complete action after %v attempts: %v", maxTries, lastError)
}
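The retry loop above now treats EBUSY like TMPFAIL and backs off before retrying, and errNoPool joins connection errors as a reason to refresh the bucket. The backOff helper itself is not part of this hunk; a minimal sketch of a helper with that signature, assuming exponential doubling from a base duration, might look like:

// Sketch only: the real backOff in this package may differ.
func backOffSketch(attempt, maxTries int, base time.Duration, exponential bool) bool {
    if attempt < 0 || attempt >= maxTries-1 {
        return false // out of retries; caller should give up
    }
    d := base
    if exponential {
        d = base << uint(attempt) // base, 2*base, 4*base, ...
    }
    time.Sleep(d)
    return true // slept; caller may retry
}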
type GatheredStats struct {
@ -211,6 +212,20 @@ func getStatsParallel(sn string, b *Bucket, offset int, which string,
}
}
func getStatsParallelFunc(fn func(key, val []byte), sn string, b *Bucket, offset int, which string,
ch chan<- GatheredStats) {
pool := b.getConnPool(offset)
conn, err := pool.Get()
if err == nil {
conn.SetDeadline(getDeadline(time.Time{}, DefaultTimeout))
err = conn.StatsFunc(which, fn)
pool.Return(conn)
}
ch <- GatheredStats{Server: sn, Err: err}
}
// GetStats gets a set of stats from all servers.
//
// Returns a map of server ID -> map of stat key to stat value.
@ -246,6 +261,108 @@ func (b *Bucket) GatherStats(which string) map[string]GatheredStats {
return rv
}
// GatherStatsFunc calls fn with the stats from all servers and returns a map of server ID -> error for any server that failed.
func (b *Bucket) GatherStatsFunc(which string, fn func(key, val []byte)) map[string]error {
var errMap map[string]error
vsm := b.VBServerMap()
if vsm.ServerList == nil {
return errMap
}
// Go grab all the things at once.
ch := make(chan GatheredStats, len(vsm.ServerList))
for i, sn := range vsm.ServerList {
go getStatsParallelFunc(fn, sn, b, i, which, ch)
}
// Gather the results
for range vsm.ServerList {
gs := <-ch
if gs.Err != nil {
if errMap == nil {
errMap = make(map[string]error)
}
errMap[gs.Server] = gs.Err
}
}
return errMap
}
type BucketStats int
const (
StatCount = BucketStats(iota)
StatSize
)
var bucketStatString = []string{
"curr_items",
"ep_value_size",
}
var collectionStatString = []string{
"items",
"disk_size",
}
// GetIntStats gets selected bucket or collection stats as integers.
func (b *Bucket) GetIntStats(refresh bool, which []BucketStats, context ...*memcached.ClientContext) ([]int64, error) {
if refresh {
b.Refresh()
}
var vals []int64 = make([]int64, len(which))
if len(vals) == 0 {
return vals, nil
}
var outErr error
if len(context) > 0 {
collKey := fmt.Sprintf("collections-byid 0x%x", context[0].CollId)
errs := b.GatherStatsFunc(collKey, func(key, val []byte) {
for i, f := range which {
lk := len(key)
ls := len(collectionStatString[f])
if lk >= ls && string(key[lk-ls:]) == collectionStatString[f] {
v, err := strconv.ParseInt(string(val), 10, 64)
if err == nil {
atomic.AddInt64(&vals[i], v)
} else if outErr == nil {
outErr = err
}
}
}
})
// ranging is the only way to pull a single element out of a map; return the first error, if any
for _, err := range errs {
return nil, err
}
} else {
errs := b.GatherStatsFunc("", func(key, val []byte) {
for i, f := range which {
if string(key) == bucketStatString[f] {
v, err := strconv.ParseInt(string(val), 10, 64)
if err == nil {
atomic.AddInt64(&vals[i], v)
} else if outErr == nil {
outErr = err
}
}
}
})
// ranging is the only way to pull a single element out of a map; return the first error, if any
for _, err := range errs {
return nil, err
}
}
return vals, outErr
}
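A hedged usage sketch of GetIntStats: bucket and the collection id 0x8 are placeholders, and per the tables above StatCount/StatSize map to items/disk_size for collections and curr_items/ep_value_size for buckets.

stats, err := bucket.GetIntStats(false, []BucketStats{StatCount, StatSize},
    &memcached.ClientContext{CollId: 0x8})
if err != nil {
    return err
}
fmt.Printf("items=%d disk=%d\n", stats[0], stats[1])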
// Get bucket count through the bucket stats
func (b *Bucket) GetCount(refresh bool, context ...*memcached.ClientContext) (count int64, err error) {
if refresh {
@ -351,6 +468,9 @@ func isAuthError(err error) bool {
}
func IsReadTimeOutError(err error) bool {
if err == nil {
return false
}
estr := err.Error()
return strings.Contains(estr, "read tcp") ||
strings.Contains(estr, "i/o timeout")
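The new nil guard means callers no longer need to pre-check the error. For illustration (the error string below mimics a typical Go net timeout, not output from this library):

err := errors.New("read tcp 127.0.0.1:52100->127.0.0.1:11210: i/o timeout")
fmt.Println(IsReadTimeOutError(err)) // true
fmt.Println(IsReadTimeOutError(nil)) // false, thanks to the nil guard

Note that matching on "read tcp" means any read-side TCP error is classified as a timeout, which is what the bulk-get retry path below relies on.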
@ -456,6 +576,21 @@ func (b *Bucket) doBulkGet(vb uint16, keys []string, reqDeadline time.Time,
}
b.Refresh()
backOffAttempts++
} else if err == errNoPool {
if !backOff(backOffAttempts, MaxBackOffRetries, backOffDuration, true) {
logging.Errorf("Connection Error %v : %v", bname, err)
ech <- err
return err
}
err = b.Refresh()
if err != nil {
ech <- err
return err
}
backOffAttempts++
// retry, and make no noise
return nil
}
logging.Infof("Pool Get returned %v: %v", bname, err)
// retry
@ -498,8 +633,8 @@ func (b *Bucket) doBulkGet(vb uint16, keys []string, reqDeadline time.Time,
ech <- err
return err
case error:
if isOutOfBoundsError(err) {
// We got an out of bound error, retry the operation
if isOutOfBoundsError(err) || IsReadTimeOutError(err) {
// We got an out of bounds error or a read timeout error; retry the operation
discard = true
return nil
} else if isConnError(err) && backOff(backOffAttempts, MaxBackOffRetries, backOffDuration, true) {
@ -816,6 +951,14 @@ var ErrKeyExists = errors.New("key exists")
func (b *Bucket) Write(k string, flags, exp int, v interface{},
opt WriteOptions, context ...*memcached.ClientContext) (err error) {
_, err = b.WriteWithCAS(k, flags, exp, v, opt, context...)
return err
}
func (b *Bucket) WriteWithCAS(k string, flags, exp int, v interface{},
opt WriteOptions, context ...*memcached.ClientContext) (cas uint64, err error) {
if ClientOpCallback != nil {
defer func(t time.Time) {
ClientOpCallback(fmt.Sprintf("Write(%v)", opt), k, t, err)
@ -826,7 +969,7 @@ func (b *Bucket) Write(k string, flags, exp int, v interface{},
if opt&Raw == 0 {
data, err = json.Marshal(v)
if err != nil {
return err
return cas, err
}
} else if v != nil {
data = v.([]byte)
@ -852,14 +995,18 @@ func (b *Bucket) Write(k string, flags, exp int, v interface{},
res, err = mc.Set(vb, k, flags, exp, data, context...)
}
if err == nil {
cas = res.Cas
}
return err
})
if err == nil && (opt&(Persist|Indexable) != 0) {
err = b.WaitForPersistence(k, res.Cas, data == nil)
err = b.WaitForPersistence(k, cas, data == nil)
}
return err
return cas, err
}
func (b *Bucket) WriteWithMT(k string, flags, exp int, v interface{},
@ -1018,6 +1165,11 @@ func (b *Bucket) Set(k string, exp int, v interface{}, context ...*memcached.Cli
return b.Write(k, 0, exp, v, 0, context...)
}
// Set a value in this bucket and return the CAS value.
func (b *Bucket) SetWithCAS(k string, exp int, v interface{}, context ...*memcached.ClientContext) (uint64, error) {
return b.WriteWithCAS(k, 0, exp, v, 0, context...)
}
// Set a value in this bucket with flags
func (b *Bucket) SetWithMeta(k string, flags int, exp int, v interface{}, context ...*memcached.ClientContext) (*MutationToken, error) {
return b.WriteWithMT(k, flags, exp, v, 0, context...)
@ -1039,6 +1191,16 @@ func (b *Bucket) Add(k string, exp int, v interface{}, context ...*memcached.Cli
return (err == nil), err
}
// AddWithCAS adds a value to this bucket; like Set except that nothing
// happens if the key exists. It also returns the CAS value.
func (b *Bucket) AddWithCAS(k string, exp int, v interface{}, context ...*memcached.ClientContext) (bool, uint64, error) {
cas, err := b.WriteWithCAS(k, 0, exp, v, AddOnly, context...)
if err == ErrKeyExists {
return false, 0, nil
}
return (err == nil), cas, err
}
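A hedged sketch of optimistic concurrency with the new *WithCAS helpers; bucket, key, and doc are placeholders:

added, cas, err := bucket.AddWithCAS("key", 0, doc)
if err != nil {
    return err
}
if !added { // key already existed: overwrite and capture the new CAS
    cas, err = bucket.SetWithCAS("key", 0, doc)
    if err != nil {
        return err
    }
}
_ = cas // feed the CAS into compare-and-swap style updates later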
// AddRaw adds a value to this bucket; like SetRaw except that nothing
// happens if the key exists. The value will be stored as raw bytes.
func (b *Bucket) AddRaw(k string, exp int, v []byte, context ...*memcached.ClientContext) (added bool, err error) {


@ -510,9 +510,11 @@ func (b *Bucket) GetRandomDoc(context ...*memcached.ClientContext) (*gomemcached
// We may need to select the bucket before GetRandomDoc()
// will work. This is sometimes done at startup (see defaultMkConn())
// but not always, depending on the auth type.
_, err = conn.SelectBucket(b.Name)
if err != nil {
return nil, err
if conn.LastBucket() != b.Name {
_, err = conn.SelectBucket(b.Name)
if err != nil {
return nil, err
}
}
// get a random document from the connection
@ -533,7 +535,7 @@ func (b *Bucket) CreateScope(scope string) error {
client := pool.client
b.RUnlock()
args := map[string]interface{}{"name": scope}
return client.parsePostURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/collections", args, nil)
return client.parsePostURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/scopes", args, nil)
}
func (b *Bucket) DropScope(scope string) error {
@ -541,7 +543,7 @@ func (b *Bucket) DropScope(scope string) error {
pool := b.pool
client := pool.client
b.RUnlock()
return client.parseDeleteURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/collections/"+uriAdj(scope), nil, nil)
return client.parseDeleteURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/scopes/"+uriAdj(scope), nil, nil)
}
func (b *Bucket) CreateCollection(scope string, collection string) error {
@ -550,7 +552,7 @@ func (b *Bucket) CreateCollection(scope string, collection string) error {
client := pool.client
b.RUnlock()
args := map[string]interface{}{"name": collection}
return client.parsePostURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/collections/"+uriAdj(scope), args, nil)
return client.parsePostURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/scopes/"+uriAdj(scope)+"/collections", args, nil)
}
func (b *Bucket) DropCollection(scope string, collection string) error {
@ -558,7 +560,7 @@ func (b *Bucket) DropCollection(scope string, collection string) error {
pool := b.pool
client := pool.client
b.RUnlock()
return client.parseDeleteURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/collections/"+uriAdj(scope)+"/"+uriAdj(collection), nil, nil)
return client.parseDeleteURLResponseTerse("/pools/default/buckets/"+uriAdj(b.Name)+"/scopes/"+uriAdj(scope)+"/collections/"+uriAdj(collection), nil, nil)
}
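These hunks move scope and collection management from the legacy /collections paths onto the /scopes hierarchy. A hedged usage sketch (bucket is a placeholder):

// POST /pools/default/buckets/<bucket>/scopes                   {"name": "app"}
// POST /pools/default/buckets/<bucket>/scopes/app/collections   {"name": "users"}
if err := bucket.CreateScope("app"); err != nil {
    return err
}
if err := bucket.CreateCollection("app", "users"); err != nil {
    return err
}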
func (b *Bucket) FlushCollection(scope string, collection string) error {
@ -703,7 +705,8 @@ func doHTTPRequestForStreaming(req *http.Request) (*http.Response, error) {
if skipVerify {
tr = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
MaxIdleConnsPerHost: MaxIdleConnsPerHost,
}
} else {
// Handle cases with cert
@ -714,7 +717,8 @@ func doHTTPRequestForStreaming(req *http.Request) (*http.Response, error) {
}
tr = &http.Transport{
TLSClientConfig: cfg,
TLSClientConfig: cfg,
MaxIdleConnsPerHost: MaxIdleConnsPerHost,
}
}
@ -751,7 +755,8 @@ func doHTTPRequest(req *http.Request) (*http.Response, error) {
if skipVerify {
tr = &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
MaxIdleConnsPerHost: MaxIdleConnsPerHost,
}
} else {
// Handle cases with cert
@ -762,11 +767,12 @@ func doHTTPRequest(req *http.Request) (*http.Response, error) {
}
tr = &http.Transport{
TLSClientConfig: cfg,
TLSClientConfig: cfg,
MaxIdleConnsPerHost: MaxIdleConnsPerHost,
}
}
client = &http.Client{Transport: tr}
client = &http.Client{Transport: tr, Timeout: ClientTimeOut}
} else if client == nil {
client = HTTPClient
@ -1346,6 +1352,10 @@ func (b *Bucket) GetCollectionsManifest() (*Manifest, error) {
b.RLock()
pools := b.getConnPools(true /* already locked */)
if len(pools) == 0 {
b.RUnlock()
return nil, fmt.Errorf("Unable to get connection to retrieve collections manifest: no connection pool. No collections access to bucket %s.", b.Name)
}
pool := pools[0] // Any pool will do, so use the first one.
b.RUnlock()
client, err := pool.Get()


@ -6,7 +6,6 @@ import (
"github.com/couchbase/goutils/logging"
"io"
"io/ioutil"
"math/rand"
"net"
"net/http"
"time"
@ -109,10 +108,7 @@ func (b *Bucket) UpdateBucket2(streamingFn StreamingFn) error {
return fmt.Errorf("No healthy nodes found")
}
startNode := rand.Intn(len(nodes))
node := nodes[(startNode)%len(nodes)]
streamUrl := fmt.Sprintf("http://%s/pools/default/bucketsStreaming/%s", node.Hostname, uriAdj(b.GetName()))
streamUrl := fmt.Sprintf("%s/pools/default/bucketsStreaming/%s", b.pool.client.BaseURL, uriAdj(b.GetName()))
logging.Infof(" Trying with %s", streamUrl)
req, err := http.NewRequest("GET", streamUrl, nil)
if err != nil {


@ -44,6 +44,7 @@ type ClientIface interface {
GetSubdoc(vb uint16, key string, subPaths []string, context ...*ClientContext) (*gomemcached.MCResponse, error)
Hijack() io.ReadWriteCloser
Incr(vb uint16, key string, amt, def uint64, exp int, context ...*ClientContext) (uint64, error)
LastBucket() string
Observe(vb uint16, key string) (result ObserveResult, err error)
ObserveSeq(vb uint16, vbuuid uint64) (result *ObserveSeqResult, err error)
Receive() (*gomemcached.MCResponse, error)
@ -56,6 +57,7 @@ type ClientIface interface {
SelectBucket(bucket string) (*gomemcached.MCResponse, error)
SetCas(vb uint16, key string, flags int, exp int, cas uint64, body []byte, context ...*ClientContext) (*gomemcached.MCResponse, error)
Stats(key string) ([]StatValue, error)
StatsFunc(key string, fn func(key, val []byte)) error
StatsMap(key string) (map[string]string, error)
StatsMapForSpecifiedStats(key string, statsMap map[string]string) error
Transmit(req *gomemcached.MCRequest) error
@ -74,6 +76,9 @@ type ClientContext struct {
// Collection-based context
CollId uint32
// Impersonate context
User string
// VB-state related context
// nil means not used in this context
VbState *VbStateType
@ -147,6 +152,7 @@ type Client struct {
collectionsEnabled uint32
deadline time.Time
bucket string
}
var (
@ -206,6 +212,13 @@ func (c *Client) SetDeadline(t time.Time) {
c.deadline = t
}
func (c *Client) getOpaque() uint32 {
if c.opaque >= math.MaxInt32 {
c.opaque = uint32(1)
}
return c.opaque + 1
}
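Requests now carry a non-zero Opaque, which the server echoes back so responses can be correlated with requests. The guard above wraps the counter before it exceeds MaxInt32; a behavior sketch (only meaningful inside this package, since opaque is unexported):

c := &Client{opaque: math.MaxInt32}
fmt.Println(c.getOpaque()) // 2: the counter resets to 1, and 1 is added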
// Wrap an existing transport.
func Wrap(conn memcachedConnection) (rv *Client, err error) {
client := &Client{
@ -356,12 +369,21 @@ func (c *Client) EnableFeatures(features Features) (*gomemcached.MCResponse, err
return rv, err
}
// Sets collection info for a request
func (c *Client) setCollection(req *gomemcached.MCRequest, context ...*ClientContext) error {
// Sets collection and user info for a request
func (c *Client) setContext(req *gomemcached.MCRequest, context ...*ClientContext) error {
req.CollIdLen = 0
req.UserLen = 0
collectionId := uint32(0)
if len(context) > 0 {
collectionId = context[0].CollId
uLen := len(context[0].User)
if uLen > 0 {
if uLen > gomemcached.MAX_USER_LEN {
uLen = gomemcached.MAX_USER_LEN
}
req.UserLen = uLen
copy(req.Username[:uLen], context[0].User)
}
}
// if the optional collection is specified, it must be default for clients that haven't turned on collections
@ -376,10 +398,16 @@ func (c *Client) setCollection(req *gomemcached.MCRequest, context ...*ClientCon
}
// Sets collection info in extras
func (c *Client) setExtrasCollection(req *gomemcached.MCRequest, context ...*ClientContext) error {
func (c *Client) setExtrasContext(req *gomemcached.MCRequest, context ...*ClientContext) error {
collectionId := uint32(0)
req.UserLen = 0
if len(context) > 0 {
collectionId = context[0].CollId
uLen := len(context[0].User)
if uLen > 0 {
req.UserLen = uLen
copy(req.Username[:], context[0].User)
}
}
// if the optional collection is specified, it must be default for clients that haven't turned on collections
@ -426,8 +454,9 @@ func (c *Client) Get(vb uint16, key string, context ...*ClientContext) (*gomemca
Opcode: gomemcached.GET,
VBucket: vb,
Key: []byte(key),
Opaque: c.getOpaque(),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -443,8 +472,9 @@ func (c *Client) GetSubdoc(vb uint16, key string, subPaths []string, context ...
Key: []byte(key),
Extras: extraBuf,
Body: valueBuf,
Opaque: c.getOpaque(),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -462,6 +492,7 @@ func (c *Client) GetCollectionsManifest() (*gomemcached.MCResponse, error) {
res, err := c.Send(&gomemcached.MCRequest{
Opcode: gomemcached.GET_COLLECTIONS_MANIFEST,
Opaque: c.getOpaque(),
})
if err != nil && IfResStatusError(res) {
@ -476,6 +507,7 @@ func (c *Client) CollectionsGetCID(scope string, collection string) (*gomemcache
res, err := c.Send(&gomemcached.MCRequest{
Opcode: gomemcached.COLLECTIONS_GET_CID,
Key: []byte(scope + "." + collection),
Opaque: c.getOpaque(),
})
if err != nil && IfResStatusError(res) {
@ -497,8 +529,9 @@ func (c *Client) GetAndTouch(vb uint16, key string, exp int, context ...*ClientC
VBucket: vb,
Key: []byte(key),
Extras: extraBuf,
Opaque: c.getOpaque(),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -511,8 +544,9 @@ func (c *Client) GetMeta(vb uint16, key string, context ...*ClientContext) (*gom
Opcode: gomemcached.GET_META,
VBucket: vb,
Key: []byte(key),
Opaque: c.getOpaque(),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -525,8 +559,9 @@ func (c *Client) Del(vb uint16, key string, context ...*ClientContext) (*gomemca
Opcode: gomemcached.DELETE,
VBucket: vb,
Key: []byte(key),
Opaque: c.getOpaque(),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -537,8 +572,9 @@ func (c *Client) Del(vb uint16, key string, context ...*ClientContext) (*gomemca
func (c *Client) GetRandomDoc(context ...*ClientContext) (*gomemcached.MCResponse, error) {
req := &gomemcached.MCRequest{
Opcode: 0xB6,
Opaque: c.getOpaque(),
}
err := c.setExtrasCollection(req, context...)
err := c.setExtrasContext(req, context...)
if err != nil {
return nil, err
}
@ -638,9 +674,17 @@ func (c *Client) AuthPlain(user, pass string) (*gomemcached.MCResponse, error) {
// select bucket
func (c *Client) SelectBucket(bucket string) (*gomemcached.MCResponse, error) {
return c.Send(&gomemcached.MCRequest{
res, err := c.Send(&gomemcached.MCRequest{
Opcode: gomemcached.SELECT_BUCKET,
Key: []byte(bucket)})
if res != nil {
c.bucket = bucket
}
return res, err
}
func (c *Client) LastBucket() string {
return c.bucket
}
func (c *Client) store(opcode gomemcached.CommandCode, vb uint16,
@ -650,11 +694,11 @@ func (c *Client) store(opcode gomemcached.CommandCode, vb uint16,
VBucket: vb,
Key: []byte(key),
Cas: 0,
Opaque: 0,
Opaque: c.getOpaque(),
Extras: []byte{0, 0, 0, 0, 0, 0, 0, 0},
Body: body}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -669,11 +713,11 @@ func (c *Client) storeCas(opcode gomemcached.CommandCode, vb uint16,
VBucket: vb,
Key: []byte(key),
Cas: cas,
Opaque: 0,
Opaque: c.getOpaque(),
Extras: []byte{0, 0, 0, 0, 0, 0, 0, 0},
Body: body}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -691,7 +735,7 @@ func (c *Client) Incr(vb uint16, key string,
Key: []byte(key),
Extras: make([]byte, 8+8+4),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return 0, err
}
@ -717,7 +761,7 @@ func (c *Client) Decr(vb uint16, key string,
Key: []byte(key),
Extras: make([]byte, 8+8+4),
}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return 0, err
}
@ -759,10 +803,10 @@ func (c *Client) Append(vb uint16, key string, data []byte, context ...*ClientCo
VBucket: vb,
Key: []byte(key),
Cas: 0,
Opaque: 0,
Opaque: c.getOpaque(),
Body: data}
err := c.setCollection(req, context...)
err := c.setContext(req, context...)
if err != nil {
return nil, err
}
@ -839,7 +883,7 @@ func (c *Client) GetBulk(vb uint16, keys []string, rv map[string]*gomemcached.MC
Opcode: gomemcached.GET,
VBucket: vb,
}
err := c.setCollection(memcachedReqPkt, context...)
err := c.setContext(memcachedReqPkt, context...)
if err != nil {
return err
}
@ -1216,6 +1260,34 @@ func (c *Client) Stats(key string) ([]StatValue, error) {
return rv, nil
}
// StatsFunc requests server-side stats and passes each key/value pair to fn.
//
// Use "" as the stat key for toplevel stats.
func (c *Client) StatsFunc(key string, fn func(key, val []byte)) error {
req := &gomemcached.MCRequest{
Opcode: gomemcached.STAT,
Key: []byte(key),
Opaque: 918494,
}
err := c.Transmit(req)
if err != nil {
return err
}
for {
res, _, err := getResponse(c.conn, c.hdrBuf)
if err != nil {
return err
}
if len(res.Key) == 0 {
break
}
fn(res.Key, res.Body)
}
return nil
}
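StatsFunc streams each stat to the callback instead of building an intermediate map, which is what GatherStatsFunc and GetIntStats above build on. A hedged usage sketch (client is a placeholder connection, and "mem_used" is assumed to be present in the "memory" stat group):

var memUsed int64
err := client.StatsFunc("memory", func(key, val []byte) {
    if string(key) == "mem_used" {
        if v, perr := strconv.ParseInt(string(val), 10, 64); perr == nil {
            memUsed = v
        }
    }
})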
// StatsMap requests server-side stats similarly to Stats, but returns
// them as a map.
//


@ -12,8 +12,13 @@ const (
FrameDurability FrameObjType = iota
FrameDcpStreamId FrameObjType = iota
FrameOpenTracing FrameObjType = iota
FrameImpersonate FrameObjType = iota
)
// TODO: half-byte shifting is yet to be implemented; it is not very
// efficient, so we currently truncate user names.
const MAX_USER_LEN = 15
const FAST_USER_LEN = 15
type FrameInfo struct {
ObjId FrameObjType
ObjLen int
@ -44,11 +49,12 @@ func (f *FrameInfo) Validate() error {
return ErrorObjLenNotMatch
}
case FrameOpenTracing:
if f.ObjLen == 0 {
return fmt.Errorf("Invalid FrameOpenTracing - length must be > 0")
if f.ObjLen != 1 {
return fmt.Errorf("Invalid FrameImpersonate - length is %v\n", f.ObjLen)
} else if f.ObjLen != len(f.ObjData) {
return ErrorObjLenNotMatch
}
case FrameImpersonate:
default:
return fmt.Errorf("Unknown FrameInfo type")
}
@ -108,16 +114,27 @@ func incrementMarker(bitsToBeIncremented, byteIncrementCnt *int, framingElen, cu
return marker, nil
}
// Right now, halfByteRemaining will always be false, because ObjID and Len haven't gotten that large yet
func (f *FrameInfo) Bytes() (output []byte, halfByteRemaining bool) {
// ObjIdentifier - 4 bits + ObjLength - 4 bits
var idAndLen uint8
idAndLen |= uint8(f.ObjId) << 4
idAndLen |= uint8(f.ObjLen)
output = append(output, byte(idAndLen))
func (f *FrameInfo) Bytes() ([]byte, bool) {
return obj2Bytes(f.ObjId, f.ObjLen, f.ObjData)
}
// Rest is Data
output = append(output, f.ObjData...)
// TODO implement half byte shifting for impersonate user names
// halfByteRemaining will always be false, because ObjID and Len haven't gotten that large yet
// and user names are truncated
func obj2Bytes(id FrameObjType, length int, data []byte) (output []byte, halfByteRemaining bool) {
if length < 16 {
// ObjIdentifier - 4 bits + ObjLength - 4 bits
var idAndLen uint8
idAndLen |= uint8(id) << 4
idAndLen |= uint8(length)
output = append(output, byte(idAndLen))
// Rest is Data
output = append(output, data[:length]...)
} else {
// TODO: lengths >= 16 need half-byte shifting, which is not yet implemented
}
return
}
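A worked example of the nibble packing: FrameImpersonate is object id 3, so a five-byte user name packs into 0x35 followed by the name.

f := &FrameInfo{ObjId: FrameImpersonate, ObjLen: 5, ObjData: []byte("admin")}
b, half := f.Bytes()
fmt.Printf("% x %v\n", b, half) // 35 61 64 6d 69 6e false (lengths < 16 never leave a half byte)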


@ -33,6 +33,10 @@ type MCRequest struct {
CollId [binary.MaxVarintLen32]byte
// Length of collection id
CollIdLen int
// Impersonate user name - could go in FramingExtras, but for efficiency
Username [MAX_USER_LEN]byte
// Length of Impersonate user name
UserLen int
// Flexible Framing Extras
FramingExtras []FrameInfo
// Stored length of incoming framing extras
@ -41,7 +45,24 @@ type MCRequest struct {
// Size gives the number of bytes this request requires.
func (req *MCRequest) HdrSize() int {
return HDR_LEN + len(req.Extras) + req.CollIdLen + req.FramingElen + len(req.Key)
rv := HDR_LEN + len(req.Extras) + req.CollIdLen + req.FramingElen + len(req.Key)
if req.UserLen != 0 {
rv += req.UserLen + 1
// half byte shifting required
if req.UserLen > FAST_USER_LEN {
rv++
}
}
for _, e := range req.FramingExtras {
rv += e.ObjLen + 1
// half byte shifting required
if e.ObjLen > FAST_USER_LEN {
rv++
}
}
return rv
}
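Worked arithmetic for the new size computation, assuming HDR_LEN is the standard 24-byte binary header (a sketch from inside the gomemcached package):

req := &MCRequest{
    Opcode: SET,
    Key:    []byte("foo"),
    Extras: []byte{0, 0, 0, 0, 0, 0, 0, 0},
}
req.UserLen = copy(req.Username[:], "admin")
// 24 (header) + 8 (extras) + 3 (key) + 5+1 (impersonate frame) = 41
fmt.Println(req.HdrSize()) // 41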
func (req *MCRequest) Size() int {
@ -125,6 +146,85 @@ func (req *MCRequest) fillRegularHeaderBytes(data []byte) int {
return pos
}
func (req *MCRequest) fillFastFlexHeaderBytes(data []byte) int {
// Byte/ 0 | 1 | 2 | 3 |
// / | | | |
// |0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|
// +---------------+---------------+---------------+---------------+
// 0| Magic | Opcode | Framing extras| Key Length |
// +---------------+---------------+---------------+---------------+
// 4| Extras length | Data type | vbucket id |
// +---------------+---------------+---------------+---------------+
// 8| Total body length |
// +---------------+---------------+---------------+---------------+
// 12| Opaque |
// +---------------+---------------+---------------+---------------+
// 16| CAS |
// | |
// +---------------+---------------+---------------+---------------+
// Total 24 bytes
pos := 0
data[pos] = FLEX_MAGIC
pos++
data[pos] = byte(req.Opcode)
pos++
data[pos] = byte(req.UserLen + 1)
pos++
data[pos] = byte(len(req.Key) + req.CollIdLen)
pos++
// 4
data[pos] = byte(len(req.Extras))
pos++
// Data type
if req.DataType != 0 {
data[pos] = byte(req.DataType)
}
pos++
binary.BigEndian.PutUint16(data[pos:pos+2], req.VBucket)
pos += 2
// 8
binary.BigEndian.PutUint32(data[pos:pos+4],
uint32(len(req.Body)+req.CollIdLen+len(req.Key)+(req.UserLen+1)+len(req.Extras)+len(req.ExtMeta)))
pos += 4
// 12
binary.BigEndian.PutUint32(data[pos:pos+4], req.Opaque)
pos += 4
// 16
if req.Cas != 0 {
binary.BigEndian.PutUint64(data[pos:pos+8], req.Cas)
}
pos += 8
// 24 Flexible extras
if req.UserLen > 0 {
data[pos] = byte((uint8(FrameImpersonate) << 4) | uint8(req.UserLen))
pos++
copy(data[pos:pos+req.UserLen], req.Username[:req.UserLen])
pos += req.UserLen
}
if len(req.Extras) > 0 {
copy(data[pos:pos+len(req.Extras)], req.Extras)
pos += len(req.Extras)
}
if len(req.Key) > 0 {
if req.CollIdLen > 0 {
copy(data[pos:pos+req.CollIdLen], req.CollId[:])
pos += req.CollIdLen
}
copy(data[pos:pos+len(req.Key)], req.Key)
pos += len(req.Key)
}
return pos
}
// Returns pos and whether the output ends on a trailing half byte
func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
@ -147,16 +247,13 @@ func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
data[0] = FLEX_MAGIC
data[1] = byte(req.Opcode)
data[2] = byte(req.FramingElen)
data[3] = byte(req.Keylen + req.CollIdLen)
data[3] = byte(len(req.Key) + req.CollIdLen)
elen := len(req.Extras)
data[4] = byte(elen)
if req.DataType != 0 {
data[5] = byte(req.DataType)
}
binary.BigEndian.PutUint16(data[6:8], req.VBucket)
binary.BigEndian.PutUint32(data[8:12],
uint32(len(req.Body)+req.Keylen+req.CollIdLen+elen+len(req.ExtMeta)+req.FramingElen))
binary.BigEndian.PutUint32(data[12:16], req.Opaque)
if req.Cas != 0 {
binary.BigEndian.PutUint64(data[16:24], req.Cas)
@ -197,12 +294,46 @@ func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
}
}
// fast impersonate Flexible Extra
if req.UserLen > 0 {
if !mergeMode {
outputBytes, halfByteMode = obj2Bytes(FrameImpersonate, req.UserLen, req.Username[:req.UserLen])
if !halfByteMode {
framingExtras = append(framingExtras, outputBytes...)
frameBytes += len(outputBytes)
} else {
mergeMode = true
mergeModeSrc = outputBytes
}
} else {
outputBytes, halfByteMode = obj2Bytes(FrameImpersonate, req.UserLen, req.Username[:req.UserLen])
outputBytes := ShiftByteSliceRight4Bits(outputBytes)
if halfByteMode {
// Previous halfbyte merge with this halfbyte will result in a complete byte
mergeMode = false
outputBytes = Merge2HalfByteSlices(mergeModeSrc, outputBytes)
framingExtras = append(framingExtras, outputBytes...)
frameBytes += len(outputBytes)
} else {
// Merge half byte with a non-half byte will result in a combined half-byte that will
// become the source for the next iteration
mergeModeSrc = Merge2HalfByteSlices(mergeModeSrc, outputBytes)
}
}
}
if mergeMode {
// Commit the temporary merge area into framingExtras
framingExtras = append(framingExtras, mergeModeSrc...)
frameBytes += len(mergeModeSrc)
}
req.FramingElen = frameBytes
// these have to be set after we have worked out the size of the Flexible Extras
data[2] = byte(req.FramingElen)
binary.BigEndian.PutUint32(data[8:12],
uint32(len(req.Body)+len(req.Key)+req.CollIdLen+elen+len(req.ExtMeta)+req.FramingElen))
copy(data[pos:pos+frameBytes], framingExtras)
pos += frameBytes
@ -219,19 +350,21 @@ func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
}
// Add keys
if req.Keylen > 0 {
if len(req.Key) > 0 {
if mergeMode {
var key []byte
var keylen int
if req.CollIdLen == 0 {
key = req.Key
keylen = req.Keylen
keylen = len(req.Key)
} else {
key = append(key, req.CollId[:]...)
key = append(key, req.Key...)
keylen = req.Keylen + req.CollIdLen
keylen = len(req.Key) + req.CollIdLen
}
outputBytes = ShiftByteSliceRight4Bits(req.Key)
outputBytes = ShiftByteSliceRight4Bits(key)
data = Merge2HalfByteSlices(data, outputBytes)
pos += keylen
} else {
@ -239,8 +372,8 @@ func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
copy(data[pos:pos+req.CollIdLen], req.CollId[:])
pos += req.CollIdLen
}
copy(data[pos:pos+req.Keylen], req.Key)
pos += req.Keylen
copy(data[pos:pos+len(req.Key)], req.Key)
pos += len(req.Key)
}
}
@ -248,17 +381,19 @@ func (req *MCRequest) fillFlexHeaderBytes(data []byte) (int, bool) {
}
func (req *MCRequest) FillHeaderBytes(data []byte) (int, bool) {
if req.FramingElen == 0 {
return req.fillRegularHeaderBytes(data), false
} else {
if len(req.FramingExtras) > 0 || req.UserLen > FAST_USER_LEN {
return req.fillFlexHeaderBytes(data)
} else if req.UserLen > 0 {
return req.fillFastFlexHeaderBytes(data), false
} else {
return req.fillRegularHeaderBytes(data), false
}
}
// HeaderBytes will return the wire representation of the request header
// (with the extras and key).
func (req *MCRequest) HeaderBytes() []byte {
data := make([]byte, HDR_LEN+len(req.Extras)+req.CollIdLen+len(req.Key)+req.FramingElen)
data := make([]byte, req.HdrSize())
req.FillHeaderBytes(data)


@ -67,113 +67,64 @@ var _LEVEL_MAP = map[string]Level{
"none": NONE,
}
// cache logging enablement to improve runtime performance (reduces from multiple tests to a single test on each call)
var (
cachedDebug bool
cachedTrace bool
cachedRequest bool
cachedInfo bool
cachedWarn bool
cachedError bool
cachedSevere bool
cachedFatal bool
)
// maintain the cached logging state
func cacheLoggingChange() {
cachedDebug = !skipLogging(DEBUG)
cachedTrace = !skipLogging(TRACE)
cachedRequest = !skipLogging(REQUEST)
cachedInfo = !skipLogging(INFO)
cachedWarn = !skipLogging(WARN)
cachedError = !skipLogging(ERROR)
cachedSevere = !skipLogging(SEVERE)
cachedFatal = !skipLogging(FATAL)
}
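A hedged usage sketch of the cached flags from outside the package (n and expensiveDump are placeholders): disabled levels return before taking the logger mutex, and the anonymous-function variants never evaluate their closure.

logging.SetLevel(logging.INFO)   // refreshes the cached* flags via cacheLoggingChange
logging.Debugf("dropped: %d", n) // returns immediately: cachedDebug is false
logging.Debuga(func() string {   // closure is never called at INFO level
    return expensiveDump()
})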
func ParseLevel(name string) (level Level, ok bool) {
level, ok = _LEVEL_MAP[strings.ToLower(name)]
return
}
/*
Pair supports logging of key-value pairs. Keys beginning with _ are
reserved for the logger, e.g. _time, _level, _msg, and _rlevel. The
Pair APIs are designed to avoid heap allocation and garbage
collection.
*/
type Pairs []Pair
type Pair struct {
Name string
Value interface{}
}
/*
Map allows key-value pairs to be specified using map literals or data
structures. For example:
Errorm(msg, Map{...})
Map incurs heap allocation and garbage collection, so the Pair APIs
should be preferred.
*/
type Map map[string]interface{}
// Logger provides a common interface for logging libraries
type Logger interface {
/*
These APIs write all the given pairs in addition to standard logger keys.
*/
Logp(level Level, msg string, kv ...Pair)
// Higher performance
Loga(level Level, f func() string)
Debuga(f func() string)
Tracea(f func() string)
Requesta(rlevel Level, f func() string)
Infoa(f func() string)
Warna(f func() string)
Errora(f func() string)
Severea(f func() string)
Fatala(f func() string)
Debugp(msg string, kv ...Pair)
Tracep(msg string, kv ...Pair)
Requestp(rlevel Level, msg string, kv ...Pair)
Infop(msg string, kv ...Pair)
Warnp(msg string, kv ...Pair)
Errorp(msg string, kv ...Pair)
Severep(msg string, kv ...Pair)
Fatalp(msg string, kv ...Pair)
/*
These APIs write the fields in the given kv Map in addition to standard logger keys.
*/
Logm(level Level, msg string, kv Map)
Debugm(msg string, kv Map)
Tracem(msg string, kv Map)
Requestm(rlevel Level, msg string, kv Map)
Infom(msg string, kv Map)
Warnm(msg string, kv Map)
Errorm(msg string, kv Map)
Severem(msg string, kv Map)
Fatalm(msg string, kv Map)
/*
These APIs only write _msg, _time, _level, and other logger keys. If
the msg contains other fields, use the Pair or Map APIs instead.
*/
// Printf style
Logf(level Level, fmt string, args ...interface{})
Debugf(fmt string, args ...interface{})
Tracef(fmt string, args ...interface{})
Requestf(rlevel Level, fmt string, args ...interface{})
Infof(fmt string, args ...interface{})
Warnf(fmt string, args ...interface{})
Errorf(fmt string, args ...interface{})
Severef(fmt string, args ...interface{})
Fatalf(fmt string, args ...interface{})
/*
These APIs control the logging level
*/
SetLevel(Level) // Set the logging level
Level() Level // Get the current logging level
Level() Level // Get the current logging level
}
var logger Logger = nil
@ -205,169 +156,96 @@ func SetLogger(newLogger Logger) {
} else {
curLevel = newLogger.Level()
}
cacheLoggingChange()
}
func Logp(level Level, msg string, kv ...Pair) {
// We use deferred unlocking here throughout: the anonymous function variants
// require it, even though it would be more efficient to skip the defer for
// the printf-style variants.
// anonymous function variants
func Loga(level Level, f func() string) {
if skipLogging(level) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Logp(level, msg, kv...)
logger.Loga(level, f)
}
func Debugp(msg string, kv ...Pair) {
if skipLogging(DEBUG) {
func Debuga(f func() string) {
if !cachedDebug {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Debugp(msg, kv...)
logger.Debuga(f)
}
func Tracep(msg string, kv ...Pair) {
if skipLogging(TRACE) {
func Tracea(f func() string) {
if !cachedTrace {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Tracep(msg, kv...)
logger.Tracea(f)
}
func Requestp(rlevel Level, msg string, kv ...Pair) {
if skipLogging(REQUEST) {
func Requesta(rlevel Level, f func() string) {
if !cachedRequest {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Requestp(rlevel, msg, kv...)
logger.Requesta(rlevel, f)
}
func Infop(msg string, kv ...Pair) {
if skipLogging(INFO) {
func Infoa(f func() string) {
if !cachedInfo {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Infop(msg, kv...)
logger.Infoa(f)
}
func Warnp(msg string, kv ...Pair) {
if skipLogging(WARN) {
func Warna(f func() string) {
if !cachedWarn {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Warnp(msg, kv...)
logger.Warna(f)
}
func Errorp(msg string, kv ...Pair) {
if skipLogging(ERROR) {
func Errora(f func() string) {
if !cachedError {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Errorp(msg, kv...)
logger.Errora(f)
}
func Severep(msg string, kv ...Pair) {
if skipLogging(SEVERE) {
func Severea(f func() string) {
if !cachedSevere {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Severep(msg, kv...)
logger.Severea(f)
}
func Fatalp(msg string, kv ...Pair) {
if skipLogging(FATAL) {
func Fatala(f func() string) {
if !cachedFatal {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Fatalp(msg, kv...)
logger.Fatala(f)
}
func Logm(level Level, msg string, kv Map) {
if skipLogging(level) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Logm(level, msg, kv)
}
func Debugm(msg string, kv Map) {
if skipLogging(DEBUG) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Debugm(msg, kv)
}
func Tracem(msg string, kv Map) {
if skipLogging(TRACE) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Tracem(msg, kv)
}
func Requestm(rlevel Level, msg string, kv Map) {
if skipLogging(REQUEST) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Requestm(rlevel, msg, kv)
}
func Infom(msg string, kv Map) {
if skipLogging(INFO) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Infom(msg, kv)
}
func Warnm(msg string, kv Map) {
if skipLogging(WARN) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Warnm(msg, kv)
}
func Errorm(msg string, kv Map) {
if skipLogging(ERROR) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Errorm(msg, kv)
}
func Severem(msg string, kv Map) {
if skipLogging(SEVERE) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Severem(msg, kv)
}
func Fatalm(msg string, kv Map) {
if skipLogging(FATAL) {
return
}
loggerMutex.Lock()
defer loggerMutex.Unlock()
logger.Fatalm(msg, kv)
}
// printf-style variants
func Logf(level Level, fmt string, args ...interface{}) {
if skipLogging(level) {
@ -379,7 +257,7 @@ func Logf(level Level, fmt string, args ...interface{}) {
}
func Debugf(fmt string, args ...interface{}) {
if skipLogging(DEBUG) {
if !cachedDebug {
return
}
loggerMutex.Lock()
@ -388,7 +266,7 @@ func Debugf(fmt string, args ...interface{}) {
}
func Tracef(fmt string, args ...interface{}) {
if skipLogging(TRACE) {
if !cachedTrace {
return
}
loggerMutex.Lock()
@ -397,7 +275,7 @@ func Tracef(fmt string, args ...interface{}) {
}
func Requestf(rlevel Level, fmt string, args ...interface{}) {
if skipLogging(REQUEST) {
if !cachedRequest {
return
}
loggerMutex.Lock()
@ -406,7 +284,7 @@ func Requestf(rlevel Level, fmt string, args ...interface{}) {
}
func Infof(fmt string, args ...interface{}) {
if skipLogging(INFO) {
if !cachedInfo {
return
}
loggerMutex.Lock()
@ -415,7 +293,7 @@ func Infof(fmt string, args ...interface{}) {
}
func Warnf(fmt string, args ...interface{}) {
if skipLogging(WARN) {
if !cachedWarn {
return
}
loggerMutex.Lock()
@ -424,7 +302,7 @@ func Warnf(fmt string, args ...interface{}) {
}
func Errorf(fmt string, args ...interface{}) {
if skipLogging(ERROR) {
if !cachedError {
return
}
loggerMutex.Lock()
@ -433,7 +311,7 @@ func Errorf(fmt string, args ...interface{}) {
}
func Severef(fmt string, args ...interface{}) {
if skipLogging(SEVERE) {
if !cachedSevere {
return
}
loggerMutex.Lock()
@ -442,7 +320,7 @@ func Severef(fmt string, args ...interface{}) {
}
func Fatalf(fmt string, args ...interface{}) {
if skipLogging(FATAL) {
if !cachedFatal {
return
}
loggerMutex.Lock()
@ -455,6 +333,7 @@ func SetLevel(level Level) {
defer loggerMutex.Unlock()
logger.SetLevel(level)
curLevel = level
cacheLoggingChange()
}
func LogLevel() Level {


@ -10,11 +10,11 @@
package logging
import (
"bytes"
"encoding/json"
"fmt"
"io"
"log"
"strings"
"time"
)
@ -50,115 +50,61 @@ func NewLogger(out io.Writer, lvl Level, fmtLogging LogEntryFormatter, fmtArgs .
return logger
}
func (gl *goLogger) Logp(level Level, msg string, kv ...Pair) {
// anonymous function variants
func (gl *goLogger) Loga(level Level, f func() string) {
if gl.logger == nil {
return
}
if level <= gl.level {
e := newLogEntry(msg, level)
copyPairs(e, kv)
gl.log(e)
gl.log(level, NONE, f())
}
}
func (gl *goLogger) Debugp(msg string, kv ...Pair) {
gl.Logp(DEBUG, msg, kv...)
func (gl *goLogger) Debuga(f func() string) {
gl.Loga(DEBUG, f)
}
func (gl *goLogger) Tracep(msg string, kv ...Pair) {
gl.Logp(TRACE, msg, kv...)
func (gl *goLogger) Tracea(f func() string) {
gl.Loga(TRACE, f)
}
func (gl *goLogger) Requestp(rlevel Level, msg string, kv ...Pair) {
func (gl *goLogger) Requesta(rlevel Level, f func() string) {
if gl.logger == nil {
return
}
if REQUEST <= gl.level {
e := newLogEntry(msg, REQUEST)
e.Rlevel = rlevel
copyPairs(e, kv)
gl.log(e)
gl.log(REQUEST, rlevel, f())
}
}
func (gl *goLogger) Infop(msg string, kv ...Pair) {
gl.Logp(INFO, msg, kv...)
func (gl *goLogger) Infoa(f func() string) {
gl.Loga(INFO, f)
}
func (gl *goLogger) Warnp(msg string, kv ...Pair) {
gl.Logp(WARN, msg, kv...)
func (gl *goLogger) Warna(f func() string) {
gl.Loga(WARN, f)
}
func (gl *goLogger) Errorp(msg string, kv ...Pair) {
gl.Logp(ERROR, msg, kv...)
func (gl *goLogger) Errora(f func() string) {
gl.Loga(ERROR, f)
}
func (gl *goLogger) Severep(msg string, kv ...Pair) {
gl.Logp(SEVERE, msg, kv...)
func (gl *goLogger) Severea(f func() string) {
gl.Loga(SEVERE, f)
}
func (gl *goLogger) Fatalp(msg string, kv ...Pair) {
gl.Logp(FATAL, msg, kv...)
func (gl *goLogger) Fatala(f func() string) {
gl.Loga(FATAL, f)
}
func (gl *goLogger) Logm(level Level, msg string, kv Map) {
if gl.logger == nil {
return
}
if level <= gl.level {
e := newLogEntry(msg, level)
e.Data = kv
gl.log(e)
}
}
func (gl *goLogger) Debugm(msg string, kv Map) {
gl.Logm(DEBUG, msg, kv)
}
func (gl *goLogger) Tracem(msg string, kv Map) {
gl.Logm(TRACE, msg, kv)
}
func (gl *goLogger) Requestm(rlevel Level, msg string, kv Map) {
if gl.logger == nil {
return
}
if REQUEST <= gl.level {
e := newLogEntry(msg, REQUEST)
e.Rlevel = rlevel
e.Data = kv
gl.log(e)
}
}
func (gl *goLogger) Infom(msg string, kv Map) {
gl.Logm(INFO, msg, kv)
}
func (gl *goLogger) Warnm(msg string, kv Map) {
gl.Logm(WARN, msg, kv)
}
func (gl *goLogger) Errorm(msg string, kv Map) {
gl.Logm(ERROR, msg, kv)
}
func (gl *goLogger) Severem(msg string, kv Map) {
gl.Logm(SEVERE, msg, kv)
}
func (gl *goLogger) Fatalm(msg string, kv Map) {
gl.Logm(FATAL, msg, kv)
}
// printf-style variants
func (gl *goLogger) Logf(level Level, format string, args ...interface{}) {
if gl.logger == nil {
return
}
if level <= gl.level {
e := newLogEntry(fmt.Sprintf(format, args...), level)
gl.log(e)
gl.log(level, NONE, fmt.Sprintf(format, args...))
}
}
@ -175,9 +121,7 @@ func (gl *goLogger) Requestf(rlevel Level, format string, args ...interface{}) {
return
}
if REQUEST <= gl.level {
e := newLogEntry(fmt.Sprintf(format, args...), REQUEST)
e.Rlevel = rlevel
gl.log(e)
gl.log(REQUEST, rlevel, fmt.Sprintf(format, args...))
}
}
@ -209,37 +153,13 @@ func (gl *goLogger) SetLevel(level Level) {
gl.level = level
}
func (gl *goLogger) log(newEntry *logEntry) {
s := gl.entryFormatter.format(newEntry)
gl.logger.Print(s)
}
type logEntry struct {
Time string
Level Level
Rlevel Level
Message string
Data Map
}
func newLogEntry(msg string, level Level) *logEntry {
return &logEntry{
Time: time.Now().Format("2006-01-02T15:04:05.000-07:00"), // time.RFC3339 with milliseconds
Level: level,
Rlevel: NONE,
Message: msg,
}
}
func copyPairs(newEntry *logEntry, pairs []Pair) {
newEntry.Data = make(Map, len(pairs))
for _, p := range pairs {
newEntry.Data[p.Name] = p.Value
}
func (gl *goLogger) log(level Level, rlevel Level, msg string) {
tm := time.Now().Format("2006-01-02T15:04:05.000-07:00") // time.RFC3339 with milliseconds
gl.logger.Print(gl.entryFormatter.format(tm, level, rlevel, msg))
}
type formatter interface {
format(*logEntry) string
format(string, Level, Level, string) string
}
type textFormatter struct {
@ -247,24 +167,20 @@ type textFormatter struct {
// ex. 2016-02-10T09:15:25.498-08:00 [INFO] This is a message from test in text format
func (*textFormatter) format(newEntry *logEntry) string {
b := &bytes.Buffer{}
appendValue(b, newEntry.Time)
if newEntry.Rlevel != NONE {
fmt.Fprintf(b, "[%s,%s] ", newEntry.Level.String(), newEntry.Rlevel.String())
func (*textFormatter) format(tm string, level Level, rlevel Level, msg string) string {
b := &strings.Builder{}
appendValue(b, tm)
if rlevel != NONE {
fmt.Fprintf(b, "[%s,%s] ", level.String(), rlevel.String())
} else {
fmt.Fprintf(b, "[%s] ", newEntry.Level.String())
}
appendValue(b, newEntry.Message)
for key, value := range newEntry.Data {
appendKeyValue(b, key, value)
fmt.Fprintf(b, "[%s] ", level.String())
}
appendValue(b, msg)
b.WriteByte('\n')
s := bytes.NewBuffer(b.Bytes())
return s.String()
return b.String()
}
func appendValue(b *bytes.Buffer, value interface{}) {
func appendValue(b *strings.Builder, value interface{}) {
if _, ok := value.(string); ok {
fmt.Fprintf(b, "%s ", value)
} else {
@ -277,23 +193,19 @@ type keyvalueFormatter struct {
// ex. _time=2016-02-10T09:15:25.498-08:00 _level=INFO _msg=This is a message from test in key-value format
func (*keyvalueFormatter) format(newEntry *logEntry) string {
b := &bytes.Buffer{}
appendKeyValue(b, _TIME, newEntry.Time)
appendKeyValue(b, _LEVEL, newEntry.Level.String())
if newEntry.Rlevel != NONE {
appendKeyValue(b, _RLEVEL, newEntry.Rlevel.String())
}
appendKeyValue(b, _MSG, newEntry.Message)
for key, value := range newEntry.Data {
appendKeyValue(b, key, value)
func (*keyvalueFormatter) format(tm string, level Level, rlevel Level, msg string) string {
b := &strings.Builder{}
appendKeyValue(b, _TIME, tm)
appendKeyValue(b, _LEVEL, level.String())
if rlevel != NONE {
appendKeyValue(b, _RLEVEL, rlevel.String())
}
appendKeyValue(b, _MSG, msg)
b.WriteByte('\n')
s := bytes.NewBuffer(b.Bytes())
return s.String()
return b.String()
}
func appendKeyValue(b *bytes.Buffer, key, value interface{}) {
func appendKeyValue(b *strings.Builder, key, value interface{}) {
if _, ok := value.(string); ok {
fmt.Fprintf(b, "%v=%s ", key, value)
} else {
@ -306,19 +218,19 @@ type jsonFormatter struct {
// ex. {"_level":"INFO","_msg":"This is a message from test in json format","_time":"2016-02-10T09:12:59.518-08:00"}
func (*jsonFormatter) format(newEntry *logEntry) string {
if newEntry.Data == nil {
newEntry.Data = make(Map, 5)
func (*jsonFormatter) format(tm string, level Level, rlevel Level, msg string) string {
data := make(map[string]interface{}, 4)
data[_TIME] = tm
data[_LEVEL] = level.String()
if rlevel != NONE {
data[_RLEVEL] = rlevel.String()
}
newEntry.Data[_TIME] = newEntry.Time
newEntry.Data[_LEVEL] = newEntry.Level.String()
if newEntry.Rlevel != NONE {
newEntry.Data[_RLEVEL] = newEntry.Rlevel.String()
}
newEntry.Data[_MSG] = newEntry.Message
serialized, _ := json.Marshal(newEntry.Data)
s := bytes.NewBuffer(append(serialized, '\n'))
return s.String()
data[_MSG] = msg
serialized, _ := json.Marshal(data)
var b strings.Builder
b.Write(serialized)
b.WriteByte('\n')
return b.String()
}
type ComponentCallback func() string
@ -345,21 +257,17 @@ func (level Level) UniformString() string {
return _LEVEL_UNIFORM[level]
}
func (uf *uniformFormatter) format(newEntry *logEntry) string {
b := &bytes.Buffer{}
appendValue(b, newEntry.Time)
func (uf *uniformFormatter) format(tm string, level Level, rlevel Level, msg string) string {
b := &strings.Builder{}
appendValue(b, tm)
component := uf.callback()
if newEntry.Rlevel != NONE {
if rlevel != NONE {
// not really any accommodation for a composite level in the uniform standard; just output as abbr,abbr
fmt.Fprintf(b, "%s,%s %s ", newEntry.Level.UniformString(), newEntry.Rlevel.UniformString(), component)
fmt.Fprintf(b, "%s,%s %s ", level.UniformString(), rlevel.UniformString(), component)
} else {
fmt.Fprintf(b, "%s %s ", newEntry.Level.UniformString(), component)
}
appendValue(b, newEntry.Message)
for key, value := range newEntry.Data {
appendKeyValue(b, key, value)
fmt.Fprintf(b, "%s %s ", level.UniformString(), component)
}
appendValue(b, msg)
b.WriteByte('\n')
s := bytes.NewBuffer(b.Bytes())
return s.String()
return b.String()
}


@ -1,22 +0,0 @@
sudo: false
language: go
go:
- "1.9.x"
- "1.10.x"
- "1.11.x"
script:
- go get github.com/mattn/goveralls
- go get -u github.com/kisielk/errcheck
- go test -v $(go list ./... | grep -v vendor/)
- go test -race
- go vet
- errcheck
- go test -coverprofile=profile.out -covermode=count
- 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then goveralls -service=travis-ci -coverprofile=profile.out -repotoken $COVERALLS; fi'
notifications:
email:
- marty.schoch@gmail.com


@ -1,16 +0,0 @@
# Contributing to Vellum
We look forward to your contributions, but ask that you first review these guidelines.
### Sign the CLA
As Vellum is a Couchbase project we require contributors accept the [Couchbase Contributor License Agreement](http://review.couchbase.org/static/individual_agreement.html). To sign this agreement log into the Couchbase [code review tool](http://review.couchbase.org/). The Vellum project does not use this code review tool but it is still used to track acceptance of the contributor license agreements.
### Submitting a Pull Request
All types of contributions are welcome, but please keep the following in mind:
- If you're planning a large change, you should really discuss it in a github issue first. This helps avoid duplicate effort and spending time on something that may not be merged.
- Existing tests should continue to pass, new tests for the contribution are nice to have.
- All code should have gone through `go fmt`
- All code should pass `go vet`


@ -1,202 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
View file
@@ -1,183 +0,0 @@
# ![vellum](docs/logo.png) vellum
[![Tests](https://github.com/couchbase/vellum/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/couchbase/vellum/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster)
[![Coverage Status](https://coveralls.io/repos/github/couchbase/vellum/badge.svg?branch=master)](https://coveralls.io/github/couchbase/vellum?branch=master)
[![GoDoc](https://godoc.org/github.com/couchbase/vellum?status.svg)](https://godoc.org/github.com/couchbase/vellum)
[![Go Report Card](https://goreportcard.com/badge/github.com/couchbase/vellum)](https://goreportcard.com/report/github.com/couchbase/vellum)
[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
A Go library implementing an FST (finite state transducer) capable of:
- mapping between keys ([]byte) and a value (uint64)
- enumerating keys in lexicographic order
Some additional goals of this implementation:
- bounded memory use while building the FST
- streaming out FST data while building
- mmap the FST at runtime to support very large FSTs (optional)
## Usage
### Building an FST
To build an FST, create a new builder using the `New()` method. This method takes an `io.Writer` as an argument. As the FST is being built, data will be streamed to the writer as soon as possible. With this builder you **MUST** insert keys in lexicographic order. Inserting keys out of order will result in an error. After inserting the last key into the builder, you **MUST** call `Close()` on the builder. This will flush all remaining data to the underlying writer.
In memory:
```go
var buf bytes.Buffer
builder, err := vellum.New(&buf, nil)
if err != nil {
log.Fatal(err)
}
```
To disk:
```go
f, err := os.Create("/tmp/vellum.fst")
if err != nil {
log.Fatal(err)
}
builder, err := vellum.New(f, nil)
if err != nil {
log.Fatal(err)
}
```
**MUST** insert keys in lexicographic order:
```go
err = builder.Insert([]byte("cat"), 1)
if err != nil {
log.Fatal(err)
}
err = builder.Insert([]byte("dog"), 2)
if err != nil {
log.Fatal(err)
}
err = builder.Insert([]byte("fish"), 3)
if err != nil {
log.Fatal(err)
}
err = builder.Close()
if err != nil {
log.Fatal(err)
}
```
### Using an FST
After closing the builder, the data can be used to instantiate an FST. If the data was written to disk, you can use the `Open()` method to mmap the file. If the data is already in memory, or you wish to load/mmap the data yourself, you can instantiate the FST with the `Load()` method.
Load in memory:
```go
fst, err := vellum.Load(buf.Bytes())
if err != nil {
log.Fatal(err)
}
```
Open from disk:
```go
fst, err := vellum.Open("/tmp/vellum.fst")
if err != nil {
log.Fatal(err)
}
```
Get key/value:
```go
val, exists, err := fst.Get([]byte("dog"))
if err != nil {
log.Fatal(err)
}
if exists {
fmt.Printf("contains dog with val: %d\n", val)
} else {
fmt.Printf("does not contain dog")
}
```
Iterate key/values:
```go
itr, err := fst.Iterator(startKeyInclusive, endKeyExclusive)
for err == nil {
key, val := itr.Current()
fmt.Printf("contains key: %s val: %d", key, val)
err = itr.Next()
}
if err != nil && err != vellum.ErrIteratorDone {
log.Fatal(err)
}
```
### How does the FST get built?
A full example of the implementation is beyond the scope of this README, but let's consider a small example where we want to insert 3 key/value pairs.
First we insert "are" with the value 4.
![step1](docs/demo1.png)
Next, we insert "ate" with the value 2.
![step2](docs/demo2.png)
Notice how the values associated with the transitions were adjusted so that by summing them while traversing we still get the expected value.
At this point, we see that state 5 looks like state 3, and state 4 looks like state 2. But, we cannot yet combine them because future inserts could change this.
Now, we insert "see" with value 3. Once it has been added, we know that states 5 and 4 can no longer change. Since they are identical to 3 and 2, we replace them.
![step3](docs/demo3.png)
Again, we see that states 7 and 8 appear to be identical to 2 and 3.
Having inserted our last key, we call `Close()` on the builder.
![step4](docs/demo4.png)
Now, states 7 and 8 can safely be replaced with 2 and 3.
For additional information, see the references at the bottom of this document.
### What does the serialized format look like?
We've broken out a separate document on the [vellum disk format v1](docs/format.md).
### What if I want to use this on a system that doesn't have mmap?
The mmap library itself is guarded with system/architecture build tags, but we've also added an additional build tag in vellum. If you'd like to Open() a file based representation of an FST, but not use mmap, you can build the library with the `nommap` build tag. NOTE: if you do this, the entire FST will be read into memory.
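For example, with a standard Go toolchain the tag is passed at build time:
    $ go build -tags nommap ./...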
### Can I use this with Unicode strings?
Yes, however this implementation is only aware of the byte representation you choose. In order to find matches, you must work with some canonical byte representation of the string. In the future, some encoding-aware traversals may be possible on top of the lower-level byte transitions.
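For example (a minimal sketch; the key shown is hypothetical), a Go string literal is already UTF-8, so converting it to `[]byte` yields one such canonical byte representation:
```go
// "héllo" is inserted as its UTF-8 bytes; keys are ordered by plain
// byte-wise comparison, not locale-aware collation.
err = builder.Insert([]byte("héllo"), 42)
if err != nil {
log.Fatal(err)
}
```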
### How did this library come to be?
In my work on the [Bleve](https://github.com/blevesearch/bleve) project I became aware of the power of the FST for many search-related tasks. The obvious starting point for such a thing in Go was the [mafsa](https://github.com/smartystreets/mafsa) project. While working with mafsa I encountered some issues. First, it did not stream data to disk while building. Second, it chose to use a rune as the fundamental unit of transition in the FST, but I felt using a byte would be more powerful in the end. My hope is that higher-level encoding-aware traversals will be possible when necessary. Finally, as I reported bugs and submitted PRs I learned that the mafsa project was mainly a research project and no longer being maintained. I wanted to build something that could be used in production. As the project advanced more and more techniques from the [BurntSushi/fst](https://github.com/BurntSushi/fst) were adapted to our implementation.
### Are there tools to work with vellum files?
Under the cmd/vellum subdirectory, there's a command-line tool which
features subcommands that can allow you to create, inspect and query
vellum files.
### How can I generate a state transition diagram from a vellum file?
The vellum command-line tool has a "dot" subcommand that can emit
graphviz dot output data from an input vellum file. The dot file can
in turn be converted into an image using graphviz tools. Example...
$ vellum dot myFile.vellum > output.dot
$ dot -Tpng output.dot -o output.png
## Related Work
Much credit goes to two existing projects:
- [mafsa](https://github.com/smartystreets/mafsa)
- [BurntSushi/fst](https://github.com/BurntSushi/fst)
Most of the original implementation here started with my digging into the internals of mafsa. As the implementation progressed, I continued to borrow ideas/approaches from the BurntSushi/fst library as well.
For a great introduction to this topic, please read the blog post [Index 1,600,000,000 Keys with Automata and Rust](http://blog.burntsushi.net/transducers/)
View file
@@ -1,85 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
// Automaton represents the general contract of a byte-based finite automaton
type Automaton interface {
// Start returns the start state
Start() int
// IsMatch returns true if and only if the state is a match
IsMatch(int) bool
// CanMatch returns true if and only if it is possible to reach a match
// in zero or more steps
CanMatch(int) bool
// WillAlwaysMatch returns true if and only if the current state matches
// and will always match no matter what steps are taken
WillAlwaysMatch(int) bool
// Accept returns the next state given the input to the specified state
Accept(int, byte) int
}
// AutomatonContains implements a generic Contains() method which works
// on any implementation of Automaton
func AutomatonContains(a Automaton, k []byte) bool {
i := 0
curr := a.Start()
for a.CanMatch(curr) && i < len(k) {
curr = a.Accept(curr, k[i])
if curr == noneAddr {
break
}
i++
}
if i != len(k) {
return false
}
return a.IsMatch(curr)
}
// AlwaysMatch is an Automaton implementation which always matches
type AlwaysMatch struct{}
// Start returns the AlwaysMatch start state
func (m *AlwaysMatch) Start() int {
return 0
}
// IsMatch always returns true
func (m *AlwaysMatch) IsMatch(int) bool {
return true
}
// CanMatch always returns true
func (m *AlwaysMatch) CanMatch(int) bool {
return true
}
// WillAlwaysMatch always returns true
func (m *AlwaysMatch) WillAlwaysMatch(int) bool {
return true
}
// Accept returns the next AlwaysMatch state
func (m *AlwaysMatch) Accept(int, byte) int {
return 0
}
// creating an alwaysMatchAutomaton to avoid unnecessary repeated allocations.
var alwaysMatchAutomaton = &AlwaysMatch{}
View file
@@ -1,452 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"bytes"
"io"
)
var defaultBuilderOpts = &BuilderOpts{
Encoder: 1,
RegistryTableSize: 10000,
RegistryMRUSize: 2,
}
// A Builder is used to build a new FST. Whenever possible, data is
// streamed out to the underlying Writer as soon as it is available.
type Builder struct {
unfinished *unfinishedNodes
registry *registry
last []byte
len int
lastAddr int
encoder encoder
opts *BuilderOpts
builderNodePool *builderNodePool
}
const noneAddr = 1
const emptyAddr = 0
// newBuilder returns a new Builder which will stream out the
// underlying representation to the provided Writer as the set is built.
func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
if opts == nil {
opts = defaultBuilderOpts
}
builderNodePool := &builderNodePool{}
rv := &Builder{
unfinished: newUnfinishedNodes(builderNodePool),
registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
builderNodePool: builderNodePool,
opts: opts,
lastAddr: noneAddr,
}
var err error
rv.encoder, err = loadEncoder(opts.Encoder, w)
if err != nil {
return nil, err
}
err = rv.encoder.start()
if err != nil {
return nil, err
}
return rv, nil
}
func (b *Builder) Reset(w io.Writer) error {
b.unfinished.Reset()
b.registry.Reset()
b.lastAddr = noneAddr
b.encoder.reset(w)
b.last = nil
b.len = 0
err := b.encoder.start()
if err != nil {
return err
}
return nil
}
// Insert adds the provided key/value pair to the set being built.
// NOTE: values must be inserted in lexicographical order.
func (b *Builder) Insert(key []byte, val uint64) error {
// ensure items are added in lexicographic order
if bytes.Compare(key, b.last) < 0 {
return ErrOutOfOrder
}
if len(key) == 0 {
b.len = 1
b.unfinished.setRootOutput(val)
return nil
}
prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val)
b.len++
err := b.compileFrom(prefixLen)
if err != nil {
return err
}
b.copyLastKey(key)
b.unfinished.addSuffix(key[prefixLen:], out)
return nil
}
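// copyLastKey records key as the most recently inserted key, reusing the
// b.last backing array when possible.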
func (b *Builder) copyLastKey(key []byte) {
if b.last == nil {
b.last = make([]byte, 0, 64)
} else {
b.last = b.last[:0]
}
b.last = append(b.last, key...)
}
// Close MUST be called after inserting all values.
func (b *Builder) Close() error {
err := b.compileFrom(0)
if err != nil {
return err
}
root := b.unfinished.popRoot()
rootAddr, err := b.compile(root)
if err != nil {
return err
}
return b.encoder.finish(b.len, rootAddr)
}
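// compileFrom freezes and compiles every unfinished node above iState on the
// stack, threading each compiled address into its parent's last transition.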
func (b *Builder) compileFrom(iState int) error {
addr := noneAddr
for iState+1 < len(b.unfinished.stack) {
var node *builderNode
if addr == noneAddr {
node = b.unfinished.popEmpty()
} else {
node = b.unfinished.popFreeze(addr)
}
var err error
addr, err = b.compile(node)
if err != nil {
return err
}
}
b.unfinished.topLastFreeze(addr)
return nil
}
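// compile encodes a single frozen node and returns its address; nodes
// identical to a previously compiled one are deduplicated via the registry.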
func (b *Builder) compile(node *builderNode) (int, error) {
if node.final && len(node.trans) == 0 &&
node.finalOutput == 0 {
return 0, nil
}
found, addr, entry := b.registry.entry(node)
if found {
return addr, nil
}
addr, err := b.encoder.encodeState(node, b.lastAddr)
if err != nil {
return 0, err
}
b.lastAddr = addr
entry.addr = addr
return addr, nil
}
type unfinishedNodes struct {
stack []*builderNodeUnfinished
// cache allocates a reasonable number of builderNodeUnfinished
// objects up front and tries to keep reusing them
// because the main data structure is a stack, we assume the
// same access pattern, and don't track items separately
// this means calls get() and pushXYZ() must be paired,
// as well as calls put() and popXYZ()
cache []builderNodeUnfinished
builderNodePool *builderNodePool
}
func (u *unfinishedNodes) Reset() {
u.stack = u.stack[:0]
for i := 0; i < len(u.cache); i++ {
u.cache[i] = builderNodeUnfinished{}
}
u.pushEmpty(false)
}
func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
rv := &unfinishedNodes{
stack: make([]*builderNodeUnfinished, 0, 64),
cache: make([]builderNodeUnfinished, 64),
builderNodePool: p,
}
rv.pushEmpty(false)
return rv
}
// get new builderNodeUnfinished, reusing cache if possible
func (u *unfinishedNodes) get() *builderNodeUnfinished {
if len(u.stack) < len(u.cache) {
return &u.cache[len(u.stack)]
}
// cache is full, allocate a new one
return &builderNodeUnfinished{}
}
// return builderNodeUnfinished, clearing it for reuse
func (u *unfinishedNodes) put() {
if len(u.stack) >= len(u.cache) {
// do nothing, not part of cache
return
}
u.cache[len(u.stack)] = builderNodeUnfinished{}
}
func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte,
out uint64) (int, uint64) {
var i int
for i < len(key) {
if i >= len(u.stack) {
break
}
var addPrefix uint64
if !u.stack[i].hasLastT {
break
}
if u.stack[i].lastIn == key[i] {
commonPre := outputPrefix(u.stack[i].lastOut, out)
addPrefix = outputSub(u.stack[i].lastOut, commonPre)
out = outputSub(out, commonPre)
u.stack[i].lastOut = commonPre
i++
} else {
break
}
if addPrefix != 0 {
u.stack[i].addOutputPrefix(addPrefix)
}
}
return i, out
}
func (u *unfinishedNodes) pushEmpty(final bool) {
next := u.get()
next.node = u.builderNodePool.Get()
next.node.final = final
u.stack = append(u.stack, next)
}
func (u *unfinishedNodes) popRoot() *builderNode {
l := len(u.stack)
var unfinished *builderNodeUnfinished
u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
rv := unfinished.node
u.put()
return rv
}
func (u *unfinishedNodes) popFreeze(addr int) *builderNode {
l := len(u.stack)
var unfinished *builderNodeUnfinished
u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
unfinished.lastCompiled(addr)
rv := unfinished.node
u.put()
return rv
}
func (u *unfinishedNodes) popEmpty() *builderNode {
l := len(u.stack)
var unfinished *builderNodeUnfinished
u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
rv := unfinished.node
u.put()
return rv
}
func (u *unfinishedNodes) setRootOutput(out uint64) {
u.stack[0].node.final = true
u.stack[0].node.finalOutput = out
}
func (u *unfinishedNodes) topLastFreeze(addr int) {
last := len(u.stack) - 1
u.stack[last].lastCompiled(addr)
}
func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) {
if len(bs) == 0 {
return
}
last := len(u.stack) - 1
u.stack[last].hasLastT = true
u.stack[last].lastIn = bs[0]
u.stack[last].lastOut = out
for _, b := range bs[1:] {
next := u.get()
next.node = u.builderNodePool.Get()
next.hasLastT = true
next.lastIn = b
next.lastOut = 0
u.stack = append(u.stack, next)
}
u.pushEmpty(true)
}
type builderNodeUnfinished struct {
node *builderNode
lastOut uint64
lastIn byte
hasLastT bool
}
func (b *builderNodeUnfinished) lastCompiled(addr int) {
if b.hasLastT {
transIn := b.lastIn
transOut := b.lastOut
b.hasLastT = false
b.lastOut = 0
b.node.trans = append(b.node.trans, transition{
in: transIn,
out: transOut,
addr: addr,
})
}
}
func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) {
if b.node.final {
b.node.finalOutput = outputCat(prefix, b.node.finalOutput)
}
for i := range b.node.trans {
b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out)
}
if b.hasLastT {
b.lastOut = outputCat(prefix, b.lastOut)
}
}
type builderNode struct {
finalOutput uint64
trans []transition
final bool
// intrusive linked list
next *builderNode
}
// reset resets the receiver builderNode to a re-usable state.
func (n *builderNode) reset() {
n.final = false
n.finalOutput = 0
for i := range n.trans {
n.trans[i] = emptyTransition
}
n.trans = n.trans[:0]
n.next = nil
}
func (n *builderNode) equiv(o *builderNode) bool {
if n.final != o.final {
return false
}
if n.finalOutput != o.finalOutput {
return false
}
if len(n.trans) != len(o.trans) {
return false
}
for i, ntrans := range n.trans {
otrans := o.trans[i]
if ntrans.in != otrans.in {
return false
}
if ntrans.addr != otrans.addr {
return false
}
if ntrans.out != otrans.out {
return false
}
}
return true
}
var emptyTransition = transition{}
type transition struct {
out uint64
addr int
in byte
}
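// Outputs are summed along a path through the FST: the common "prefix" of two
// outputs is therefore their minimum, outputSub strips a prefix by
// subtraction, and outputCat concatenates by addition.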
func outputPrefix(l, r uint64) uint64 {
if l < r {
return l
}
return r
}
func outputSub(l, r uint64) uint64 {
return l - r
}
func outputCat(l, r uint64) uint64 {
return l + r
}
// builderNodePool pools builderNodes using a singly linked list.
//
// NB: builderNode lifecycle is described by the following interactions -
// +------------------------+ +----------------------+
// | Unfinished Nodes | Transfer once | Registry |
// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
// +------------------------+ marked frozen +----------------------+
// ^ |
// | |
// | Put()
// | Get() on +-------------------+ when
// +-new char--------| builderNode Pool |<-----------evicted
// +-------------------+
type builderNodePool struct {
head *builderNode
}
func (p *builderNodePool) Get() *builderNode {
if p.head == nil {
return &builderNode{}
}
head := p.head
p.head = p.head.next
return head
}
func (p *builderNodePool) Put(v *builderNode) {
if v == nil {
return
}
v.reset()
v.next = p.head
p.head = v
}
View file
@@ -1,547 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
const maxCommon = 1<<6 - 1
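// encodeCommon returns the 1-based rank of the input byte in the common-input
// table when that rank fits in the 6 bits of a single-transition header byte;
// it returns 0 for uncommon inputs, which are instead written as a separate
// byte. decodeCommon is the inverse mapping.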
func encodeCommon(in byte) byte {
val := byte((int(commonInputs[in]) + 1) % 256)
if val > maxCommon {
return 0
}
return val
}
func decodeCommon(in byte) byte {
return commonInputsInv[in-1]
}
var commonInputs = []byte{
84, // '\x00'
85, // '\x01'
86, // '\x02'
87, // '\x03'
88, // '\x04'
89, // '\x05'
90, // '\x06'
91, // '\x07'
92, // '\x08'
93, // '\t'
94, // '\n'
95, // '\x0b'
96, // '\x0c'
97, // '\r'
98, // '\x0e'
99, // '\x0f'
100, // '\x10'
101, // '\x11'
102, // '\x12'
103, // '\x13'
104, // '\x14'
105, // '\x15'
106, // '\x16'
107, // '\x17'
108, // '\x18'
109, // '\x19'
110, // '\x1a'
111, // '\x1b'
112, // '\x1c'
113, // '\x1d'
114, // '\x1e'
115, // '\x1f'
116, // ' '
80, // '!'
117, // '"'
118, // '#'
79, // '$'
39, // '%'
30, // '&'
81, // "'"
75, // '('
74, // ')'
82, // '*'
57, // '+'
66, // ','
16, // '-'
12, // '.'
2, // '/'
19, // '0'
20, // '1'
21, // '2'
27, // '3'
32, // '4'
29, // '5'
35, // '6'
36, // '7'
37, // '8'
34, // '9'
24, // ':'
73, // ';'
119, // '<'
23, // '='
120, // '>'
40, // '?'
83, // '@'
44, // 'A'
48, // 'B'
42, // 'C'
43, // 'D'
49, // 'E'
46, // 'F'
62, // 'G'
61, // 'H'
47, // 'I'
69, // 'J'
68, // 'K'
58, // 'L'
56, // 'M'
55, // 'N'
59, // 'O'
51, // 'P'
72, // 'Q'
54, // 'R'
45, // 'S'
52, // 'T'
64, // 'U'
65, // 'V'
63, // 'W'
71, // 'X'
67, // 'Y'
70, // 'Z'
77, // '['
121, // '\\'
78, // ']'
122, // '^'
31, // '_'
123, // '`'
4, // 'a'
25, // 'b'
9, // 'c'
17, // 'd'
1, // 'e'
26, // 'f'
22, // 'g'
13, // 'h'
7, // 'i'
50, // 'j'
38, // 'k'
14, // 'l'
15, // 'm'
10, // 'n'
3, // 'o'
8, // 'p'
60, // 'q'
6, // 'r'
5, // 's'
0, // 't'
18, // 'u'
33, // 'v'
11, // 'w'
41, // 'x'
28, // 'y'
53, // 'z'
124, // '{'
125, // '|'
126, // '}'
76, // '~'
127, // '\x7f'
128, // '\x80'
129, // '\x81'
130, // '\x82'
131, // '\x83'
132, // '\x84'
133, // '\x85'
134, // '\x86'
135, // '\x87'
136, // '\x88'
137, // '\x89'
138, // '\x8a'
139, // '\x8b'
140, // '\x8c'
141, // '\x8d'
142, // '\x8e'
143, // '\x8f'
144, // '\x90'
145, // '\x91'
146, // '\x92'
147, // '\x93'
148, // '\x94'
149, // '\x95'
150, // '\x96'
151, // '\x97'
152, // '\x98'
153, // '\x99'
154, // '\x9a'
155, // '\x9b'
156, // '\x9c'
157, // '\x9d'
158, // '\x9e'
159, // '\x9f'
160, // '\xa0'
161, // '¡'
162, // '¢'
163, // '£'
164, // '¤'
165, // '¥'
166, // '¦'
167, // '§'
168, // '¨'
169, // '©'
170, // 'ª'
171, // '«'
172, // '¬'
173, // '\xad'
174, // '®'
175, // '¯'
176, // '°'
177, // '±'
178, // '²'
179, // '³'
180, // '´'
181, // 'µ'
182, // '¶'
183, // '·'
184, // '¸'
185, // '¹'
186, // 'º'
187, // '»'
188, // '¼'
189, // '½'
190, // '¾'
191, // '¿'
192, // 'À'
193, // 'Á'
194, // 'Â'
195, // 'Ã'
196, // 'Ä'
197, // 'Å'
198, // 'Æ'
199, // 'Ç'
200, // 'È'
201, // 'É'
202, // 'Ê'
203, // 'Ë'
204, // 'Ì'
205, // 'Í'
206, // 'Î'
207, // 'Ï'
208, // 'Ð'
209, // 'Ñ'
210, // 'Ò'
211, // 'Ó'
212, // 'Ô'
213, // 'Õ'
214, // 'Ö'
215, // '×'
216, // 'Ø'
217, // 'Ù'
218, // 'Ú'
219, // 'Û'
220, // 'Ü'
221, // 'Ý'
222, // 'Þ'
223, // 'ß'
224, // 'à'
225, // 'á'
226, // 'â'
227, // 'ã'
228, // 'ä'
229, // 'å'
230, // 'æ'
231, // 'ç'
232, // 'è'
233, // 'é'
234, // 'ê'
235, // 'ë'
236, // 'ì'
237, // 'í'
238, // 'î'
239, // 'ï'
240, // 'ð'
241, // 'ñ'
242, // 'ò'
243, // 'ó'
244, // 'ô'
245, // 'õ'
246, // 'ö'
247, // '÷'
248, // 'ø'
249, // 'ù'
250, // 'ú'
251, // 'û'
252, // 'ü'
253, // 'ý'
254, // 'þ'
255, // 'ÿ'
}
var commonInputsInv = []byte{
't',
'e',
'/',
'o',
'a',
's',
'r',
'i',
'p',
'c',
'n',
'w',
'.',
'h',
'l',
'm',
'-',
'd',
'u',
'0',
'1',
'2',
'g',
'=',
':',
'b',
'f',
'3',
'y',
'5',
'&',
'_',
'4',
'v',
'9',
'6',
'7',
'8',
'k',
'%',
'?',
'x',
'C',
'D',
'A',
'S',
'F',
'I',
'B',
'E',
'j',
'P',
'T',
'z',
'R',
'N',
'M',
'+',
'L',
'O',
'q',
'H',
'G',
'W',
'U',
'V',
',',
'Y',
'K',
'J',
'Z',
'X',
'Q',
';',
')',
'(',
'~',
'[',
']',
'$',
'!',
'\'',
'*',
'@',
'\x00',
'\x01',
'\x02',
'\x03',
'\x04',
'\x05',
'\x06',
'\x07',
'\x08',
'\t',
'\n',
'\x0b',
'\x0c',
'\r',
'\x0e',
'\x0f',
'\x10',
'\x11',
'\x12',
'\x13',
'\x14',
'\x15',
'\x16',
'\x17',
'\x18',
'\x19',
'\x1a',
'\x1b',
'\x1c',
'\x1d',
'\x1e',
'\x1f',
' ',
'"',
'#',
'<',
'>',
'\\',
'^',
'`',
'{',
'|',
'}',
'\x7f',
'\x80',
'\x81',
'\x82',
'\x83',
'\x84',
'\x85',
'\x86',
'\x87',
'\x88',
'\x89',
'\x8a',
'\x8b',
'\x8c',
'\x8d',
'\x8e',
'\x8f',
'\x90',
'\x91',
'\x92',
'\x93',
'\x94',
'\x95',
'\x96',
'\x97',
'\x98',
'\x99',
'\x9a',
'\x9b',
'\x9c',
'\x9d',
'\x9e',
'\x9f',
'\xa0',
'\xa1',
'\xa2',
'\xa3',
'\xa4',
'\xa5',
'\xa6',
'\xa7',
'\xa8',
'\xa9',
'\xaa',
'\xab',
'\xac',
'\xad',
'\xae',
'\xaf',
'\xb0',
'\xb1',
'\xb2',
'\xb3',
'\xb4',
'\xb5',
'\xb6',
'\xb7',
'\xb8',
'\xb9',
'\xba',
'\xbb',
'\xbc',
'\xbd',
'\xbe',
'\xbf',
'\xc0',
'\xc1',
'\xc2',
'\xc3',
'\xc4',
'\xc5',
'\xc6',
'\xc7',
'\xc8',
'\xc9',
'\xca',
'\xcb',
'\xcc',
'\xcd',
'\xce',
'\xcf',
'\xd0',
'\xd1',
'\xd2',
'\xd3',
'\xd4',
'\xd5',
'\xd6',
'\xd7',
'\xd8',
'\xd9',
'\xda',
'\xdb',
'\xdc',
'\xdd',
'\xde',
'\xdf',
'\xe0',
'\xe1',
'\xe2',
'\xe3',
'\xe4',
'\xe5',
'\xe6',
'\xe7',
'\xe8',
'\xe9',
'\xea',
'\xeb',
'\xec',
'\xed',
'\xee',
'\xef',
'\xf0',
'\xf1',
'\xf2',
'\xf3',
'\xf4',
'\xf5',
'\xf6',
'\xf7',
'\xf8',
'\xf9',
'\xfa',
'\xfb',
'\xfc',
'\xfd',
'\xfe',
'\xff',
}
View file
@@ -1,314 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"bytes"
"encoding/binary"
"fmt"
"strconv"
)
func init() {
registerDecoder(versionV1, func(data []byte) decoder {
return newDecoderV1(data)
})
}
type decoderV1 struct {
data []byte
}
func newDecoderV1(data []byte) *decoderV1 {
return &decoderV1{
data: data,
}
}
func (d *decoderV1) getRoot() int {
if len(d.data) < footerSizeV1 {
return noneAddr
}
footer := d.data[len(d.data)-footerSizeV1:]
root := binary.LittleEndian.Uint64(footer[8:])
return int(root)
}
func (d *decoderV1) getLen() int {
if len(d.data) < footerSizeV1 {
return 0
}
footer := d.data[len(d.data)-footerSizeV1:]
dlen := binary.LittleEndian.Uint64(footer)
return int(dlen)
}
func (d *decoderV1) stateAt(addr int, prealloc fstState) (fstState, error) {
state, ok := prealloc.(*fstStateV1)
if ok && state != nil {
*state = fstStateV1{} // clear the struct
} else {
state = &fstStateV1{}
}
err := state.at(d.data, addr)
if err != nil {
return nil, err
}
return state, nil
}
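// fstStateV1 is the version-1 decoder's view of a single encoded state;
// top and bottom delimit the state's byte range within data.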
type fstStateV1 struct {
data []byte
top int
bottom int
numTrans int
// single trans only
singleTransChar byte
singleTransNext bool
singleTransAddr uint64
singleTransOut uint64
// shared
transSize int
outSize int
// multiple trans only
final bool
transTop int
transBottom int
destTop int
destBottom int
outTop int
outBottom int
outFinal int
}
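// isEncodedSingle reports whether this state uses the compact
// single-transition encoding, signaled by the high bit of the header byte.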
func (f *fstStateV1) isEncodedSingle() bool {
if f.data[f.top]>>7 > 0 {
return true
}
return false
}
func (f *fstStateV1) at(data []byte, addr int) error {
f.data = data
if addr == emptyAddr {
return f.atZero()
} else if addr == noneAddr {
return f.atNone()
}
if addr > len(data) || addr < 16 {
return fmt.Errorf("invalid address %d/%d", addr, len(data))
}
f.top = addr
f.bottom = addr
if f.isEncodedSingle() {
return f.atSingle(data, addr)
}
return f.atMulti(data, addr)
}
func (f *fstStateV1) atZero() error {
f.top = 0
f.bottom = 1
f.numTrans = 0
f.final = true
f.outFinal = 0
return nil
}
func (f *fstStateV1) atNone() error {
f.top = 0
f.bottom = 1
f.numTrans = 0
f.final = false
f.outFinal = 0
return nil
}
func (f *fstStateV1) atSingle(data []byte, addr int) error {
// handle single transition case
f.numTrans = 1
f.singleTransNext = data[f.top]&transitionNext > 0
f.singleTransChar = data[f.top] & maxCommon
if f.singleTransChar == 0 {
f.bottom-- // extra byte for uncommon
f.singleTransChar = data[f.bottom]
} else {
f.singleTransChar = decodeCommon(f.singleTransChar)
}
if f.singleTransNext {
// now we know the bottom, can compute next addr
f.singleTransAddr = uint64(f.bottom - 1)
f.singleTransOut = 0
} else {
f.bottom-- // extra byte with pack sizes
f.transSize, f.outSize = decodePackSize(data[f.bottom])
f.bottom -= f.transSize // exactly one trans
f.singleTransAddr = readPackedUint(data[f.bottom : f.bottom+f.transSize])
if f.outSize > 0 {
f.bottom -= f.outSize // exactly one out (could be length 0 though)
f.singleTransOut = readPackedUint(data[f.bottom : f.bottom+f.outSize])
} else {
f.singleTransOut = 0
}
// need to wait till we know bottom
if f.singleTransAddr != 0 {
f.singleTransAddr = uint64(f.bottom) - f.singleTransAddr
}
}
return nil
}
func (f *fstStateV1) atMulti(data []byte, addr int) error {
// handle multiple transitions case
f.final = data[f.top]&stateFinal > 0
f.numTrans = int(data[f.top] & maxNumTrans)
if f.numTrans == 0 {
f.bottom-- // extra byte for number of trans
f.numTrans = int(data[f.bottom])
if f.numTrans == 1 {
// can't really be 1 here; this is a special case that means 256
f.numTrans = 256
}
}
f.bottom-- // extra byte with pack sizes
f.transSize, f.outSize = decodePackSize(data[f.bottom])
f.transTop = f.bottom
f.bottom -= f.numTrans // one byte for each transition
f.transBottom = f.bottom
f.destTop = f.bottom
f.bottom -= f.numTrans * f.transSize
f.destBottom = f.bottom
if f.outSize > 0 {
f.outTop = f.bottom
f.bottom -= f.numTrans * f.outSize
f.outBottom = f.bottom
if f.final {
f.bottom -= f.outSize
f.outFinal = f.bottom
}
}
return nil
}
func (f *fstStateV1) Address() int {
return f.top
}
func (f *fstStateV1) Final() bool {
return f.final
}
func (f *fstStateV1) FinalOutput() uint64 {
if f.final && f.outSize > 0 {
return readPackedUint(f.data[f.outFinal : f.outFinal+f.outSize])
}
return 0
}
func (f *fstStateV1) NumTransitions() int {
return f.numTrans
}
func (f *fstStateV1) TransitionAt(i int) byte {
if f.isEncodedSingle() {
return f.singleTransChar
}
transitionKeys := f.data[f.transBottom:f.transTop]
return transitionKeys[f.numTrans-i-1]
}
func (f *fstStateV1) TransitionFor(b byte) (int, int, uint64) {
if f.isEncodedSingle() {
if f.singleTransChar == b {
return 0, int(f.singleTransAddr), f.singleTransOut
}
return -1, noneAddr, 0
}
transitionKeys := f.data[f.transBottom:f.transTop]
pos := bytes.IndexByte(transitionKeys, b)
if pos < 0 {
return -1, noneAddr, 0
}
transDests := f.data[f.destBottom:f.destTop]
dest := int(readPackedUint(transDests[pos*f.transSize : pos*f.transSize+f.transSize]))
if dest > 0 {
// convert delta
dest = f.bottom - dest
}
transVals := f.data[f.outBottom:f.outTop]
var out uint64
if f.outSize > 0 {
out = readPackedUint(transVals[pos*f.outSize : pos*f.outSize+f.outSize])
}
return f.numTrans - pos - 1, dest, out
}
func (f *fstStateV1) String() string {
rv := ""
rv += fmt.Sprintf("State: %d (%#x)", f.top, f.top)
if f.final {
rv += " final"
fout := f.FinalOutput()
if fout != 0 {
rv += fmt.Sprintf(" (%d)", fout)
}
}
rv += "\n"
rv += fmt.Sprintf("Data: % x\n", f.data[f.bottom:f.top+1])
for i := 0; i < f.numTrans; i++ {
transChar := f.TransitionAt(i)
_, transDest, transOut := f.TransitionFor(transChar)
rv += fmt.Sprintf(" - %d (%#x) '%s' ---> %d (%#x) with output: %d", transChar, transChar, string(transChar), transDest, transDest, transOut)
rv += "\n"
}
if f.numTrans == 0 {
rv += "\n"
}
return rv
}
func (f *fstStateV1) DotString(num int) string {
rv := ""
label := fmt.Sprintf("%d", num)
final := ""
if f.final {
final = ",peripheries=2"
}
rv += fmt.Sprintf(" %d [label=\"%s\"%s];\n", f.top, label, final)
for i := 0; i < f.numTrans; i++ {
transChar := f.TransitionAt(i)
_, transDest, transOut := f.TransitionFor(transChar)
out := ""
if transOut != 0 {
out = fmt.Sprintf("/%d", transOut)
}
rv += fmt.Sprintf(" %d -> %d [label=\"%s%s\"];\n", f.top, transDest, escapeInput(transChar), out)
}
return rv
}
func escapeInput(b byte) string {
x := strconv.AppendQuoteRune(nil, rune(b))
return string(x[1:(len(x) - 1)])
}
View file
@@ -1,227 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"encoding/binary"
"fmt"
"io"
)
const versionV1 = 1
const oneTransition = 1 << 7
const transitionNext = 1 << 6
const stateFinal = 1 << 6
const footerSizeV1 = 16
func init() {
registerEncoder(versionV1, func(w io.Writer) encoder {
return newEncoderV1(w)
})
}
type encoderV1 struct {
bw *writer
}
func newEncoderV1(w io.Writer) *encoderV1 {
return &encoderV1{
bw: newWriter(w),
}
}
func (e *encoderV1) reset(w io.Writer) {
e.bw.Reset(w)
}
func (e *encoderV1) start() error {
header := make([]byte, headerSize)
binary.LittleEndian.PutUint64(header, versionV1)
binary.LittleEndian.PutUint64(header[8:], uint64(0)) // type
n, err := e.bw.Write(header)
if err != nil {
return err
}
if n != headerSize {
return fmt.Errorf("short write of header %d/%d", n, headerSize)
}
return nil
}
func (e *encoderV1) encodeState(s *builderNode, lastAddr int) (int, error) {
if len(s.trans) == 0 && s.final && s.finalOutput == 0 {
return 0, nil
} else if len(s.trans) != 1 || s.final {
return e.encodeStateMany(s)
} else if !s.final && s.trans[0].out == 0 && s.trans[0].addr == lastAddr {
return e.encodeStateOneFinish(s, transitionNext)
}
return e.encodeStateOne(s)
}
func (e *encoderV1) encodeStateOne(s *builderNode) (int, error) {
start := uint64(e.bw.counter)
outPackSize := 0
if s.trans[0].out != 0 {
outPackSize = packedSize(s.trans[0].out)
err := e.bw.WritePackedUintIn(s.trans[0].out, outPackSize)
if err != nil {
return 0, err
}
}
delta := deltaAddr(start, uint64(s.trans[0].addr))
transPackSize := packedSize(delta)
err := e.bw.WritePackedUintIn(delta, transPackSize)
if err != nil {
return 0, err
}
packSize := encodePackSize(transPackSize, outPackSize)
err = e.bw.WriteByte(packSize)
if err != nil {
return 0, err
}
return e.encodeStateOneFinish(s, 0)
}
func (e *encoderV1) encodeStateOneFinish(s *builderNode, next byte) (int, error) {
enc := encodeCommon(s.trans[0].in)
// not a common input
if enc == 0 {
err := e.bw.WriteByte(s.trans[0].in)
if err != nil {
return 0, err
}
}
err := e.bw.WriteByte(oneTransition | next | enc)
if err != nil {
return 0, err
}
return e.bw.counter - 1, nil
}
func (e *encoderV1) encodeStateMany(s *builderNode) (int, error) {
start := uint64(e.bw.counter)
transPackSize := 0
outPackSize := packedSize(s.finalOutput)
anyOutputs := s.finalOutput != 0
for i := range s.trans {
delta := deltaAddr(start, uint64(s.trans[i].addr))
tsize := packedSize(delta)
if tsize > transPackSize {
transPackSize = tsize
}
osize := packedSize(s.trans[i].out)
if osize > outPackSize {
outPackSize = osize
}
anyOutputs = anyOutputs || s.trans[i].out != 0
}
if !anyOutputs {
outPackSize = 0
}
if anyOutputs {
// output final value
if s.final {
err := e.bw.WritePackedUintIn(s.finalOutput, outPackSize)
if err != nil {
return 0, err
}
}
// output transition values (in reverse)
for j := len(s.trans) - 1; j >= 0; j-- {
err := e.bw.WritePackedUintIn(s.trans[j].out, outPackSize)
if err != nil {
return 0, err
}
}
}
// output transition dests (in reverse)
for j := len(s.trans) - 1; j >= 0; j-- {
delta := deltaAddr(start, uint64(s.trans[j].addr))
err := e.bw.WritePackedUintIn(delta, transPackSize)
if err != nil {
return 0, err
}
}
// output transition keys (in reverse)
for j := len(s.trans) - 1; j >= 0; j-- {
err := e.bw.WriteByte(s.trans[j].in)
if err != nil {
return 0, err
}
}
packSize := encodePackSize(transPackSize, outPackSize)
err := e.bw.WriteByte(packSize)
if err != nil {
return 0, err
}
numTrans := encodeNumTrans(len(s.trans))
// if the number of transitions won't fit in the edge header byte
// write out separately
if numTrans == 0 {
if len(s.trans) == 256 {
// this wouldn't fit in a single byte, but reuse value 1
// which would have always fit in the edge header instead
err = e.bw.WriteByte(1)
if err != nil {
return 0, err
}
} else {
err = e.bw.WriteByte(byte(len(s.trans)))
if err != nil {
return 0, err
}
}
}
// finally write edge header
if s.final {
numTrans |= stateFinal
}
err = e.bw.WriteByte(numTrans)
if err != nil {
return 0, err
}
return e.bw.counter - 1, nil
}
func (e *encoderV1) finish(count, rootAddr int) error {
footer := make([]byte, footerSizeV1)
binary.LittleEndian.PutUint64(footer, uint64(count)) // number of entries
binary.LittleEndian.PutUint64(footer[8:], uint64(rootAddr)) // root addr
n, err := e.bw.Write(footer)
if err != nil {
return err
}
if n != footerSizeV1 {
return fmt.Errorf("short write of footer %d/%d", n, footerSizeV1)
}
err = e.bw.Flush()
if err != nil {
return err
}
return nil
}
View file
@@ -1,87 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"encoding/binary"
"fmt"
"io"
)
const headerSize = 16
type encoderConstructor func(w io.Writer) encoder
type decoderConstructor func([]byte) decoder
var encoders = map[int]encoderConstructor{}
var decoders = map[int]decoderConstructor{}
type encoder interface {
start() error
encodeState(s *builderNode, addr int) (int, error)
finish(count, rootAddr int) error
reset(w io.Writer)
}
func loadEncoder(ver int, w io.Writer) (encoder, error) {
if cons, ok := encoders[ver]; ok {
return cons(w), nil
}
return nil, fmt.Errorf("no encoder for version %d registered", ver)
}
func registerEncoder(ver int, cons encoderConstructor) {
encoders[ver] = cons
}
type decoder interface {
getRoot() int
getLen() int
stateAt(addr int, prealloc fstState) (fstState, error)
}
func loadDecoder(ver int, data []byte) (decoder, error) {
if cons, ok := decoders[ver]; ok {
return cons(data), nil
}
return nil, fmt.Errorf("no decoder for version %d registered", ver)
}
func registerDecoder(ver int, cons decoderConstructor) {
decoders[ver] = cons
}
func decodeHeader(header []byte) (ver int, typ int, err error) {
if len(header) < headerSize {
err = fmt.Errorf("invalid header < 16 bytes")
return
}
ver = int(binary.LittleEndian.Uint64(header[0:8]))
typ = int(binary.LittleEndian.Uint64(header[8:16]))
return
}
// fstState represents a state inside the FST runtime
// It is the main contract between the FST impl and the decoder
// The FST impl should work only with this interface, while only the decoder
// impl knows the physical representation.
type fstState interface {
Address() int
Final() bool
FinalOutput() uint64
NumTransitions() int
TransitionFor(b byte) (int, int, uint64)
TransitionAt(i int) byte
}
View file
@@ -1,300 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"io"
"github.com/willf/bitset"
)
// FST is an in-memory representation of a finite state transducer,
// capable of returning the uint64 value associated with
// each []byte key stored, as well as enumerating all of the keys
// in order.
type FST struct {
f io.Closer
ver int
len int
typ int
data []byte
decoder decoder
}
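// new decodes the header of data, loads the matching decoder version, and
// returns the resulting FST; f, when non-nil, is closed by FST.Close.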
func new(data []byte, f io.Closer) (rv *FST, err error) {
rv = &FST{
data: data,
f: f,
}
rv.ver, rv.typ, err = decodeHeader(data)
if err != nil {
return nil, err
}
rv.decoder, err = loadDecoder(rv.ver, rv.data)
if err != nil {
return nil, err
}
rv.len = rv.decoder.getLen()
return rv, nil
}
// Contains returns true if this FST contains the specified key.
func (f *FST) Contains(val []byte) (bool, error) {
_, exists, err := f.Get(val)
return exists, err
}
// Get returns the value associated with the key. NOTE: a value of zero
// does not imply the key does not exist, you must consult the second
// return value as well.
func (f *FST) Get(input []byte) (uint64, bool, error) {
return f.get(input, nil)
}
func (f *FST) get(input []byte, prealloc fstState) (uint64, bool, error) {
var total uint64
curr := f.decoder.getRoot()
state, err := f.decoder.stateAt(curr, prealloc)
if err != nil {
return 0, false, err
}
for _, c := range input {
_, curr, output := state.TransitionFor(c)
if curr == noneAddr {
return 0, false, nil
}
state, err = f.decoder.stateAt(curr, state)
if err != nil {
return 0, false, err
}
total += output
}
if state.Final() {
total += state.FinalOutput()
return total, true, nil
}
return 0, false, nil
}
// Version returns the encoding version used by this FST instance.
func (f *FST) Version() int {
return f.ver
}
// Len returns the number of entries in this FST instance.
func (f *FST) Len() int {
return f.len
}
// Type returns the type of this FST instance.
func (f *FST) Type() int {
return f.typ
}
// Close will unmap any mmap'd data (if managed by vellum) and it will close
// the backing file (if managed by vellum). You MUST call Close() for any
// FST instance that is created.
func (f *FST) Close() error {
if f.f != nil {
err := f.f.Close()
if err != nil {
return err
}
}
f.data = nil
f.decoder = nil
return nil
}
// Start returns the start state of this Automaton
func (f *FST) Start() int {
return f.decoder.getRoot()
}
// IsMatch reports whether this state is a matching state in this Automaton
func (f *FST) IsMatch(addr int) bool {
match, _ := f.IsMatchWithVal(addr)
return match
}
// CanMatch reports whether this state can ever transition to a matching state
// in this Automaton
func (f *FST) CanMatch(addr int) bool {
if addr == noneAddr {
return false
}
return true
}
// WillAlwaysMatch reports whether, from this state, the Automaton will always
// be in a matching state
func (f *FST) WillAlwaysMatch(int) bool {
return false
}
// Accept returns the next state for this Automaton on input of byte b
func (f *FST) Accept(addr int, b byte) int {
next, _ := f.AcceptWithVal(addr, b)
return next
}
// IsMatchWithVal reports whether this state is a matching state in this Automaton
// and also returns the final output value for this state
func (f *FST) IsMatchWithVal(addr int) (bool, uint64) {
s, err := f.decoder.stateAt(addr, nil)
if err != nil {
return false, 0
}
return s.Final(), s.FinalOutput()
}
// AcceptWithVal returns the next state for this Automaton on input of byte b
// and also returns the output value for the transition
func (f *FST) AcceptWithVal(addr int, b byte) (int, uint64) {
s, err := f.decoder.stateAt(addr, nil)
if err != nil {
return noneAddr, 0
}
_, next, output := s.TransitionFor(b)
return next, output
}
// Iterator returns a new Iterator capable of enumerating the key/value pairs
// between the provided startKeyInclusive and endKeyExclusive.
func (f *FST) Iterator(startKeyInclusive, endKeyExclusive []byte) (*FSTIterator, error) {
return newIterator(f, startKeyInclusive, endKeyExclusive, nil)
}
// Search returns a new Iterator capable of enumerating the key/value pairs
// between the provided startKeyInclusive and endKeyExclusive that also
// satisfy the provided automaton.
func (f *FST) Search(aut Automaton, startKeyInclusive, endKeyExclusive []byte) (*FSTIterator, error) {
return newIterator(f, startKeyInclusive, endKeyExclusive, aut)
}
// Debug is only intended for debug purposes; it simply asks the underlying
// decoder to visit each state, passing each one to the provided callback.
func (f *FST) Debug(callback func(int, interface{}) error) error {
addr := f.decoder.getRoot()
set := bitset.New(uint(addr))
stack := addrStack{addr}
stateNumber := 0
stack, addr = stack[:len(stack)-1], stack[len(stack)-1]
for addr != noneAddr {
if set.Test(uint(addr)) {
stack, addr = stack.Pop()
continue
}
set.Set(uint(addr))
state, err := f.decoder.stateAt(addr, nil)
if err != nil {
return err
}
err = callback(stateNumber, state)
if err != nil {
return err
}
for i := 0; i < state.NumTransitions(); i++ {
tchar := state.TransitionAt(i)
_, dest, _ := state.TransitionFor(tchar)
stack = append(stack, dest)
}
stateNumber++
stack, addr = stack.Pop()
}
return nil
}
type addrStack []int
func (a addrStack) Pop() (addrStack, int) {
l := len(a)
if l < 1 {
return a, noneAddr
}
return a[:l-1], a[l-1]
}
// Reader() returns a Reader instance that a single thread may use to
// retrieve data from the FST
func (f *FST) Reader() (*Reader, error) {
return &Reader{f: f}, nil
}
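// GetMinKey returns the lexicographically smallest key in the FST.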
func (f *FST) GetMinKey() ([]byte, error) {
var rv []byte
curr := f.decoder.getRoot()
state, err := f.decoder.stateAt(curr, nil)
if err != nil {
return nil, err
}
for !state.Final() {
nextTrans := state.TransitionAt(0)
_, curr, _ = state.TransitionFor(nextTrans)
state, err = f.decoder.stateAt(curr, state)
if err != nil {
return nil, err
}
rv = append(rv, nextTrans)
}
return rv, nil
}
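// GetMaxKey returns the lexicographically largest key in the FST.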
func (f *FST) GetMaxKey() ([]byte, error) {
var rv []byte
curr := f.decoder.getRoot()
state, err := f.decoder.stateAt(curr, nil)
if err != nil {
return nil, err
}
for state.NumTransitions() > 0 {
nextTrans := state.TransitionAt(state.NumTransitions() - 1)
_, curr, _ = state.TransitionFor(nextTrans)
state, err = f.decoder.stateAt(curr, state)
if err != nil {
return nil, err
}
rv = append(rv, nextTrans)
}
return rv, nil
}
// A Reader is meant for single-threaded use
type Reader struct {
f *FST
prealloc fstStateV1
}
func (r *Reader) Get(input []byte) (uint64, bool, error) {
return r.f.get(input, &r.prealloc)
}
View file
@@ -1,303 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"bytes"
)
// Iterator represents a means of visiting key/value pairs in order.
type Iterator interface {
// Current() returns the key/value pair currently pointed to.
// The []byte of the key is ONLY guaranteed to be valid until
// another call to Next/Seek/Close. If you need it beyond that
// point you MUST make a copy.
Current() ([]byte, uint64)
// Next() advances the iterator to the next key/value pair.
// If no more key/value pairs exist, ErrIteratorDone is returned.
Next() error
// Seek() advances the iterator to the specified key, or the next key
// if it does not exist.
// If no keys exist after that point, ErrIteratorDone is returned.
Seek(key []byte) error
// Reset resets the Iterator's internal state to allow for iterator
// reuse (e.g. pooling).
Reset(f *FST, startKeyInclusive, endKeyExclusive []byte, aut Automaton) error
// Close() frees any resources held by this iterator.
Close() error
}
// FSTIterator is a structure for iterating key/value pairs in this FST in
// lexicographic order. Iterators should be constructed with the FSTIterator
// method on the parent FST structure.
type FSTIterator struct {
f *FST
aut Automaton
startKeyInclusive []byte
endKeyExclusive []byte
statesStack []fstState
keysStack []byte
keysPosStack []int
valsStack []uint64
autStatesStack []int
nextStart []byte
}
func newIterator(f *FST, startKeyInclusive, endKeyExclusive []byte,
aut Automaton) (*FSTIterator, error) {
rv := &FSTIterator{}
err := rv.Reset(f, startKeyInclusive, endKeyExclusive, aut)
if err != nil {
return nil, err
}
return rv, nil
}
// Reset resets the Iterator's internal state to allow for iterator
// reuse (e.g. pooling).
func (i *FSTIterator) Reset(f *FST,
startKeyInclusive, endKeyExclusive []byte, aut Automaton) error {
if aut == nil {
aut = alwaysMatchAutomaton
}
i.f = f
i.startKeyInclusive = startKeyInclusive
i.endKeyExclusive = endKeyExclusive
i.aut = aut
return i.pointTo(startKeyInclusive)
}
// pointTo attempts to point us to the specified location
func (i *FSTIterator) pointTo(key []byte) error {
// tried to seek before start
if bytes.Compare(key, i.startKeyInclusive) < 0 {
key = i.startKeyInclusive
}
// tried to seek past end
if i.endKeyExclusive != nil &&
bytes.Compare(key, i.endKeyExclusive) > 0 {
key = i.endKeyExclusive
}
// reset any state, pointTo always starts over
i.statesStack = i.statesStack[:0]
i.keysStack = i.keysStack[:0]
i.keysPosStack = i.keysPosStack[:0]
i.valsStack = i.valsStack[:0]
i.autStatesStack = i.autStatesStack[:0]
root, err := i.f.decoder.stateAt(i.f.decoder.getRoot(), nil)
if err != nil {
return err
}
autStart := i.aut.Start()
maxQ := -1
// root is always part of the path
i.statesStack = append(i.statesStack, root)
i.autStatesStack = append(i.autStatesStack, autStart)
for j := 0; j < len(key); j++ {
keyJ := key[j]
curr := i.statesStack[len(i.statesStack)-1]
autCurr := i.autStatesStack[len(i.autStatesStack)-1]
pos, nextAddr, nextVal := curr.TransitionFor(keyJ)
if nextAddr == noneAddr {
// needed transition doesn't exist
// find last trans before the one we needed
for q := curr.NumTransitions() - 1; q >= 0; q-- {
if curr.TransitionAt(q) < keyJ {
maxQ = q
break
}
}
break
}
autNext := i.aut.Accept(autCurr, keyJ)
next, err := i.f.decoder.stateAt(nextAddr, nil)
if err != nil {
return err
}
i.statesStack = append(i.statesStack, next)
i.keysStack = append(i.keysStack, keyJ)
i.keysPosStack = append(i.keysPosStack, pos)
i.valsStack = append(i.valsStack, nextVal)
i.autStatesStack = append(i.autStatesStack, autNext)
continue
}
if !i.statesStack[len(i.statesStack)-1].Final() ||
!i.aut.IsMatch(i.autStatesStack[len(i.autStatesStack)-1]) ||
bytes.Compare(i.keysStack, key) < 0 {
return i.next(maxQ)
}
return nil
}
// Current returns the key and value currently pointed to by the iterator.
// If the iterator is not pointing at a valid value (because Iterator/Next/Seek
// returned an error previously), it may return nil, 0.
func (i *FSTIterator) Current() ([]byte, uint64) {
curr := i.statesStack[len(i.statesStack)-1]
if curr.Final() {
var total uint64
for _, v := range i.valsStack {
total += v
}
total += curr.FinalOutput()
return i.keysStack, total
}
return nil, 0
}
// Next advances this iterator to the next key/value pair. If there is none
// or the advancement goes beyond the configured endKeyExclusive, then
// ErrIteratorDone is returned.
func (i *FSTIterator) Next() error {
return i.next(-1)
}
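// next advances the iterator to the next key in order; when lastOffset is
// >= 0, iteration resumes from the transition after that offset at the top
// of the stack.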
func (i *FSTIterator) next(lastOffset int) error {
// remember where we started with keysStack in this next() call
i.nextStart = append(i.nextStart[:0], i.keysStack...)
nextOffset := lastOffset + 1
allowCompare := false
OUTER:
for {
curr := i.statesStack[len(i.statesStack)-1]
autCurr := i.autStatesStack[len(i.autStatesStack)-1]
if curr.Final() && i.aut.IsMatch(autCurr) && allowCompare {
// check to see if new keystack might have gone too far
if i.endKeyExclusive != nil &&
bytes.Compare(i.keysStack, i.endKeyExclusive) >= 0 {
return ErrIteratorDone
}
cmp := bytes.Compare(i.keysStack, i.nextStart)
if cmp > 0 {
// in final state greater than start key
return nil
}
}
numTrans := curr.NumTransitions()
INNER:
for nextOffset < numTrans {
t := curr.TransitionAt(nextOffset)
autNext := i.aut.Accept(autCurr, t)
if !i.aut.CanMatch(autNext) {
// TODO: potential optimization to skip nextOffset
// forwards more directly to something that the
// automaton likes rather than a linear scan?
nextOffset++
continue INNER
}
pos, nextAddr, v := curr.TransitionFor(t)
// the next slot in the statesStack might have an
// fstState instance that we can reuse
var nextPrealloc fstState
if len(i.statesStack) < cap(i.statesStack) {
nextPrealloc = i.statesStack[0:cap(i.statesStack)][len(i.statesStack)]
}
// push onto stack
next, err := i.f.decoder.stateAt(nextAddr, nextPrealloc)
if err != nil {
return err
}
i.statesStack = append(i.statesStack, next)
i.keysStack = append(i.keysStack, t)
i.keysPosStack = append(i.keysPosStack, pos)
i.valsStack = append(i.valsStack, v)
i.autStatesStack = append(i.autStatesStack, autNext)
nextOffset = 0
allowCompare = true
continue OUTER
}
// no more transitions, so need to backtrack and stack pop
if len(i.statesStack) <= 1 {
// stack len is 1 (root), can't go back further, we're done
break
}
// if the top of the stack represents a linear chain of states
// (i.e., a suffix of nodes linked by single transitions),
// then optimize by popping the suffix in one shot without
// going back all the way to the OUTER loop
var popNum int
for j := len(i.statesStack) - 1; j > 0; j-- {
if j == 1 || i.statesStack[j].NumTransitions() != 1 {
popNum = len(i.statesStack) - 1 - j
break
}
}
if popNum < 1 { // always pop at least 1 entry from the stacks
popNum = 1
}
nextOffset = i.keysPosStack[len(i.keysPosStack)-popNum] + 1
allowCompare = false
i.statesStack = i.statesStack[:len(i.statesStack)-popNum]
i.keysStack = i.keysStack[:len(i.keysStack)-popNum]
i.keysPosStack = i.keysPosStack[:len(i.keysPosStack)-popNum]
i.valsStack = i.valsStack[:len(i.valsStack)-popNum]
i.autStatesStack = i.autStatesStack[:len(i.autStatesStack)-popNum]
}
return ErrIteratorDone
}
// Seek advances this iterator to the specified key/value pair. If this key
// is not in the FST, Current() will return the next largest key. If this
// seek operation would go past the last key, or outside the configured
// startKeyInclusive/endKeyExclusive then ErrIteratorDone is returned.
func (i *FSTIterator) Seek(key []byte) error {
return i.pointTo(key)
}
// Close will free any resources held by this iterator.
func (i *FSTIterator) Close() error {
// at the moment we don't do anything,
// but wanted this for API completeness
return nil
}
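A minimal usage sketch of this iterator through vellum's public API (builder options and error handling are elided; `vellum.New`, `Load`, and `FST.Iterator` are the package's top-level entry points):
```
var buf bytes.Buffer
b, _ := vellum.New(&buf, nil) // keys must be inserted in sorted order
_ = b.Insert([]byte("cat"), 1)
_ = b.Insert([]byte("dog"), 2)
_ = b.Close()

fst, _ := vellum.Load(buf.Bytes())
itr, err := fst.Iterator(nil, nil) // nil bounds iterate the whole key space
for err == nil {
	k, v := itr.Current()
	fmt.Printf("%s = %d\n", k, v)
	err = itr.Next()
}
// err is ErrIteratorDone once the key space is exhausted
```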

View file

@ -1,10 +0,0 @@
module github.com/couchbase/vellum
go 1.12
require (
github.com/blevesearch/mmap-go v1.0.2
github.com/spf13/cobra v0.0.5
github.com/willf/bitset v1.1.10
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a // indirect
)

View file

@ -1,40 +0,0 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8=
github.com/blevesearch/mmap-go v1.0.2 h1:JtMHb+FgQCTTYIhtMvimw15dJwu1Y5lrZDMOFXVWPk0=
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE=
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM=
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
github.com/spf13/cobra v0.0.5 h1:f0B+LkLX6DtmRH1isoNA9VTtNUK9K8xYd28JNNfOv/s=
github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU=
github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo=
github.com/spf13/pflag v1.0.3 h1:zPAT6CGy6wXeQ7NtTnaTerfKOsV6V6F8agHXFiazDkg=
github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4=
github.com/spf13/viper v1.3.2/go.mod h1:ZiWeW+zYFKm7srdB9IoDzzZXaJaI5eL9QjNiN/DMA2s=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0=
github.com/willf/bitset v1.1.10 h1:NotGKqX0KwQ72NUzqrjZq5ipPNDQex9lo3WpaS8L2sc=
github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q=
golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
golang.org/x/sys v0.0.0-20181205085412-a5c9d58dba9a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20181221143128-b4a75ba826a6/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a h1:aYOabOQFp6Vj6W1F80affTUvO9UxmJRx8K0gsfABByQ=
golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

View file

@ -1,203 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View file

@ -1,33 +0,0 @@
# levenshtein
levenshtein automaton
This package makes it fast and simple to build a finite deterministic automaton that computes the Levenshtein distance from a given string.
# Sample usage:
```
// build a re-usable builder
lb, err := NewLevenshteinAutomatonBuilder(2, false)
if err != nil {
	log.Errorf("building levenshtein automaton builder err: %v", err)
}
origTerm := "couchbasefts"
dfa, err := lb.BuildDfa("couchbases", 2)
if err != nil {
	log.Errorf("building dfa err: %v", err)
}
ed := dfa.eval([]byte(origTerm))
if ed.distance() != 2 {
	log.Errorf("expected distance 2, actual: %d", ed.distance())
}
```
This implementation is inspired by [this blog post](https://fulmicoton.com/posts/levenshtein/) and is intended to be
a port of the original Rust implementation: https://github.com/tantivy-search/levenshtein-automata
Micro-benchmark results against the current vellum/levenshtein implementation are shown below.
```
BenchmarkNewEditDistance1-8 30000 52684 ns/op 89985 B/op 295 allocs/op
BenchmarkOlderEditDistance1-8 10000 132931 ns/op 588892 B/op 363 allocs/op
BenchmarkNewEditDistance2-8 10000 199127 ns/op 377532 B/op 1019 allocs/op
BenchmarkOlderEditDistance2-8 2000 988109 ns/op 4236609 B/op 1898 allocs/op
```

View file

@ -1,125 +0,0 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import (
"fmt"
"sort"
"unicode/utf8"
)
type FullCharacteristicVector []uint32
func (fcv FullCharacteristicVector) shiftAndMask(offset, mask uint32) uint32 {
bucketID := offset / 32
align := offset - bucketID*32
if align == 0 {
return fcv[bucketID] & mask
}
left := fcv[bucketID] >> align
right := fcv[bucketID+1] << (32 - align)
return (left | right) & mask
}
type tuple struct {
char rune
fcv FullCharacteristicVector
}
type sortRunes []rune
func (s sortRunes) Less(i, j int) bool {
return s[i] < s[j]
}
func (s sortRunes) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s sortRunes) Len() int {
return len(s)
}
func sortRune(r []rune) []rune {
sort.Sort(sortRunes(r))
return r
}
type Alphabet struct {
charset []tuple
index uint32
}
func (a *Alphabet) resetNext() {
a.index = 0
}
func (a *Alphabet) next() (rune, FullCharacteristicVector, error) {
if int(a.index) >= len(a.charset) {
return 0, nil, fmt.Errorf("eof")
}
rv := a.charset[a.index]
a.index++
return rv.char, rv.fcv, nil
}
func dedupe(in string) string {
lookUp := make(map[rune]struct{}, len(in))
var rv string
for len(in) > 0 {
r, size := utf8.DecodeRuneInString(in)
in = in[size:]
if _, ok := lookUp[r]; !ok {
rv += string(r)
lookUp[r] = struct{}{}
}
}
return rv
}
func queryChars(qChars string) Alphabet {
chars := dedupe(qChars)
inChars := sortRune([]rune(chars))
charsets := make([]tuple, 0, len(inChars))
for _, c := range inChars {
tempChars := qChars
var bits []uint32
for len(tempChars) > 0 {
var chunk string
if len(tempChars) > 32 {
chunk = tempChars[0:32]
tempChars = tempChars[32:]
} else {
chunk = tempChars
tempChars = tempChars[:0]
}
chunkBits := uint32(0)
bit := uint32(1)
for _, chr := range chunk {
if chr == c {
chunkBits |= bit
}
bit <<= 1
}
bits = append(bits, chunkBits)
}
bits = append(bits, 0)
charsets = append(charsets, tuple{char: c, fcv: FullCharacteristicVector(bits)})
}
return Alphabet{charset: charsets}
}
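To make the characteristic vectors concrete: each distinct rune in the query gets a bitset marking the positions where it occurs. A small in-package sketch using the unexported helpers above:
```
a := queryChars("aba")
a.resetNext()
for {
	r, fcv, err := a.next()
	if err != nil {
		break // "eof" once the charset is exhausted
	}
	fmt.Printf("%c -> %03b\n", r, fcv[0])
}
// prints: a -> 101 (positions 0 and 2), then b -> 010 (position 1)
```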

View file

@ -1,250 +0,0 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import (
"fmt"
"math"
)
const SinkState = uint32(0)
type DFA struct {
transitions [][256]uint32
distances []Distance
initState int
ed uint8
}
// initialState returns the initial state.
func (d *DFA) initialState() int {
return d.initState
}
// distance returns the Levenshtein distance associated with the
// given state.
func (d *DFA) distance(stateId int) Distance {
return d.distances[stateId]
}
// numStates returns the number of states in the `DFA`.
func (d *DFA) numStates() int {
return len(d.transitions)
}
// transition returns the destination state reached after consuming a given byte.
func (d *DFA) transition(fromState int, b uint8) int {
return int(d.transitions[fromState][b])
}
func (d *DFA) eval(bytes []uint8) Distance {
state := d.initialState()
for _, b := range bytes {
state = d.transition(state, b)
}
return d.distance(state)
}
func (d *DFA) Start() int {
return int(d.initialState())
}
func (d *DFA) IsMatch(state int) bool {
if _, ok := d.distance(state).(Exact); ok {
return true
}
return false
}
func (d *DFA) CanMatch(state int) bool {
return state > 0 && state < d.numStates()
}
func (d *DFA) Accept(state int, b byte) int {
return int(d.transition(state, b))
}
// WillAlwaysMatch reports whether the specified state will always end in a
// matching state.
func (d *DFA) WillAlwaysMatch(state int) bool {
return false
}
func fill(dest []uint32, val uint32) {
for i := range dest {
dest[i] = val
}
}
func fillTransitions(dest *[256]uint32, val uint32) {
for i := range dest {
dest[i] = val
}
}
type Utf8DFAStateBuilder struct {
dfaBuilder *Utf8DFABuilder
stateID uint32
defaultSuccessor []uint32
}
func (sb *Utf8DFAStateBuilder) addTransitionID(fromStateID uint32, b uint8,
toStateID uint32) {
sb.dfaBuilder.transitions[fromStateID][b] = toStateID
}
func (sb *Utf8DFAStateBuilder) addTransition(in rune, toStateID uint32) {
fromStateID := sb.stateID
chars := []byte(string(in))
lastByte := chars[len(chars)-1]
for i, ch := range chars[:len(chars)-1] {
remNumBytes := len(chars) - i - 1
defaultSuccessor := sb.defaultSuccessor[remNumBytes]
intermediateStateID := sb.dfaBuilder.transitions[fromStateID][ch]
if intermediateStateID == defaultSuccessor {
intermediateStateID = sb.dfaBuilder.allocate()
fillTransitions(&sb.dfaBuilder.transitions[intermediateStateID],
sb.defaultSuccessor[remNumBytes-1])
}
sb.addTransitionID(fromStateID, ch, intermediateStateID)
fromStateID = intermediateStateID
}
toStateIDDecoded := sb.dfaBuilder.getOrAllocate(original(toStateID))
sb.addTransitionID(fromStateID, lastByte, toStateIDDecoded)
}
type Utf8StateId uint32
func original(stateId uint32) Utf8StateId {
return predecessor(stateId, 0)
}
func predecessor(stateId uint32, numSteps uint8) Utf8StateId {
return Utf8StateId(stateId*4 + uint32(numSteps))
}
// Utf8DFABuilder makes it possible to define a DFA
// that takes unicode characters, and to build a `DFA`
// that operates on utf-8 encoded bytes.
type Utf8DFABuilder struct {
index []uint32
distances []Distance
transitions [][256]uint32
initialState uint32
numStates uint32
maxNumStates uint32
}
func withMaxStates(maxStates uint32) *Utf8DFABuilder {
rv := &Utf8DFABuilder{
index: make([]uint32, maxStates*2+100),
distances: make([]Distance, 0, maxStates),
transitions: make([][256]uint32, 0, maxStates),
maxNumStates: maxStates,
}
for i := range rv.index {
rv.index[i] = math.MaxUint32
}
return rv
}
func (dfab *Utf8DFABuilder) allocate() uint32 {
newState := dfab.numStates
dfab.numStates++
dfab.distances = append(dfab.distances, Atleast{d: 255})
dfab.transitions = append(dfab.transitions, [256]uint32{})
return newState
}
func (dfab *Utf8DFABuilder) getOrAllocate(state Utf8StateId) uint32 {
if int(state) >= cap(dfab.index) {
cloneIndex := make([]uint32, int(state)*2)
copy(cloneIndex, dfab.index)
dfab.index = cloneIndex
}
if dfab.index[state] != math.MaxUint32 {
return dfab.index[state]
}
nstate := dfab.allocate()
dfab.index[state] = nstate
return nstate
}
func (dfab *Utf8DFABuilder) setInitialState(iState uint32) {
decodedID := dfab.getOrAllocate(original(iState))
dfab.initialState = decodedID
}
func (dfab *Utf8DFABuilder) build(ed uint8) *DFA {
return &DFA{
transitions: dfab.transitions,
distances: dfab.distances,
initState: int(dfab.initialState),
ed: ed,
}
}
func (dfab *Utf8DFABuilder) addState(state, default_suc_orig uint32,
distance Distance) (*Utf8DFAStateBuilder, error) {
if state > dfab.maxNumStates {
return nil, fmt.Errorf("state id is larger than maxNumStates")
}
stateID := dfab.getOrAllocate(original(state))
dfab.distances[stateID] = distance
defaultSuccID := dfab.getOrAllocate(original(default_suc_orig))
// creates a chain of states of predecessors of `default_suc_orig`.
// Accepting k-bytes (whatever the bytes are) from `predecessor_states[k-1]`
// leads to the `default_suc_orig` state.
predecessorStates := []uint32{defaultSuccID,
defaultSuccID,
defaultSuccID,
defaultSuccID}
for numBytes := uint8(1); numBytes < 4; numBytes++ {
predecessorState := predecessor(default_suc_orig, numBytes)
predecessorStateID := dfab.getOrAllocate(predecessorState)
predecessorStates[numBytes] = predecessorStateID
succ := predecessorStates[numBytes-1]
fillTransitions(&dfab.transitions[predecessorStateID], succ)
}
// 1-byte encoded chars.
fill(dfab.transitions[stateID][0:192], predecessorStates[0])
// 2-bytes encoded chars.
fill(dfab.transitions[stateID][192:224], predecessorStates[1])
// 3-bytes encoded chars.
fill(dfab.transitions[stateID][224:240], predecessorStates[2])
// 4-bytes encoded chars.
fill(dfab.transitions[stateID][240:256], predecessorStates[3])
return &Utf8DFAStateBuilder{
dfaBuilder: dfab,
stateID: stateID,
defaultSuccessor: predecessorStates}, nil
}
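The `Utf8StateId` arithmetic reserves four consecutive slots per logical state, one for each possible count of remaining UTF-8 bytes; a tiny illustration that follows directly from the definitions of `original` and `predecessor`:
```
fmt.Println(original(7))       // 28 == 7*4 + 0
fmt.Println(predecessor(7, 2)) // 30 == 7*4 + 2
```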

View file

@ -1,64 +0,0 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import "fmt"
// StateLimit is the maximum number of states allowed
const StateLimit = 10000
// ErrTooManyStates is returned if you attempt to build a Levenshtein
// automaton which requires too many states.
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
StateLimit)
// LevenshteinAutomatonBuilder wraps a precomputed
// data structure that allows producing small (but not minimal) DFAs.
type LevenshteinAutomatonBuilder struct {
pDfa *ParametricDFA
}
// NewLevenshteinAutomatonBuilder creates a
// reusable, thread-safe Levenshtein automaton builder.
// `maxDistance` - maximum distance considered by the automaton.
// `transposition` - assign a distance of 1 for transpositions.
//
// Building this automaton builder is computationally intensive.
// While it takes only a few milliseconds for `d=2`, it grows
// exponentially with `d`. It is only reasonable for `d <= 5`.
func NewLevenshteinAutomatonBuilder(maxDistance uint8,
transposition bool) (*LevenshteinAutomatonBuilder, error) {
lnfa := newLevenshtein(maxDistance, transposition)
pdfa, err := fromNfa(lnfa)
if err != nil {
return nil, err
}
return &LevenshteinAutomatonBuilder{pDfa: pdfa}, nil
}
// BuildDfa builds the levenshtein automaton for serving
// queries with a given edit distance.
func (lab *LevenshteinAutomatonBuilder) BuildDfa(query string,
fuzziness uint8) (*DFA, error) {
return lab.pDfa.buildDfa(query, fuzziness, false)
}
// MaxDistance returns the maximum edit distance supported by the
// LevenshteinAutomatonBuilder.
func (lab *LevenshteinAutomatonBuilder) MaxDistance() uint8 {
return lab.pDfa.maxDistance
}
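A hedged sketch of driving the builder together with the DFA's automaton methods defined earlier in this package (error handling abbreviated):
```
lb, err := NewLevenshteinAutomatonBuilder(2, false)
if err != nil {
	panic(err)
}
dfa, err := lb.BuildDfa("couchbase", 2)
if err != nil {
	panic(err)
}
state := dfa.Start()
for _, b := range []byte("couchbases") {
	state = dfa.Accept(state, b)
}
fmt.Println(dfa.IsMatch(state)) // true: edit distance 1 <= 2
```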

View file

@ -1,292 +0,0 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import (
"math"
"sort"
)
// Distance is a Levenshtein distance computed by a Levenshtein automaton.
//
// Levenshtein automata can only compute the exact Levenshtein distance
// up to a given maximum distance.
//
// Beyond this distance, the automaton will invariably
// return Atleast{maxDistance + 1}.
type Distance interface {
distance() uint8
}
type Exact struct {
d uint8
}
func (e Exact) distance() uint8 {
return e.d
}
type Atleast struct {
d uint8
}
func (a Atleast) distance() uint8 {
return a.d
}
func characteristicVector(query []rune, c rune) uint64 {
chi := uint64(0)
for i := 0; i < len(query); i++ {
if query[i] == c {
chi |= 1 << uint64(i)
}
}
return chi
}
type NFAState struct {
Offset uint32
Distance uint8
InTranspose bool
}
type NFAStates []NFAState
func (ns NFAStates) Len() int {
return len(ns)
}
func (ns NFAStates) Less(i, j int) bool {
if ns[i].Offset != ns[j].Offset {
return ns[i].Offset < ns[j].Offset
}
if ns[i].Distance != ns[j].Distance {
return ns[i].Distance < ns[j].Distance
}
return !ns[i].InTranspose && ns[j].InTranspose
}
func (ns NFAStates) Swap(i, j int) {
ns[i], ns[j] = ns[j], ns[i]
}
func (ns *NFAState) imply(other NFAState) bool {
transposeImply := ns.InTranspose
if !other.InTranspose {
transposeImply = !other.InTranspose
}
deltaOffset := ns.Offset - other.Offset
if ns.Offset < other.Offset {
deltaOffset = other.Offset - ns.Offset
}
if transposeImply {
return uint32(other.Distance) >= (uint32(ns.Distance) + deltaOffset)
}
return uint32(other.Distance) > (uint32(ns.Distance) + deltaOffset)
}
type MultiState struct {
states []NFAState
}
func (ms *MultiState) States() []NFAState {
return ms.states
}
func (ms *MultiState) Clear() {
ms.states = ms.states[:0]
}
func newMultiState() *MultiState {
return &MultiState{states: make([]NFAState, 0)}
}
func (ms *MultiState) normalize() uint32 {
minOffset := uint32(math.MaxUint32)
for _, s := range ms.states {
if s.Offset < minOffset {
minOffset = s.Offset
}
}
if minOffset == uint32(math.MaxUint32) {
minOffset = 0
}
for i := 0; i < len(ms.states); i++ {
ms.states[i].Offset -= minOffset
}
sort.Sort(NFAStates(ms.states))
return minOffset
}
func (ms *MultiState) addStates(nState NFAState) {
for _, s := range ms.states {
if s.imply(nState) {
return
}
}
i := 0
for i < len(ms.states) {
if nState.imply(ms.states[i]) {
ms.states = append(ms.states[:i], ms.states[i+1:]...)
} else {
i++
}
}
ms.states = append(ms.states, nState)
}
func extractBit(bitset uint64, pos uint8) bool {
shift := bitset >> pos
bit := shift & 1
return bit == uint64(1)
}
func dist(left, right uint32) uint32 {
if left > right {
return left - right
}
return right - left
}
type LevenshteinNFA struct {
mDistance uint8
damerau bool
}
func newLevenshtein(maxD uint8, transposition bool) *LevenshteinNFA {
return &LevenshteinNFA{mDistance: maxD,
damerau: transposition,
}
}
func (la *LevenshteinNFA) maxDistance() uint8 {
return la.mDistance
}
func (la *LevenshteinNFA) msDiameter() uint8 {
return 2*la.mDistance + 1
}
func (la *LevenshteinNFA) initialStates() *MultiState {
ms := MultiState{}
nfaState := NFAState{}
ms.addStates(nfaState)
return &ms
}
func (la *LevenshteinNFA) multistateDistance(ms *MultiState,
queryLen uint32) Distance {
minDistance := Atleast{d: la.mDistance + 1}
for _, s := range ms.states {
t := s.Distance + uint8(dist(queryLen, s.Offset))
if t <= uint8(la.mDistance) {
if minDistance.distance() > t {
minDistance.d = t
}
}
}
if minDistance.distance() == la.mDistance+1 {
return Atleast{d: la.mDistance + 1}
}
return minDistance
}
func (la *LevenshteinNFA) simpleTransition(state NFAState,
symbol uint64, ms *MultiState) {
if state.Distance < la.mDistance {
// insertion
ms.addStates(NFAState{Offset: state.Offset,
Distance: state.Distance + 1,
InTranspose: false})
// substitution
ms.addStates(NFAState{Offset: state.Offset + 1,
Distance: state.Distance + 1,
InTranspose: false})
n := la.mDistance + 1 - state.Distance
for d := uint8(1); d < n; d++ {
if extractBit(symbol, d) {
// for d > 0, as many deletion and character match
ms.addStates(NFAState{Offset: state.Offset + 1 + uint32(d),
Distance: state.Distance + d,
InTranspose: false})
}
}
if la.damerau && extractBit(symbol, 1) {
ms.addStates(NFAState{
Offset: state.Offset,
Distance: state.Distance + 1,
InTranspose: true})
}
}
if extractBit(symbol, 0) {
ms.addStates(NFAState{Offset: state.Offset + 1,
Distance: state.Distance,
InTranspose: false})
}
if state.InTranspose && extractBit(symbol, 0) {
ms.addStates(NFAState{Offset: state.Offset + 2,
Distance: state.Distance,
InTranspose: false})
}
}
func (la *LevenshteinNFA) transition(cState *MultiState,
dState *MultiState, scv uint64) {
dState.Clear()
mask := (uint64(1) << la.msDiameter()) - uint64(1)
for _, state := range cState.states {
cv := (scv >> state.Offset) & mask
la.simpleTransition(state, cv, dState)
}
sort.Sort(NFAStates(dState.states))
}
func (la *LevenshteinNFA) computeDistance(query, other []rune) Distance {
cState := la.initialStates()
nState := newMultiState()
for _, i := range other {
nState.Clear()
chi := characteristicVector(query, i)
la.transition(cState, nState, chi)
cState, nState = nState, cState
}
return la.multistateDistance(cState, uint32(len(query)))
}
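The NFA can also be exercised directly for one-off distance checks; a minimal in-package sketch:
```
nfa := newLevenshtein(1, false)
fmt.Println(nfa.computeDistance([]rune("cat"), []rune("cats")).distance()) // 1: one insertion
fmt.Println(nfa.computeDistance([]rune("cat"), []rune("dogs")).distance()) // 2: over maxDistance, i.e. Atleast{2}
```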

View file

@ -1,349 +0,0 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package levenshtein
import (
"crypto/md5"
"encoding/json"
"fmt"
"math"
)
type ParametricState struct {
shapeID uint32
offset uint32
}
func newParametricState() ParametricState {
return ParametricState{}
}
func (ps *ParametricState) isDeadEnd() bool {
return ps.shapeID == 0
}
type Transition struct {
destShapeID uint32
deltaOffset uint32
}
func (t *Transition) apply(state ParametricState) ParametricState {
ps := ParametricState{
shapeID: t.destShapeID}
// don't need any offset if we are in the dead state,
// this ensures we have only one dead state.
if t.destShapeID != 0 {
ps.offset = state.offset + t.deltaOffset
}
return ps
}
type ParametricStateIndex struct {
stateIndex []uint32
stateQueue []ParametricState
numOffsets uint32
}
func newParametricStateIndex(queryLen,
numParamState uint32) ParametricStateIndex {
numOffsets := queryLen + 1
if numParamState == 0 {
numParamState = numOffsets
}
maxNumStates := numParamState * numOffsets
psi := ParametricStateIndex{
stateIndex: make([]uint32, maxNumStates),
stateQueue: make([]ParametricState, 0, 150),
numOffsets: numOffsets,
}
for i := uint32(0); i < maxNumStates; i++ {
psi.stateIndex[i] = math.MaxUint32
}
return psi
}
func (psi *ParametricStateIndex) numStates() int {
return len(psi.stateQueue)
}
func (psi *ParametricStateIndex) maxNumStates() int {
return len(psi.stateIndex)
}
func (psi *ParametricStateIndex) get(stateID uint32) ParametricState {
return psi.stateQueue[stateID]
}
func (psi *ParametricStateIndex) getOrAllocate(ps ParametricState) uint32 {
bucket := ps.shapeID*psi.numOffsets + ps.offset
if bucket < uint32(len(psi.stateIndex)) &&
psi.stateIndex[bucket] != math.MaxUint32 {
return psi.stateIndex[bucket]
}
nState := uint32(len(psi.stateQueue))
psi.stateQueue = append(psi.stateQueue, ps)
psi.stateIndex[bucket] = nState
return nState
}
type ParametricDFA struct {
distance []uint8
transitions []Transition
maxDistance uint8
transitionStride uint32
diameter uint32
}
func (pdfa *ParametricDFA) initialState() ParametricState {
return ParametricState{shapeID: 1}
}
// isPrefixSink returns true iff, whatever characters come afterward,
// we will never reach a shorter distance.
func (pdfa *ParametricDFA) isPrefixSink(state ParametricState, queryLen uint32) bool {
if state.isDeadEnd() {
return true
}
remOffset := queryLen - state.offset
if remOffset < pdfa.diameter {
stateDistances := pdfa.distance[pdfa.diameter*state.shapeID:]
prefixDistance := stateDistances[remOffset]
if prefixDistance > pdfa.maxDistance {
return false
}
for _, d := range stateDistances {
if d < prefixDistance {
return false
}
}
return true
}
return false
}
func (pdfa *ParametricDFA) numStates() int {
return len(pdfa.transitions) / int(pdfa.transitionStride)
}
func min(x, y uint32) uint32 {
if x < y {
return x
}
return y
}
func (pdfa *ParametricDFA) transition(state ParametricState,
chi uint32) Transition {
return pdfa.transitions[pdfa.transitionStride*state.shapeID+chi]
}
func (pdfa *ParametricDFA) getDistance(state ParametricState,
qLen uint32) Distance {
remainingOffset := qLen - state.offset
if state.isDeadEnd() || remainingOffset >= pdfa.diameter {
return Atleast{d: pdfa.maxDistance + 1}
}
dist := pdfa.distance[int(pdfa.diameter*state.shapeID)+int(remainingOffset)]
if dist > pdfa.maxDistance {
return Atleast{d: dist}
}
return Exact{d: dist}
}
func (pdfa *ParametricDFA) computeDistance(left, right string) Distance {
state := pdfa.initialState()
leftChars := []rune(left)
for _, chr := range []rune(right) {
start := state.offset
stop := min(start+pdfa.diameter, uint32(len(leftChars)))
chi := characteristicVector(leftChars[start:stop], chr)
transition := pdfa.transition(state, uint32(chi))
state = transition.apply(state)
if state.isDeadEnd() {
return Atleast{d: pdfa.maxDistance + 1}
}
}
return pdfa.getDistance(state, uint32(len(left)))
}
func (pdfa *ParametricDFA) buildDfa(query string, distance uint8,
prefix bool) (*DFA, error) {
qLen := uint32(len([]rune(query)))
alphabet := queryChars(query)
psi := newParametricStateIndex(qLen, uint32(pdfa.numStates()))
maxNumStates := psi.maxNumStates()
deadEndStateID := psi.getOrAllocate(newParametricState())
if deadEndStateID != 0 {
return nil, fmt.Errorf("invalid dead end state")
}
initialStateID := psi.getOrAllocate(pdfa.initialState())
dfaBuilder := withMaxStates(uint32(maxNumStates))
mask := uint32((1 << pdfa.diameter) - 1)
var stateID int
for stateID = 0; stateID < StateLimit; stateID++ {
if stateID == psi.numStates() {
break
}
state := psi.get(uint32(stateID))
if prefix && pdfa.isPrefixSink(state, qLen) {
distance := pdfa.getDistance(state, qLen)
dfaBuilder.addState(uint32(stateID), uint32(stateID), distance)
} else {
transition := pdfa.transition(state, 0)
defSuccessor := transition.apply(state)
defSuccessorID := psi.getOrAllocate(defSuccessor)
distance := pdfa.getDistance(state, qLen)
stateBuilder, err := dfaBuilder.addState(uint32(stateID), defSuccessorID, distance)
if err != nil {
return nil, fmt.Errorf("parametric_dfa: buildDfa, err: %v", err)
}
alphabet.resetNext()
chr, cv, err := alphabet.next()
for err == nil {
chi := cv.shiftAndMask(state.offset, mask)
transition := pdfa.transition(state, chi)
destState := transition.apply(state)
destStateID := psi.getOrAllocate(destState)
stateBuilder.addTransition(chr, destStateID)
chr, cv, err = alphabet.next()
}
}
}
if stateID == StateLimit {
return nil, ErrTooManyStates
}
dfaBuilder.setInitialState(initialStateID)
return dfaBuilder.build(distance), nil
}
func fromNfa(nfa *LevenshteinNFA) (*ParametricDFA, error) {
lookUp := newHash()
lookUp.getOrAllocate(*newMultiState())
initialState := nfa.initialStates()
lookUp.getOrAllocate(*initialState)
maxDistance := nfa.maxDistance()
msDiameter := nfa.msDiameter()
numChi := 1 << msDiameter
chiValues := make([]uint64, numChi)
for i := 0; i < numChi; i++ {
chiValues[i] = uint64(i)
}
transitions := make([]Transition, 0, numChi*int(msDiameter))
var stateID int
for stateID = 0; stateID < StateLimit; stateID++ {
if stateID == len(lookUp.items) {
break
}
for _, chi := range chiValues {
destMs := newMultiState()
ms := lookUp.getFromID(stateID)
nfa.transition(ms, destMs, chi)
translation := destMs.normalize()
destID := lookUp.getOrAllocate(*destMs)
transitions = append(transitions, Transition{
destShapeID: uint32(destID),
deltaOffset: translation,
})
}
}
if stateID == StateLimit {
return nil, ErrTooManyStates
}
ns := len(lookUp.items)
diameter := int(msDiameter)
distances := make([]uint8, 0, diameter*ns)
for stateID := 0; stateID < ns; stateID++ {
ms := lookUp.getFromID(stateID)
for offset := 0; offset < diameter; offset++ {
dist := nfa.multistateDistance(ms, uint32(offset))
distances = append(distances, dist.distance())
}
}
return &ParametricDFA{
diameter: uint32(msDiameter),
transitions: transitions,
maxDistance: maxDistance,
transitionStride: uint32(numChi),
distance: distances,
}, nil
}
type hash struct {
index map[[16]byte]int
items []MultiState
}
func newHash() *hash {
return &hash{
index: make(map[[16]byte]int, 100),
items: make([]MultiState, 0, 100),
}
}
func (h *hash) getOrAllocate(m MultiState) int {
size := len(h.items)
var exists bool
var pos int
md5 := getHash(&m)
if pos, exists = h.index[md5]; !exists {
h.index[md5] = size
pos = size
h.items = append(h.items, m)
}
return pos
}
func (h *hash) getFromID(id int) *MultiState {
return &h.items[id]
}
func getHash(ms *MultiState) [16]byte {
msBytes := []byte{}
for _, state := range ms.states {
jsonBytes, _ := json.Marshal(&state)
msBytes = append(msBytes, jsonBytes...)
}
return md5.Sum(msBytes)
}
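`fromNfa` pays the precomputation cost once; `buildDfa` then stamps out query-specific DFAs cheaply. A hedged in-package sketch:
```
pdfa, err := fromNfa(newLevenshtein(2, true))
if err != nil {
	panic(err)
}
fmt.Println(pdfa.computeDistance("couchbase", "couchbases").distance()) // 1

dfa, err := pdfa.buildDfa("couchbase", 2, false)
if err != nil {
	panic(err)
}
fmt.Println(dfa.eval([]byte("couchbases")).distance()) // 1, via the compiled DFA
```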

View file

@ -1,188 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"bytes"
)
// MergeFunc is used to choose the new value for a key when merging a slice
// of iterators, and the same key is observed with multiple values.
// Values presented to the MergeFunc will be in the same order as the
// original slice creating the MergeIterator. This allows some MergeFunc
// implementations to prioritize one iterator over another.
type MergeFunc func([]uint64) uint64
// MergeIterator implements the Iterator interface by traversing a slice
// of iterators and merging the contents of them. If the same key exists
// in multiple underlying iterators, a user-provided MergeFunc will be
// invoked to choose the new value.
type MergeIterator struct {
itrs []Iterator
f MergeFunc
currKs [][]byte
currVs []uint64
lowK []byte
lowV uint64
lowIdxs []int
mergeV []uint64
}
// NewMergeIterator creates a new MergeIterator over the provided slice of
// Iterators and with the specified MergeFunc to resolve duplicate keys.
func NewMergeIterator(itrs []Iterator, f MergeFunc) (*MergeIterator, error) {
rv := &MergeIterator{
itrs: itrs,
f: f,
currKs: make([][]byte, len(itrs)),
currVs: make([]uint64, len(itrs)),
lowIdxs: make([]int, 0, len(itrs)),
mergeV: make([]uint64, 0, len(itrs)),
}
rv.init()
if rv.lowK == nil {
return rv, ErrIteratorDone
}
return rv, nil
}
func (m *MergeIterator) init() {
for i, itr := range m.itrs {
m.currKs[i], m.currVs[i] = itr.Current()
}
m.updateMatches()
}
func (m *MergeIterator) updateMatches() {
if len(m.itrs) < 1 {
return
}
m.lowK = m.currKs[0]
m.lowIdxs = m.lowIdxs[:0]
m.lowIdxs = append(m.lowIdxs, 0)
for i := 1; i < len(m.itrs); i++ {
if m.currKs[i] == nil {
continue
}
cmp := bytes.Compare(m.currKs[i], m.lowK)
if m.lowK == nil || cmp < 0 {
// reached a new low
m.lowK = m.currKs[i]
m.lowIdxs = m.lowIdxs[:0]
m.lowIdxs = append(m.lowIdxs, i)
} else if cmp == 0 {
m.lowIdxs = append(m.lowIdxs, i)
}
}
if len(m.lowIdxs) > 1 {
// merge multiple values
m.mergeV = m.mergeV[:0]
for _, vi := range m.lowIdxs {
m.mergeV = append(m.mergeV, m.currVs[vi])
}
m.lowV = m.f(m.mergeV)
} else if len(m.lowIdxs) == 1 {
m.lowV = m.currVs[m.lowIdxs[0]]
}
}
// Current returns the key and value currently pointed to by this iterator.
// If the iterator is not pointing at a valid value (because Next/Seek
// returned an error previously), it may return nil, 0.
func (m *MergeIterator) Current() ([]byte, uint64) {
return m.lowK, m.lowV
}
// Next advances this iterator to the next key/value pair. If there is none,
// then ErrIteratorDone is returned.
func (m *MergeIterator) Next() error {
// move all the current low iterators to next
for _, vi := range m.lowIdxs {
err := m.itrs[vi].Next()
if err != nil && err != ErrIteratorDone {
return err
}
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
}
m.updateMatches()
if m.lowK == nil {
return ErrIteratorDone
}
return nil
}
// Seek advances this iterator to the specified key/value pair. If this key
// is not in the FST, Current() will return the next largest key. If this
// seek operation would go past the last key, then ErrIteratorDone is returned.
func (m *MergeIterator) Seek(key []byte) error {
for i := range m.itrs {
err := m.itrs[i].Seek(key)
if err != nil && err != ErrIteratorDone {
return err
}
}
m.updateMatches()
if m.lowK == nil {
return ErrIteratorDone
}
return nil
}
// Close will attempt to close all the underlying Iterators. If any errors
// are encountered, the first will be returned.
func (m *MergeIterator) Close() error {
var rv error
for i := range m.itrs {
// close all iterators, return first error if any
err := m.itrs[i].Close()
if rv == nil {
rv = err
}
}
return rv
}
// MergeMin chooses the minimum value
func MergeMin(vals []uint64) uint64 {
rv := vals[0]
for _, v := range vals[1:] {
if v < rv {
rv = v
}
}
return rv
}
// MergeMax chooses the maximum value
func MergeMax(vals []uint64) uint64 {
rv := vals[0]
for _, v := range vals[1:] {
if v > rv {
rv = v
}
}
return rv
}
// MergeSum sums the values
func MergeSum(vals []uint64) uint64 {
rv := vals[0]
for _, v := range vals[1:] {
rv += v
}
return rv
}
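A hedged sketch of merging two iterators, resolving duplicate keys with MergeMin (`itrA` and `itrB` are assumed to be open Iterators, e.g. obtained from FST.Iterator):
```
m, err := NewMergeIterator([]Iterator{itrA, itrB}, MergeMin)
for err == nil {
	k, v := m.Current()
	fmt.Printf("%s = %d\n", k, v)
	err = m.Next()
}
// ErrIteratorDone signals normal exhaustion
_ = m.Close()
```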

View file

@ -1,55 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
func deltaAddr(base, trans uint64) uint64 {
// transition dest of 0 is special case
if trans == 0 {
return 0
}
return base - trans
}
const packOutMask = 1<<4 - 1
func encodePackSize(transSize, outSize int) byte {
var rv byte
rv = byte(transSize << 4)
rv |= byte(outSize)
return rv
}
func decodePackSize(pack byte) (transSize int, packSize int) {
transSize = int(pack >> 4)
packSize = int(pack & packOutMask)
return
}
const maxNumTrans = 1<<6 - 1
func encodeNumTrans(n int) byte {
if n <= maxNumTrans {
return byte(n)
}
return 0
}
func readPackedUint(data []byte) (rv uint64) {
for i := range data {
shifted := uint64(data[i]) << uint(i*8)
rv |= shifted
}
return
}
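The pack header keeps the transition-address width in the high nibble and the output width in the low nibble; a round trip grounded in the helpers above makes the layout concrete:
```
pack := encodePackSize(3, 2) // 0x32
transSize, outSize := decodePackSize(pack)
fmt.Println(transSize, outSize) // 3 2
fmt.Println(readPackedUint([]byte{0x01, 0x02})) // 513 (little-endian 0x0201)
```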

View file

@ -1,343 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import (
"regexp/syntax"
"unicode"
unicode_utf8 "unicode/utf8"
"github.com/couchbase/vellum/utf8"
)
type compiler struct {
sizeLimit uint
insts prog
instsPool []inst
sequences utf8.Sequences
rangeStack utf8.RangeStack
startBytes []byte
endBytes []byte
}
func newCompiler(sizeLimit uint) *compiler {
return &compiler{
sizeLimit: sizeLimit,
startBytes: make([]byte, unicode_utf8.UTFMax),
endBytes: make([]byte, unicode_utf8.UTFMax),
}
}
func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
err := c.c(ast)
if err != nil {
return nil, err
}
inst := c.allocInst()
inst.op = OpMatch
c.insts = append(c.insts, inst)
return c.insts, nil
}
func (c *compiler) c(ast *syntax.Regexp) (err error) {
if ast.Flags&syntax.NonGreedy != 0 {
return ErrNoLazy
}
switch ast.Op {
case syntax.OpEndLine, syntax.OpBeginLine,
syntax.OpBeginText, syntax.OpEndText:
return ErrNoEmpty
case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
return ErrNoWordBoundary
case syntax.OpEmptyMatch:
return nil
case syntax.OpLiteral:
for _, r := range ast.Rune {
if ast.Flags&syntax.FoldCase > 0 {
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune0: [2]rune{r, r},
}
next.Rune = next.Rune0[0:2]
// try to find more folded runes
for r1 := unicode.SimpleFold(r); r1 != r; r1 = unicode.SimpleFold(r1) {
next.Rune = append(next.Rune, r1, r1)
}
err = c.c(&next)
if err != nil {
return err
}
} else {
c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
r, r, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
if err != nil {
return err
}
for _, seq := range c.sequences {
c.compileUtf8Ranges(seq)
}
}
}
case syntax.OpAnyChar:
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune0: [2]rune{0, unicode.MaxRune},
}
next.Rune = next.Rune0[:2]
return c.c(&next)
case syntax.OpAnyCharNotNL:
next := syntax.Regexp{
Op: syntax.OpCharClass,
Flags: ast.Flags & syntax.FoldCase,
Rune: []rune{0, 0x09, 0x0B, unicode.MaxRune},
}
return c.c(&next)
case syntax.OpCharClass:
return c.compileClass(ast)
case syntax.OpCapture:
return c.c(ast.Sub[0])
case syntax.OpConcat:
for _, sub := range ast.Sub {
err := c.c(sub)
if err != nil {
return err
}
}
return nil
case syntax.OpAlternate:
if len(ast.Sub) == 0 {
return nil
}
jmpsToEnd := make([]uint, 0, len(ast.Sub)-1)
// does not handle last entry
for i := 0; i < len(ast.Sub)-1; i++ {
sub := ast.Sub[i]
split := c.emptySplit()
j1 := c.top()
err := c.c(sub)
if err != nil {
return err
}
jmpsToEnd = append(jmpsToEnd, c.emptyJump())
j2 := c.top()
c.setSplit(split, j1, j2)
}
// handle last entry
err := c.c(ast.Sub[len(ast.Sub)-1])
if err != nil {
return err
}
end := uint(len(c.insts))
for _, jmpToEnd := range jmpsToEnd {
c.setJump(jmpToEnd, end)
}
case syntax.OpQuest:
split := c.emptySplit()
j1 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
j2 := c.top()
c.setSplit(split, j1, j2)
case syntax.OpStar:
j1 := c.top()
split := c.emptySplit()
j2 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
jmp := c.emptyJump()
j3 := uint(len(c.insts))
c.setJump(jmp, j1)
c.setSplit(split, j2, j3)
case syntax.OpPlus:
j1 := c.top()
err := c.c(ast.Sub[0])
if err != nil {
return err
}
split := c.emptySplit()
j2 := c.top()
c.setSplit(split, j1, j2)
case syntax.OpRepeat:
if ast.Max == -1 {
for i := 0; i < ast.Min; i++ {
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
next := syntax.Regexp{
Op: syntax.OpStar,
Flags: ast.Flags,
Sub: ast.Sub,
Sub0: ast.Sub0,
Rune: ast.Rune,
Rune0: ast.Rune0,
}
return c.c(&next)
}
for i := 0; i < ast.Min; i++ {
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
splits := make([]uint, 0, ast.Max-ast.Min)
starts := make([]uint, 0, ast.Max-ast.Min)
for i := ast.Min; i < ast.Max; i++ {
splits = append(splits, c.emptySplit())
starts = append(starts, uint(len(c.insts)))
err := c.c(ast.Sub[0])
if err != nil {
return err
}
}
end := uint(len(c.insts))
for i := 0; i < len(splits); i++ {
c.setSplit(splits[i], starts[i], end)
}
}
return c.checkSize()
}
func (c *compiler) checkSize() error {
if uint(len(c.insts)*instSize) > c.sizeLimit {
return ErrCompiledTooBig
}
return nil
}
func (c *compiler) compileClass(ast *syntax.Regexp) error {
if len(ast.Rune) == 0 {
return nil
}
jmps := make([]uint, 0, len(ast.Rune)-2)
// does not do last pair
for i := 0; i < len(ast.Rune)-2; i += 2 {
rstart := ast.Rune[i]
rend := ast.Rune[i+1]
split := c.emptySplit()
j1 := c.top()
err := c.compileClassRange(rstart, rend)
if err != nil {
return err
}
jmps = append(jmps, c.emptyJump())
j2 := c.top()
c.setSplit(split, j1, j2)
}
// handle last pair
rstart := ast.Rune[len(ast.Rune)-2]
rend := ast.Rune[len(ast.Rune)-1]
err := c.compileClassRange(rstart, rend)
if err != nil {
return err
}
end := c.top()
for _, jmp := range jmps {
c.setJump(jmp, end)
}
return nil
}
func (c *compiler) compileClassRange(startR, endR rune) (err error) {
c.sequences, c.rangeStack, err = utf8.NewSequencesPrealloc(
startR, endR, c.sequences, c.rangeStack, c.startBytes, c.endBytes)
if err != nil {
return err
}
jmps := make([]uint, 0, len(c.sequences)-1)
// does not do last entry
for i := 0; i < len(c.sequences)-1; i++ {
seq := c.sequences[i]
split := c.emptySplit()
j1 := c.top()
c.compileUtf8Ranges(seq)
jmps = append(jmps, c.emptyJump())
j2 := c.top()
c.setSplit(split, j1, j2)
}
// handle last entry
c.compileUtf8Ranges(c.sequences[len(c.sequences)-1])
end := c.top()
for _, jmp := range jmps {
c.setJump(jmp, end)
}
return nil
}
func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
for _, r := range seq {
inst := c.allocInst()
inst.op = OpRange
inst.rangeStart = r.Start
inst.rangeEnd = r.End
c.insts = append(c.insts, inst)
}
}
func (c *compiler) emptySplit() uint {
inst := c.allocInst()
inst.op = OpSplit
c.insts = append(c.insts, inst)
return c.top() - 1
}
func (c *compiler) emptyJump() uint {
inst := c.allocInst()
inst.op = OpJmp
c.insts = append(c.insts, inst)
return c.top() - 1
}
func (c *compiler) setSplit(i, pc1, pc2 uint) {
split := c.insts[i]
split.splitA = pc1
split.splitB = pc2
}
func (c *compiler) setJump(i, pc uint) {
jmp := c.insts[i]
jmp.to = pc
}
func (c *compiler) top() uint {
return uint(len(c.insts))
}
func (c *compiler) allocInst() *inst {
if len(c.instsPool) == 0 {
c.instsPool = make([]inst, 16)
}
inst := &c.instsPool[0]
c.instsPool = c.instsPool[1:]
return inst
}

View file

@ -1,196 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import (
"encoding/binary"
"fmt"
)
// StateLimit is the maximum number of states allowed
const StateLimit = 10000
// ErrTooManyStates is returned if you attempt to build a regular
// expression DFA which requires too many states.
var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
StateLimit)
type dfaBuilder struct {
dfa *dfa
cache map[string]int
keyBuf []byte
}
func newDfaBuilder(insts prog) *dfaBuilder {
d := &dfaBuilder{
dfa: &dfa{
insts: insts,
states: make([]state, 0, 16),
},
cache: make(map[string]int, 1024),
}
// add 0 state that is invalid
d.dfa.states = append(d.dfa.states, state{
next: make([]int, 256),
match: false,
})
return d
}
func (d *dfaBuilder) build() (*dfa, error) {
cur := newSparseSet(uint(len(d.dfa.insts)))
next := newSparseSet(uint(len(d.dfa.insts)))
d.dfa.add(cur, 0)
ns, instsReuse := d.cachedState(cur, nil)
states := intStack{ns}
seen := make(map[int]struct{})
var s int
states, s = states.Pop()
for s != 0 {
for b := 0; b < 256; b++ {
var ns int
ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse)
if ns != 0 {
if _, ok := seen[ns]; !ok {
seen[ns] = struct{}{}
states = states.Push(ns)
}
}
if len(d.dfa.states) > StateLimit {
return nil, ErrTooManyStates
}
}
states, s = states.Pop()
}
return d.dfa, nil
}
func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) (
int, []uint) {
cur.Clear()
for _, ip := range d.dfa.states[state].insts {
cur.Add(ip)
}
d.dfa.run(cur, next, b)
var nextState int
nextState, instsReuse = d.cachedState(next, instsReuse)
d.dfa.states[state].next[b] = nextState
return nextState, instsReuse
}
func instsKey(insts []uint, buf []byte) []byte {
if cap(buf) < 8*len(insts) {
buf = make([]byte, 8*len(insts))
} else {
buf = buf[0 : 8*len(insts)]
}
for i, inst := range insts {
binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
}
return buf
}
func (d *dfaBuilder) cachedState(set *sparseSet,
instsReuse []uint) (int, []uint) {
insts := instsReuse[:0]
if cap(insts) == 0 {
insts = make([]uint, 0, set.Len())
}
var isMatch bool
for i := uint(0); i < uint(set.Len()); i++ {
ip := set.Get(i)
switch d.dfa.insts[ip].op {
case OpRange:
insts = append(insts, ip)
case OpMatch:
isMatch = true
insts = append(insts, ip)
}
}
if len(insts) == 0 {
return 0, insts
}
d.keyBuf = instsKey(insts, d.keyBuf)
v, ok := d.cache[string(d.keyBuf)]
if ok {
return v, insts
}
d.dfa.states = append(d.dfa.states, state{
insts: insts,
next: make([]int, 256),
match: isMatch,
})
newV := len(d.dfa.states) - 1
d.cache[string(d.keyBuf)] = newV
return newV, nil
}
type dfa struct {
insts prog
states []state
}
func (d *dfa) add(set *sparseSet, ip uint) {
if set.Contains(ip) {
return
}
set.Add(ip)
switch d.insts[ip].op {
case OpJmp:
d.add(set, d.insts[ip].to)
case OpSplit:
d.add(set, d.insts[ip].splitA)
d.add(set, d.insts[ip].splitB)
}
}
func (d *dfa) run(from, to *sparseSet, b byte) bool {
to.Clear()
var isMatch bool
for i := uint(0); i < uint(from.Len()); i++ {
ip := from.Get(i)
switch d.insts[ip].op {
case OpMatch:
isMatch = true
case OpRange:
if d.insts[ip].rangeStart <= b &&
b <= d.insts[ip].rangeEnd {
d.add(to, ip+1)
}
}
}
return isMatch
}
type state struct {
insts []uint
next []int
match bool
}
type intStack []int
func (s intStack) Push(v int) intStack {
return append(s, v)
}
func (s intStack) Pop() (intStack, int) {
l := len(s)
if l < 1 {
return s, 0
}
return s[:l-1], s[l-1]
}

View file

@ -1,62 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import "fmt"
// instOp represents an instruction operation
type instOp int
// the enumeration of operations
const (
OpMatch instOp = iota
OpJmp
OpSplit
OpRange
)
// instSize is the approximate size of an inst struct in bytes
const instSize = 40
type inst struct {
op instOp
to uint
splitA uint
splitB uint
rangeStart byte
rangeEnd byte
}
func (i *inst) String() string {
switch i.op {
case OpJmp:
return fmt.Sprintf("JMP: %d", i.to)
case OpSplit:
return fmt.Sprintf("SPLIT: %d - %d", i.splitA, i.splitB)
case OpRange:
return fmt.Sprintf("RANGE: %x - %x", i.rangeStart, i.rangeEnd)
}
return "MATCH"
}
type prog []*inst
func (p prog) String() string {
rv := "\n"
for i, pi := range p {
rv += fmt.Sprintf("%d %v\n", i, pi)
}
return rv
}

View file

@ -1,119 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
import (
"fmt"
"regexp/syntax"
)
// ErrNoEmpty is returned when "zero width assertions" are used
var ErrNoEmpty = fmt.Errorf("zero width assertions not allowed")
// ErrNoWordBoundary is returned when word boundaries are used
var ErrNoWordBoundary = fmt.Errorf("word boundaries are not allowed")
// ErrNoBytes is returned when byte literals are used
var ErrNoBytes = fmt.Errorf("byte literals are not allowed")
// ErrNoLazy is returned when lazy quantifiers are used
var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed")
// ErrCompiledTooBig is returned when a regular expression parses into
// too many instructions
var ErrCompiledTooBig = fmt.Errorf("too many instructions")
var DefaultLimit = uint(10 * (1 << 20))
// Regexp implements the vellum.Automaton interface for matching a
// user-specified regular expression.
type Regexp struct {
orig string
dfa *dfa
}
// New creates a new regular expression automaton with the specified
// expression. By default it is limited to approximately 10MB for the
// compiled finite state automaton. If this size is exceeded,
// ErrCompiledTooBig will be returned.
func New(expr string) (*Regexp, error) {
return NewWithLimit(expr, DefaultLimit)
}
// NewWithLimit creates a new regular expression automaton with
// the specified expression. If the size of the compiled finite state
// automaton exceeds the user-specified size, ErrCompiledTooBig will be
// returned.
func NewWithLimit(expr string, size uint) (*Regexp, error) {
parsed, err := syntax.Parse(expr, syntax.Perl)
if err != nil {
return nil, err
}
return NewParsedWithLimit(expr, parsed, size)
}
func NewParsedWithLimit(expr string, parsed *syntax.Regexp, size uint) (*Regexp, error) {
compiler := newCompiler(size)
insts, err := compiler.compile(parsed)
if err != nil {
return nil, err
}
dfaBuilder := newDfaBuilder(insts)
dfa, err := dfaBuilder.build()
if err != nil {
return nil, err
}
return &Regexp{
orig: expr,
dfa: dfa,
}, nil
}
// Start returns the start state of this automaton.
func (r *Regexp) Start() int {
return 1
}
// IsMatch returns whether the specified state is a matching state.
func (r *Regexp) IsMatch(s int) bool {
if s < len(r.dfa.states) {
return r.dfa.states[s].match
}
return false
}
// CanMatch returns whether the specified state can ever transition to a
// matching state.
func (r *Regexp) CanMatch(s int) bool {
if s < len(r.dfa.states) && s > 0 {
return true
}
return false
}
// WillAlwaysMatch returns whether the specified state will always end in
// a matching state.
func (r *Regexp) WillAlwaysMatch(int) bool {
return false
}
// Accept returns the new state, resulting from the transition byte b
// when currently in the state s.
func (r *Regexp) Accept(s int, b byte) int {
if s < len(r.dfa.states) {
return r.dfa.states[s].next[b]
}
return 0
}
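For context, the removed Regexp type is driven byte-by-byte as a vellum Automaton. A minimal sketch of that usage, assuming the upstream import path github.com/couchbase/vellum/regexp (not part of this commit):

package main

import (
	"fmt"

	"github.com/couchbase/vellum/regexp"
)

func main() {
	// Compile a pattern into the DFA-backed automaton.
	r, err := regexp.New(`wat.r`)
	if err != nil {
		panic(err)
	}

	// Step the automaton one byte at a time, as a vellum search does.
	s := r.Start()
	for _, b := range []byte("water") {
		s = r.Accept(s, b)
		if !r.CanMatch(s) {
			break // state 0 is the dead state
		}
	}
	fmt.Println(r.IsMatch(s)) // true
}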

View file

@ -1,54 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package regexp
type sparseSet struct {
dense []uint
sparse []uint
size uint
}
func newSparseSet(size uint) *sparseSet {
return &sparseSet{
dense: make([]uint, size),
sparse: make([]uint, size),
size: 0,
}
}
func (s *sparseSet) Len() int {
return int(s.size)
}
func (s *sparseSet) Add(ip uint) uint {
i := s.size
s.dense[i] = ip
s.sparse[ip] = i
s.size++
return i
}
func (s *sparseSet) Get(i uint) uint {
return s.dense[i]
}
func (s *sparseSet) Contains(ip uint) bool {
i := s.sparse[ip]
return i < s.size && s.dense[i] == ip
}
func (s *sparseSet) Clear() {
s.size = 0
}

View file

@ -1,114 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
type registryCell struct {
addr int
node *builderNode
}
type registry struct {
builderNodePool *builderNodePool
table []registryCell
tableSize uint
mruSize uint
}
func newRegistry(p *builderNodePool, tableSize, mruSize int) *registry {
nsize := tableSize * mruSize
rv := &registry{
builderNodePool: p,
table: make([]registryCell, nsize),
tableSize: uint(tableSize),
mruSize: uint(mruSize),
}
return rv
}
func (r *registry) Reset() {
var empty registryCell
for i := range r.table {
r.builderNodePool.Put(r.table[i].node)
r.table[i] = empty
}
}
func (r *registry) entry(node *builderNode) (bool, int, *registryCell) {
if len(r.table) == 0 {
return false, 0, nil
}
bucket := r.hash(node)
start := r.mruSize * uint(bucket)
end := start + r.mruSize
rc := registryCache(r.table[start:end])
return rc.entry(node, r.builderNodePool)
}
const fnvPrime = 1099511628211
func (r *registry) hash(b *builderNode) int {
var final uint64
if b.final {
final = 1
}
var h uint64 = 14695981039346656037
h = (h ^ final) * fnvPrime
h = (h ^ b.finalOutput) * fnvPrime
for _, t := range b.trans {
h = (h ^ uint64(t.in)) * fnvPrime
h = (h ^ t.out) * fnvPrime
h = (h ^ uint64(t.addr)) * fnvPrime
}
return int(h % uint64(r.tableSize))
}
type registryCache []registryCell
func (r registryCache) entry(node *builderNode, pool *builderNodePool) (bool, int, *registryCell) {
if len(r) == 1 {
if r[0].node != nil && r[0].node.equiv(node) {
return true, r[0].addr, nil
}
pool.Put(r[0].node)
r[0].node = node
return false, 0, &r[0]
}
for i := range r {
if r[i].node != nil && r[i].node.equiv(node) {
addr := r[i].addr
r.promote(i)
return true, addr, nil
}
}
// no match
last := len(r) - 1
pool.Put(r[last].node)
r[last].node = node // discard LRU
r.promote(last)
return false, 0, &r[0]
}
func (r registryCache) promote(i int) {
for i > 0 {
r.swap(i-1, i)
i--
}
}
func (r registryCache) swap(i, j int) {
r[i], r[j] = r[j], r[i]
}

View file

@ -1,55 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
// Transducer represents the general contract of a byte-based finite transducer
type Transducer interface {
// all transducers are also automatons
Automaton
// IsMatchWithVal returns true if and only if the state is a match;
// additionally it returns the state's final value (if any)
IsMatchWithVal(int) (bool, uint64)
// AcceptWithVal returns the next state given the input to the specified
// state; additionally it returns the value associated with the transition
AcceptWithVal(int, byte) (int, uint64)
}
// TransducerGet implements a generic Get() method which works
// on any implementation of Transducer.
// The caller MUST check the boolean return value for a match.
// Zero is a valid value regardless of match status,
// and if it is NOT a match, the value collected so far is returned.
func TransducerGet(t Transducer, k []byte) (bool, uint64) {
var total uint64
i := 0
curr := t.Start()
for t.CanMatch(curr) && i < len(k) {
var transVal uint64
curr, transVal = t.AcceptWithVal(curr, k[i])
if curr == noneAddr {
break
}
total += transVal
i++
}
if i != len(k) {
return false, total
}
match, finalVal := t.IsMatchWithVal(curr)
return match, total + finalVal
}
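TransducerGet sums the transition outputs along the path for k, then adds the final output on a match. A vellum FST satisfies the Transducer interface, so a hedged usage sketch (assuming the upstream import path github.com/couchbase/vellum):

package main

import (
	"bytes"
	"fmt"

	"github.com/couchbase/vellum"
)

func main() {
	var buf bytes.Buffer
	b, _ := vellum.New(&buf, nil)
	_ = b.Insert([]byte("cat"), 7) // keys inserted in lexicographic order
	_ = b.Insert([]byte("dog"), 9)
	_ = b.Close()

	fst, _ := vellum.Load(buf.Bytes())
	// *vellum.FST implements Transducer, so the generic helper applies.
	found, val := vellum.TransducerGet(fst, []byte("dog"))
	fmt.Println(found, val) // true 9
}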

View file

@ -1,268 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package utf8
import (
"fmt"
"unicode/utf8"
)
// Sequences is a collection of Sequence
type Sequences []Sequence
// NewSequences constructs a collection of Sequence which describe the
// byte ranges covered between the start and end runes.
func NewSequences(start, end rune) (Sequences, error) {
rv, _, err := NewSequencesPrealloc(start, end, nil, nil, nil, nil)
return rv, err
}
func NewSequencesPrealloc(start, end rune,
preallocSequences Sequences,
preallocRangeStack RangeStack,
preallocStartBytes, preallocEndBytes []byte) (Sequences, RangeStack, error) {
rv := preallocSequences[:0]
startBytes := preallocStartBytes
if cap(startBytes) < utf8.UTFMax {
startBytes = make([]byte, utf8.UTFMax)
}
startBytes = startBytes[:utf8.UTFMax]
endBytes := preallocEndBytes
if cap(endBytes) < utf8.UTFMax {
endBytes = make([]byte, utf8.UTFMax)
}
endBytes = endBytes[:utf8.UTFMax]
rangeStack := preallocRangeStack[:0]
rangeStack = rangeStack.Push(scalarRange{start, end})
rangeStack, r := rangeStack.Pop()
TOP:
for r != nilScalarRange {
INNER:
for {
r1, r2 := r.split()
if r1 != nilScalarRange {
rangeStack = rangeStack.Push(scalarRange{r2.start, r2.end})
r.start = r1.start
r.end = r1.end
continue INNER
}
if !r.valid() {
rangeStack, r = rangeStack.Pop()
continue TOP
}
for i := 1; i < utf8.UTFMax; i++ {
max := maxScalarValue(i)
if r.start <= max && max < r.end {
rangeStack = rangeStack.Push(scalarRange{max + 1, r.end})
r.end = max
continue INNER
}
}
asciiRange := r.ascii()
if asciiRange != nilRange {
rv = append(rv, Sequence{
asciiRange,
})
rangeStack, r = rangeStack.Pop()
continue TOP
}
for i := uint(1); i < utf8.UTFMax; i++ {
m := rune((1 << (6 * i)) - 1)
if (r.start & ^m) != (r.end & ^m) {
if (r.start & m) != 0 {
rangeStack = rangeStack.Push(scalarRange{(r.start | m) + 1, r.end})
r.end = r.start | m
continue INNER
}
if (r.end & m) != m {
rangeStack = rangeStack.Push(scalarRange{r.end & ^m, r.end})
r.end = (r.end & ^m) - 1
continue INNER
}
}
}
n, m := r.encode(startBytes, endBytes)
seq, err := SequenceFromEncodedRange(startBytes[0:n], endBytes[0:m])
if err != nil {
return nil, nil, err
}
rv = append(rv, seq)
rangeStack, r = rangeStack.Pop()
continue TOP
}
}
return rv, rangeStack, nil
}
// Sequence is a collection of Range
type Sequence []Range
// SequenceFromEncodedRange creates sequence from the encoded bytes
func SequenceFromEncodedRange(start, end []byte) (Sequence, error) {
if len(start) != len(end) {
return nil, fmt.Errorf("byte slices must be the same length")
}
switch len(start) {
case 2:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
}, nil
case 3:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
Range{start[2], end[2]},
}, nil
case 4:
return Sequence{
Range{start[0], end[0]},
Range{start[1], end[1]},
Range{start[2], end[2]},
Range{start[3], end[3]},
}, nil
}
return nil, fmt.Errorf("invalid encoded byte length")
}
// Matches checks to see if the provided byte slice matches the Sequence
func (u Sequence) Matches(bytes []byte) bool {
if len(bytes) < len(u) {
return false
}
for i := 0; i < len(u); i++ {
if !u[i].matches(bytes[i]) {
return false
}
}
return true
}
func (u Sequence) String() string {
switch len(u) {
case 1:
return fmt.Sprintf("%v", u[0])
case 2:
return fmt.Sprintf("%v%v", u[0], u[1])
case 3:
return fmt.Sprintf("%v%v%v", u[0], u[1], u[2])
case 4:
return fmt.Sprintf("%v%v%v%v", u[0], u[1], u[2], u[3])
default:
return "invalid utf8 sequence"
}
}
// Range describes a single range of byte values
type Range struct {
Start byte
End byte
}
var nilRange = Range{0xff, 0}
func (u Range) matches(b byte) bool {
if u.Start <= b && b <= u.End {
return true
}
return false
}
func (u Range) String() string {
if u.Start == u.End {
return fmt.Sprintf("[%X]", u.Start)
}
return fmt.Sprintf("[%X-%X]", u.Start, u.End)
}
type scalarRange struct {
start rune
end rune
}
var nilScalarRange = scalarRange{0xffff, 0}
func (s *scalarRange) String() string {
return fmt.Sprintf("ScalarRange(%d,%d)", s.start, s.end)
}
// split this scalar range if it overlaps with a surrogate codepoint
func (s *scalarRange) split() (scalarRange, scalarRange) {
if s.start < 0xe000 && s.end > 0xd7ff {
return scalarRange{
start: s.start,
end: 0xd7ff,
},
scalarRange{
start: 0xe000,
end: s.end,
}
}
return nilScalarRange, nilScalarRange
}
func (s *scalarRange) valid() bool {
return s.start <= s.end
}
func (s *scalarRange) ascii() Range {
if s.valid() && s.end <= 0x7f {
return Range{
Start: byte(s.start),
End: byte(s.end),
}
}
return nilRange
}
// start and end MUST have capacity for utf8.UTFMax bytes
func (s *scalarRange) encode(start, end []byte) (int, int) {
n := utf8.EncodeRune(start, s.start)
m := utf8.EncodeRune(end, s.end)
return n, m
}
type RangeStack []scalarRange
func (s RangeStack) Push(v scalarRange) RangeStack {
return append(s, v)
}
func (s RangeStack) Pop() (RangeStack, scalarRange) {
l := len(s)
if l < 1 {
return s, nilScalarRange
}
return s[:l-1], s[l-1]
}
func maxScalarValue(nbytes int) rune {
switch nbytes {
case 1:
return 0x007f
case 2:
return 0x07FF
case 3:
return 0xFFFF
default:
return 0x10FFFF
}
}
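NewSequences decomposes an inclusive rune range into UTF-8 byte-range sequences, splitting around the surrogate gap and encoding-length boundaries. A small sketch of calling it, assuming the upstream import path github.com/couchbase/vellum/utf8:

package main

import (
	"fmt"

	"github.com/couchbase/vellum/utf8"
)

func main() {
	// All Unicode scalar values decompose into a handful of sequences.
	seqs, err := utf8.NewSequences(0, 0x10FFFF)
	if err != nil {
		panic(err)
	}
	for _, seq := range seqs {
		fmt.Println(seq) // e.g. [0-7F], [C2-DF][80-BF], ...
	}
}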

View file

@ -1,111 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
Package vellum is a library for building, serializing and executing an FST (finite
state transducer).
There are two distinct phases: building an FST and using it.
When building an FST, you insert keys ([]byte) and their associated value
(uint64). Insert operations MUST be done in lexicographic order. While
building the FST, data is streamed to an underlying Writer. At the conclusion
of building, you MUST call Close() on the builder.
After completion of the build phase, you can Open() the FST if you
serialized it to disk, or, if you already have the bytes in
memory, use Load(). By default, Open() will use mmap to avoid loading
the entire file into memory.
Once the FST is ready, you can use the Contains() method to see if a key is
in the FST. You can use the Get() method to see if a key is in the FST and
retrieve its associated value. And, you can use the Iterator method to
enumerate key/value pairs within a specified range.
*/
package vellum
import (
"errors"
"io"
)
// ErrOutOfOrder is returned when values are not inserted in
// lexicographic order.
var ErrOutOfOrder = errors.New("values not inserted in lexicographic order")
// ErrIteratorDone is returned by Iterator/Next/Seek methods when the
// Current() value pointed to by the iterator is greater than the last
// key in this FST, or outside the configured startKeyInclusive/endKeyExclusive
// range of the Iterator.
var ErrIteratorDone = errors.New("iterator-done")
// BuilderOpts is a structure to let advanced users customize the behavior
// of the builder and some aspects of the generated FST.
type BuilderOpts struct {
Encoder int
RegistryTableSize int
RegistryMRUSize int
}
// New returns a new Builder which will stream out the
// underlying representation to the provided Writer as the set is built.
func New(w io.Writer, opts *BuilderOpts) (*Builder, error) {
return newBuilder(w, opts)
}
// Open loads the FST stored in the provided path
func Open(path string) (*FST, error) {
return open(path)
}
// Load will return the FST represented by the provided byte slice.
func Load(data []byte) (*FST, error) {
return new(data, nil)
}
// Merge will iterate through the provided Iterators, merge duplicate keys
// with the provided MergeFunc, and write a new FST to the provided Writer.
func Merge(w io.Writer, opts *BuilderOpts, itrs []Iterator, f MergeFunc) error {
builder, err := New(w, opts)
if err != nil {
return err
}
itr, err := NewMergeIterator(itrs, f)
for err == nil {
k, v := itr.Current()
err = builder.Insert(k, v)
if err != nil {
return err
}
err = itr.Next()
}
if err != nil && err != ErrIteratorDone {
return err
}
err = itr.Close()
if err != nil {
return err
}
err = builder.Close()
if err != nil {
return err
}
return nil
}
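The package doc's build-then-use flow in one place; a minimal sketch, assuming the upstream import path github.com/couchbase/vellum:

package main

import (
	"bytes"
	"fmt"

	"github.com/couchbase/vellum"
)

func main() {
	var buf bytes.Buffer
	builder, err := vellum.New(&buf, nil)
	if err != nil {
		panic(err)
	}
	// Inserts MUST be in lexicographic order ("feb" < "jan").
	_ = builder.Insert([]byte("feb"), 2)
	_ = builder.Insert([]byte("jan"), 1)
	_ = builder.Close()

	fst, err := vellum.Load(buf.Bytes())
	if err != nil {
		panic(err)
	}
	val, ok, _ := fst.Get([]byte("jan"))
	fmt.Println(ok, val) // true 1

	// Enumerate all key/value pairs; Next returns ErrIteratorDone at the end.
	itr, err := fst.Iterator(nil, nil)
	for err == nil {
		k, v := itr.Current()
		fmt.Printf("%s=%d\n", k, v)
		err = itr.Next()
	}
}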

View file

@ -1,60 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nommap
package vellum
import (
"os"
mmap "github.com/blevesearch/mmap-go"
)
type mmapWrapper struct {
f *os.File
mm mmap.MMap
}
func (m *mmapWrapper) Close() (err error) {
if m.mm != nil {
err = m.mm.Unmap()
}
// try to close file even if unmap failed
if m.f != nil {
err2 := m.f.Close()
if err == nil {
// try to return first error
err = err2
}
}
return
}
func open(path string) (*FST, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
mm, err := mmap.Map(f, mmap.RDONLY, 0)
if err != nil {
// mmap failed, try to close the file
_ = f.Close()
return nil, err
}
return new(mm, &mmapWrapper{
f: f,
mm: mm,
})
}

View file

@ -1,27 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build nommap
package vellum
import "io/ioutil"
func open(path string) (*FST, error) {
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, err
}
return new(data, nil)
}

View file

@ -1,92 +0,0 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vellum
import (
"bufio"
"io"
)
// A writer is a buffered writer used by vellum. It counts how many bytes have
// been written and has some convenience methods used for encoding the data.
type writer struct {
w *bufio.Writer
counter int
}
func newWriter(w io.Writer) *writer {
return &writer{
w: bufio.NewWriter(w),
}
}
func (w *writer) Reset(newWriter io.Writer) {
w.w.Reset(newWriter)
w.counter = 0
}
func (w *writer) WriteByte(c byte) error {
err := w.w.WriteByte(c)
if err != nil {
return err
}
w.counter++
return nil
}
func (w *writer) Write(p []byte) (int, error) {
n, err := w.w.Write(p)
w.counter += n
return n, err
}
func (w *writer) Flush() error {
return w.w.Flush()
}
func (w *writer) WritePackedUintIn(v uint64, n int) error {
for shift := uint(0); shift < uint(n*8); shift += 8 {
err := w.WriteByte(byte(v >> shift))
if err != nil {
return err
}
}
return nil
}
func (w *writer) WritePackedUint(v uint64) error {
n := packedSize(v)
return w.WritePackedUintIn(v, n)
}
func packedSize(n uint64) int {
if n < 1<<8 {
return 1
} else if n < 1<<16 {
return 2
} else if n < 1<<24 {
return 3
} else if n < 1<<32 {
return 4
} else if n < 1<<40 {
return 5
} else if n < 1<<48 {
return 6
} else if n < 1<<56 {
return 7
}
return 8
}
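WritePackedUint stores v little-endian in the fewest whole bytes that packedSize reports. A self-contained sketch of the same encoding and its inverse; decodePackedUint is a hypothetical helper for illustration, not part of the removed file:

package main

import "fmt"

// encodePackedUint mirrors writer.WritePackedUintIn: emit exactly n
// little-endian bytes of v.
func encodePackedUint(v uint64, n int) []byte {
	out := make([]byte, 0, n)
	for shift := uint(0); shift < uint(n*8); shift += 8 {
		out = append(out, byte(v>>shift))
	}
	return out
}

// decodePackedUint is the inverse (hypothetical helper for illustration).
func decodePackedUint(buf []byte) uint64 {
	var v uint64
	for i, b := range buf {
		v |= uint64(b) << (8 * uint(i))
	}
	return v
}

func main() {
	v := uint64(0x12345) // packedSize(0x12345) == 3
	enc := encodePackedUint(v, 3)
	fmt.Printf("% x -> %#x\n", enc, decodePackedUint(enc)) // 45 23 01 -> 0x12345
}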