Update to last common bleve (#3986)

2018-05-19 14:49:46 +02:00 · 2018-05-19 14:49:46 +02:00 · 917b9641ec
commit 917b9641ec
parent 1b7cd3d0b0
184 changed files with 39576 additions and 121 deletions
--- a/vendor/github.com/couchbase/vellum/regexp/compile.go
+++ b/vendor/github.com/couchbase/vellum/regexp/compile.go
@ -0,0 +1,316 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+	"regexp/syntax"
+	"unicode"
+
+	"github.com/couchbase/vellum/utf8"
+)
+
+type compiler struct {
+	sizeLimit uint
+	insts     prog
+}
+
+func newCompiler(sizeLimit uint) *compiler {
+	return &compiler{
+		sizeLimit: sizeLimit,
+	}
+}
+
+func (c *compiler) compile(ast *syntax.Regexp) (prog, error) {
+	err := c.c(ast)
+	if err != nil {
+		return nil, err
+	}
+	c.insts = append(c.insts, &inst{
+		op: OpMatch,
+	})
+	return c.insts, nil
+}
+
+func (c *compiler) c(ast *syntax.Regexp) error {
+	if ast.Flags&syntax.NonGreedy > 1 {
+		return ErrNoLazy
+	}
+
+	switch ast.Op {
+	case syntax.OpEndLine, syntax.OpBeginLine,
+		syntax.OpBeginText, syntax.OpEndText:
+		return ErrNoEmpty
+	case syntax.OpWordBoundary, syntax.OpNoWordBoundary:
+		return ErrNoWordBoundary
+	case syntax.OpEmptyMatch:
+		return nil
+	case syntax.OpLiteral:
+		for _, r := range ast.Rune {
+			if ast.Flags&syntax.FoldCase > 0 {
+				next := syntax.Regexp{
+					Op:    syntax.OpCharClass,
+					Flags: ast.Flags & syntax.FoldCase,
+					Rune0: [2]rune{r, r},
+				}
+				next.Rune = next.Rune0[0:2]
+				return c.c(&next)
+			}
+			seqs, err := utf8.NewSequences(r, r)
+			if err != nil {
+				return err
+			}
+			for _, seq := range seqs {
+				c.compileUtf8Ranges(seq)
+			}
+		}
+	case syntax.OpAnyChar:
+		next := syntax.Regexp{
+			Op:    syntax.OpCharClass,
+			Flags: ast.Flags & syntax.FoldCase,
+			Rune0: [2]rune{0, unicode.MaxRune},
+		}
+		next.Rune = next.Rune0[:2]
+		return c.c(&next)
+	case syntax.OpAnyCharNotNL:
+		next := syntax.Regexp{
+			Op:    syntax.OpCharClass,
+			Flags: ast.Flags & syntax.FoldCase,
+			Rune:  []rune{0, 0x09, 0x0B, unicode.MaxRune},
+		}
+		return c.c(&next)
+	case syntax.OpCharClass:
+		return c.compileClass(ast)
+	case syntax.OpCapture:
+		return c.c(ast.Sub[0])
+	case syntax.OpConcat:
+		for _, sub := range ast.Sub {
+			err := c.c(sub)
+			if err != nil {
+				return err
+			}
+		}
+		return nil
+	case syntax.OpAlternate:
+		if len(ast.Sub) == 0 {
+			return nil
+		}
+		jmpsToEnd := []uint{}
+
+		// does not handle last entry
+		for i := 0; i < len(ast.Sub)-1; i++ {
+			sub := ast.Sub[i]
+			split := c.emptySplit()
+			j1 := c.top()
+			err := c.c(sub)
+			if err != nil {
+				return err
+			}
+			jmpsToEnd = append(jmpsToEnd, c.emptyJump())
+			j2 := c.top()
+			c.setSplit(split, j1, j2)
+		}
+		// handle last entry
+		err := c.c(ast.Sub[len(ast.Sub)-1])
+		if err != nil {
+			return err
+		}
+		end := uint(len(c.insts))
+		for _, jmpToEnd := range jmpsToEnd {
+			c.setJump(jmpToEnd, end)
+		}
+	case syntax.OpQuest:
+		split := c.emptySplit()
+		j1 := c.top()
+		err := c.c(ast.Sub[0])
+		if err != nil {
+			return err
+		}
+		j2 := c.top()
+		c.setSplit(split, j1, j2)
+
+	case syntax.OpStar:
+		j1 := c.top()
+		split := c.emptySplit()
+		j2 := c.top()
+		err := c.c(ast.Sub[0])
+		if err != nil {
+			return err
+		}
+		jmp := c.emptyJump()
+		j3 := uint(len(c.insts))
+
+		c.setJump(jmp, j1)
+		c.setSplit(split, j2, j3)
+
+	case syntax.OpPlus:
+		j1 := c.top()
+		err := c.c(ast.Sub[0])
+		if err != nil {
+			return err
+		}
+		split := c.emptySplit()
+		j2 := c.top()
+		c.setSplit(split, j1, j2)
+
+	case syntax.OpRepeat:
+		if ast.Max == -1 {
+			for i := 0; i < ast.Min; i++ {
+				err := c.c(ast.Sub[0])
+				if err != nil {
+					return err
+				}
+			}
+			next := syntax.Regexp{
+				Op:    syntax.OpStar,
+				Flags: ast.Flags,
+				Sub:   ast.Sub,
+				Sub0:  ast.Sub0,
+				Rune:  ast.Rune,
+				Rune0: ast.Rune0,
+			}
+			return c.c(&next)
+		}
+		for i := 0; i < ast.Min; i++ {
+			err := c.c(ast.Sub[0])
+			if err != nil {
+				return err
+			}
+		}
+		var splits, starts []uint
+		for i := ast.Min; i < ast.Max; i++ {
+			splits = append(splits, c.emptySplit())
+			starts = append(starts, uint(len(c.insts)))
+			err := c.c(ast.Sub[0])
+			if err != nil {
+				return err
+			}
+		}
+		end := uint(len(c.insts))
+		for i := 0; i < len(splits); i++ {
+			c.setSplit(splits[i], starts[i], end)
+		}
+
+	}
+
+	return c.checkSize()
+}
+
+func (c *compiler) checkSize() error {
+	if uint(len(c.insts)*instSize) > c.sizeLimit {
+		return ErrCompiledTooBig
+	}
+	return nil
+}
+
+func (c *compiler) compileClass(ast *syntax.Regexp) error {
+	if len(ast.Rune) == 0 {
+		return nil
+	}
+	var jmps []uint
+
+	// does not do last pair
+	for i := 0; i < len(ast.Rune)-2; i += 2 {
+		rstart := ast.Rune[i]
+		rend := ast.Rune[i+1]
+
+		split := c.emptySplit()
+		j1 := c.top()
+		err := c.compileClassRange(rstart, rend)
+		if err != nil {
+			return err
+		}
+		jmps = append(jmps, c.emptyJump())
+		j2 := c.top()
+		c.setSplit(split, j1, j2)
+	}
+	// handle last pair
+	rstart := ast.Rune[len(ast.Rune)-2]
+	rend := ast.Rune[len(ast.Rune)-1]
+	err := c.compileClassRange(rstart, rend)
+	if err != nil {
+		return err
+	}
+	end := c.top()
+	for _, jmp := range jmps {
+		c.setJump(jmp, end)
+	}
+	return nil
+}
+
+func (c *compiler) compileClassRange(startR, endR rune) error {
+	seqs, err := utf8.NewSequences(startR, endR)
+	if err != nil {
+		return err
+	}
+	var jmps []uint
+
+	// does not do last entry
+	for i := 0; i < len(seqs)-1; i++ {
+		seq := seqs[i]
+		split := c.emptySplit()
+		j1 := c.top()
+		c.compileUtf8Ranges(seq)
+		jmps = append(jmps, c.emptyJump())
+		j2 := c.top()
+		c.setSplit(split, j1, j2)
+	}
+	// handle last entry
+	c.compileUtf8Ranges(seqs[len(seqs)-1])
+	end := c.top()
+	for _, jmp := range jmps {
+		c.setJump(jmp, end)
+	}
+
+	return nil
+}
+
+func (c *compiler) compileUtf8Ranges(seq utf8.Sequence) {
+	for _, r := range seq {
+		c.insts = append(c.insts, &inst{
+			op:         OpRange,
+			rangeStart: r.Start,
+			rangeEnd:   r.End,
+		})
+	}
+}
+
+func (c *compiler) emptySplit() uint {
+	c.insts = append(c.insts, &inst{
+		op: OpSplit,
+	})
+	return c.top() - 1
+}
+
+func (c *compiler) emptyJump() uint {
+	c.insts = append(c.insts, &inst{
+		op: OpJmp,
+	})
+	return c.top() - 1
+}
+
+func (c *compiler) setSplit(i, pc1, pc2 uint) {
+	split := c.insts[i]
+	split.splitA = pc1
+	split.splitB = pc2
+}
+
+func (c *compiler) setJump(i, pc uint) {
+	jmp := c.insts[i]
+	jmp.to = pc
+}
+
+func (c *compiler) top() uint {
+	return uint(len(c.insts))
+}
--- a/vendor/github.com/couchbase/vellum/regexp/dfa.go
+++ b/vendor/github.com/couchbase/vellum/regexp/dfa.go
@ -0,0 +1,188 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+	"encoding/binary"
+	"fmt"
+)
+
+// StateLimit is the maximum number of states allowed
+const StateLimit = 10000
+
+// ErrTooManyStates is returned if you attempt to build a Levenshtein
+// automaton which requries too many states.
+var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
+	StateLimit)
+
+type dfaBuilder struct {
+	dfa    *dfa
+	cache  map[string]int
+	keyBuf []byte
+}
+
+func newDfaBuilder(insts prog) *dfaBuilder {
+	d := &dfaBuilder{
+		dfa: &dfa{
+			insts:  insts,
+			states: make([]*state, 0, 16),
+		},
+		cache: make(map[string]int, 1024),
+	}
+	// add 0 state that is invalid
+	d.dfa.states = append(d.dfa.states, &state{
+		next:  make([]int, 256),
+		match: false,
+	})
+	return d
+}
+
+func (d *dfaBuilder) build() (*dfa, error) {
+	cur := newSparseSet(uint(len(d.dfa.insts)))
+	next := newSparseSet(uint(len(d.dfa.insts)))
+
+	d.dfa.add(cur, 0)
+	states := intStack{d.cachedState(cur)}
+	seen := make(map[int]struct{})
+	var s int
+	states, s = states.Pop()
+	for s != 0 {
+		for b := 0; b < 256; b++ {
+			ns := d.runState(cur, next, s, byte(b))
+			if ns != 0 {
+				if _, ok := seen[ns]; !ok {
+					seen[ns] = struct{}{}
+					states = states.Push(ns)
+				}
+			}
+			if len(d.dfa.states) > StateLimit {
+				return nil, ErrTooManyStates
+			}
+		}
+		states, s = states.Pop()
+	}
+	return d.dfa, nil
+}
+
+func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte) int {
+	cur.Clear()
+	for _, ip := range d.dfa.states[state].insts {
+		cur.Add(ip)
+	}
+	d.dfa.run(cur, next, b)
+	nextState := d.cachedState(next)
+	d.dfa.states[state].next[b] = nextState
+	return nextState
+}
+
+func instsKey(insts []uint, buf []byte) []byte {
+	if cap(buf) < 8*len(insts) {
+		buf = make([]byte, 8*len(insts))
+	} else {
+		buf = buf[0 : 8*len(insts)]
+	}
+	for i, inst := range insts {
+		binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
+	}
+	return buf
+}
+
+func (d *dfaBuilder) cachedState(set *sparseSet) int {
+	var insts []uint
+	var isMatch bool
+	for i := uint(0); i < uint(set.Len()); i++ {
+		ip := set.Get(i)
+		switch d.dfa.insts[ip].op {
+		case OpRange:
+			insts = append(insts, ip)
+		case OpMatch:
+			isMatch = true
+			insts = append(insts, ip)
+		}
+	}
+	if len(insts) == 0 {
+		return 0
+	}
+	d.keyBuf = instsKey(insts, d.keyBuf)
+	v, ok := d.cache[string(d.keyBuf)]
+	if ok {
+		return v
+	}
+	d.dfa.states = append(d.dfa.states, &state{
+		insts: insts,
+		next:  make([]int, 256),
+		match: isMatch,
+	})
+	newV := len(d.dfa.states) - 1
+	d.cache[string(d.keyBuf)] = newV
+	return newV
+}
+
+type dfa struct {
+	insts  prog
+	states []*state
+}
+
+func (d *dfa) add(set *sparseSet, ip uint) {
+	if set.Contains(ip) {
+		return
+	}
+	set.Add(ip)
+	switch d.insts[ip].op {
+	case OpJmp:
+		d.add(set, d.insts[ip].to)
+	case OpSplit:
+		d.add(set, d.insts[ip].splitA)
+		d.add(set, d.insts[ip].splitB)
+	}
+}
+
+func (d *dfa) run(from, to *sparseSet, b byte) bool {
+	to.Clear()
+	var isMatch bool
+	for i := uint(0); i < uint(from.Len()); i++ {
+		ip := from.Get(i)
+		switch d.insts[ip].op {
+		case OpMatch:
+			isMatch = true
+		case OpRange:
+			if d.insts[ip].rangeStart <= b &&
+				b <= d.insts[ip].rangeEnd {
+				d.add(to, ip+1)
+			}
+		}
+	}
+	return isMatch
+}
+
+type state struct {
+	insts []uint
+	next  []int
+	match bool
+}
+
+type intStack []int
+
+func (s intStack) Push(v int) intStack {
+	return append(s, v)
+}
+
+func (s intStack) Pop() (intStack, int) {
+	l := len(s)
+	if l < 1 {
+		return s, 0
+	}
+	return s[:l-1], s[l-1]
+}
--- a/vendor/github.com/couchbase/vellum/regexp/inst.go
+++ b/vendor/github.com/couchbase/vellum/regexp/inst.go
@ -0,0 +1,62 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import "fmt"
+
+// instOp represents a instruction operation
+type instOp int
+
+// the enumeration of operations
+const (
+	OpMatch instOp = iota
+	OpJmp
+	OpSplit
+	OpRange
+)
+
+// instSize is the approxmiate size of the an inst struct in bytes
+const instSize = 40
+
+type inst struct {
+	op         instOp
+	to         uint
+	splitA     uint
+	splitB     uint
+	rangeStart byte
+	rangeEnd   byte
+}
+
+func (i *inst) String() string {
+	switch i.op {
+	case OpJmp:
+		return fmt.Sprintf("JMP: %d", i.to)
+	case OpSplit:
+		return fmt.Sprintf("SPLIT: %d - %d", i.splitA, i.splitB)
+	case OpRange:
+		return fmt.Sprintf("RANGE: %x - %x", i.rangeStart, i.rangeEnd)
+	}
+	return "MATCH"
+}
+
+type prog []*inst
+
+func (p prog) String() string {
+	rv := "\n"
+	for i, pi := range p {
+		rv += fmt.Sprintf("%d %v\n", i, pi)
+	}
+	return rv
+}
--- a/vendor/github.com/couchbase/vellum/regexp/regexp.go
+++ b/vendor/github.com/couchbase/vellum/regexp/regexp.go
@ -0,0 +1,113 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+	"fmt"
+	"regexp/syntax"
+)
+
+// ErrNoEmpty returned when "zero width assertions" are used
+var ErrNoEmpty = fmt.Errorf("zero width assertions not allowed")
+
+// ErrNoWordBoundary returned when word boundaries are used
+var ErrNoWordBoundary = fmt.Errorf("word boundaries are not allowed")
+
+// ErrNoBytes returned when byte literals are used
+var ErrNoBytes = fmt.Errorf("byte literals are not allowed")
+
+// ErrNoLazy returned when lazy quantifiers are used
+var ErrNoLazy = fmt.Errorf("lazy quantifiers are not allowed")
+
+// ErrCompiledTooBig returned when regular expression parses into
+// too many instructions
+var ErrCompiledTooBig = fmt.Errorf("too many instructions")
+
+// Regexp implements the vellum.Automaton interface for matcing a user
+// specified regular expression.
+type Regexp struct {
+	orig string
+	dfa  *dfa
+}
+
+// NewRegexp creates a new Regular Expression automaton with the specified
+// expression.  By default it is limited to approximately 10MB for the
+// compiled finite state automaton.  If this size is exceeded,
+// ErrCompiledTooBig will be returned.
+func New(expr string) (*Regexp, error) {
+	return NewWithLimit(expr, 10*(1<<20))
+}
+
+// NewRegexpWithLimit creates a new Regular Expression automaton with
+// the specified expression.  The size of the compiled finite state
+// automaton exceeds the user specified size,  ErrCompiledTooBig will be
+// returned.
+func NewWithLimit(expr string, size uint) (*Regexp, error) {
+	parsed, err := syntax.Parse(expr, syntax.Perl)
+	if err != nil {
+		return nil, err
+	}
+	compiler := newCompiler(size)
+	insts, err := compiler.compile(parsed)
+	if err != nil {
+		return nil, err
+	}
+	dfaBuilder := newDfaBuilder(insts)
+	dfa, err := dfaBuilder.build()
+	if err != nil {
+		return nil, err
+	}
+	return &Regexp{
+		orig: expr,
+		dfa:  dfa,
+	}, nil
+}
+
+// Start returns the start state of this automaton.
+func (r *Regexp) Start() int {
+	return 1
+}
+
+// IsMatch returns if the specified state is a matching state.
+func (r *Regexp) IsMatch(s int) bool {
+	if s < len(r.dfa.states) {
+		return r.dfa.states[s].match
+	}
+	return false
+}
+
+// CanMatch returns if the specified state can ever transition to a matching
+// state.
+func (r *Regexp) CanMatch(s int) bool {
+	if s < len(r.dfa.states) && s > 0 {
+		return true
+	}
+	return false
+}
+
+// WillAlwaysMatch returns if the specified state will always end in a
+// matching state.
+func (r *Regexp) WillAlwaysMatch(int) bool {
+	return false
+}
+
+// Accept returns the new state, resulting from the transite byte b
+// when currently in the state s.
+func (r *Regexp) Accept(s int, b byte) int {
+	if s < len(r.dfa.states) {
+		return r.dfa.states[s].next[b]
+	}
+	return 0
+}
--- a/vendor/github.com/couchbase/vellum/regexp/sparse.go
+++ b/vendor/github.com/couchbase/vellum/regexp/sparse.go
@ -0,0 +1,54 @@
+//  Copyright (c) 2017 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+type sparseSet struct {
+	dense  []uint
+	sparse []uint
+	size   uint
+}
+
+func newSparseSet(size uint) *sparseSet {
+	return &sparseSet{
+		dense:  make([]uint, size),
+		sparse: make([]uint, size),
+		size:   0,
+	}
+}
+
+func (s *sparseSet) Len() int {
+	return int(s.size)
+}
+
+func (s *sparseSet) Add(ip uint) uint {
+	i := s.size
+	s.dense[i] = ip
+	s.sparse[ip] = i
+	s.size++
+	return i
+}
+
+func (s *sparseSet) Get(i uint) uint {
+	return s.dense[i]
+}
+
+func (s *sparseSet) Contains(ip uint) bool {
+	i := s.sparse[ip]
+	return i < s.size && s.dense[i] == ip
+}
+
+func (s *sparseSet) Clear() {
+	s.size = 0
+}