Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revision
This commit is contained in:
parent
11e316654e
commit
a380cfd8e0
|
@ -40,14 +40,6 @@
|
|||
revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0"
|
||||
version = "v0.4.7"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace"
|
||||
name = "github.com/Smerity/govarint"
|
||||
packages = ["."]
|
||||
pruneopts = "NUT"
|
||||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146"
|
||||
|
@ -98,7 +90,8 @@
|
|||
revision = "3a771d992973f24aa725d07868b467d1ddfceafb"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32"
|
||||
branch = "master"
|
||||
digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2"
|
||||
name = "github.com/blevesearch/bleve"
|
||||
packages = [
|
||||
".",
|
||||
|
@ -121,7 +114,6 @@
|
|||
"index/scorch",
|
||||
"index/scorch/mergeplan",
|
||||
"index/scorch/segment",
|
||||
"index/scorch/segment/mem",
|
||||
"index/scorch/segment/zap",
|
||||
"index/store",
|
||||
"index/store/boltdb",
|
||||
|
@ -141,9 +133,10 @@
|
|||
"search/query",
|
||||
"search/scorer",
|
||||
"search/searcher",
|
||||
"size",
|
||||
]
|
||||
pruneopts = "NUT"
|
||||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026"
|
||||
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
|
@ -160,14 +153,6 @@
|
|||
pruneopts = "NUT"
|
||||
revision = "db70c57796cc8c310613541dfade3dce627d09c7"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb"
|
||||
name = "github.com/boltdb/bolt"
|
||||
packages = ["."]
|
||||
pruneopts = "NUT"
|
||||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
|
||||
source = "github.com/go-gitea/bolt"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5"
|
||||
name = "github.com/boombuler/barcode"
|
||||
|
@ -217,15 +202,16 @@
|
|||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6"
|
||||
digest = "1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215"
|
||||
name = "github.com/couchbase/vellum"
|
||||
packages = [
|
||||
".",
|
||||
"levenshtein2",
|
||||
"regexp",
|
||||
"utf8",
|
||||
]
|
||||
pruneopts = "NUT"
|
||||
revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4"
|
||||
revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65"
|
||||
|
||||
[[projects]]
|
||||
branch = "master"
|
||||
|
@ -287,6 +273,14 @@
|
|||
revision = "1615341f118ae12f353cc8a983f35b584342c9b3"
|
||||
version = "v1.12.0"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d"
|
||||
name = "github.com/etcd-io/bbolt"
|
||||
packages = ["."]
|
||||
pruneopts = "NUT"
|
||||
revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461"
|
||||
version = "v1.3.2"
|
||||
|
||||
[[projects]]
|
||||
digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48"
|
||||
name = "github.com/ethantkoenig/rupture"
|
||||
|
|
|
@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"]
|
|||
name = "code.gitea.io/sdk"
|
||||
|
||||
[[constraint]]
|
||||
# branch = "master"
|
||||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026"
|
||||
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
|
||||
name = "github.com/blevesearch/bleve"
|
||||
#Not targetting v0.7.0 since standard where use only just after this tag
|
||||
|
||||
[[constraint]]
|
||||
revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e"
|
||||
|
@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"]
|
|||
name = "gopkg.in/testfixtures.v2"
|
||||
version = "2.0.0"
|
||||
|
||||
[[override]]
|
||||
name = "github.com/boltdb/bolt"
|
||||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
|
||||
source = "github.com/go-gitea/bolt"
|
||||
|
||||
[[override]]
|
||||
branch = "master"
|
||||
name = "golang.org/x/oauth2"
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Stephen Merity
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
@ -1,229 +0,0 @@
|
|||
package govarint
|
||||
|
||||
import "encoding/binary"
|
||||
import "io"
|
||||
|
||||
type U32VarintEncoder interface {
|
||||
PutU32(x uint32) int
|
||||
Close()
|
||||
}
|
||||
|
||||
type U32VarintDecoder interface {
|
||||
GetU32() (uint32, error)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U64VarintEncoder interface {
|
||||
PutU64(x uint64) int
|
||||
Close()
|
||||
}
|
||||
|
||||
type U64VarintDecoder interface {
|
||||
GetU64() (uint64, error)
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U32GroupVarintEncoder struct {
|
||||
w io.Writer
|
||||
index int
|
||||
store [4]uint32
|
||||
temp [17]byte
|
||||
}
|
||||
|
||||
func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} }
|
||||
|
||||
func (b *U32GroupVarintEncoder) Flush() (int, error) {
|
||||
// TODO: Is it more efficient to have a tailored version that's called only in Close()?
|
||||
// If index is zero, there are no integers to flush
|
||||
if b.index == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
// In the case we're flushing (the group isn't of size four), the non-values should be zero
|
||||
// This ensures the unused entries are all zero in the sizeByte
|
||||
for i := b.index; i < 4; i++ {
|
||||
b.store[i] = 0
|
||||
}
|
||||
length := 1
|
||||
// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it
|
||||
b.temp[0] = 0
|
||||
for i, x := range b.store {
|
||||
size := byte(0)
|
||||
shifts := []byte{24, 16, 8, 0}
|
||||
for _, shift := range shifts {
|
||||
// Always writes at least one byte -- the first one (shift = 0)
|
||||
// Will write more bytes until the rest of the integer is all zeroes
|
||||
if (x>>shift) != 0 || shift == 0 {
|
||||
size += 1
|
||||
b.temp[length] = byte(x >> shift)
|
||||
length += 1
|
||||
}
|
||||
}
|
||||
// We store the size in two of the eight bits in the first byte (sizeByte)
|
||||
// 0 means there is one byte in total, hence why we subtract one from size
|
||||
b.temp[0] |= (size - 1) << (uint8(3-i) * 2)
|
||||
}
|
||||
// If we're flushing without a full group of four, remove the unused bytes we computed
|
||||
// This enables us to realize it's a partial group on decoding thanks to EOF
|
||||
if b.index != 4 {
|
||||
length -= 4 - b.index
|
||||
}
|
||||
_, err := b.w.Write(b.temp[:length])
|
||||
return length, err
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
|
||||
bytesWritten := 0
|
||||
b.store[b.index] = x
|
||||
b.index += 1
|
||||
if b.index == 4 {
|
||||
n, err := b.Flush()
|
||||
if err != nil {
|
||||
return n, err
|
||||
}
|
||||
bytesWritten += n
|
||||
b.index = 0
|
||||
}
|
||||
return bytesWritten, nil
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintEncoder) Close() {
|
||||
// On Close, we flush any remaining values that might not have been in a full group
|
||||
b.Flush()
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type U32GroupVarintDecoder struct {
|
||||
r io.ByteReader
|
||||
group [4]uint32
|
||||
pos int
|
||||
finished bool
|
||||
capacity int
|
||||
}
|
||||
|
||||
func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
|
||||
return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintDecoder) getGroup() error {
|
||||
// We should always receive a sizeByte if there are more values to read
|
||||
sizeByte, err := b.r.ReadByte()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Calculate the size of the four incoming 32 bit integers
|
||||
// 0b00 means 1 byte to read, 0b01 = 2, etc
|
||||
b.group[0] = uint32((sizeByte >> 6) & 3)
|
||||
b.group[1] = uint32((sizeByte >> 4) & 3)
|
||||
b.group[2] = uint32((sizeByte >> 2) & 3)
|
||||
b.group[3] = uint32(sizeByte & 3)
|
||||
//
|
||||
for index, size := range b.group {
|
||||
b.group[index] = 0
|
||||
// Any error that occurs in earlier byte reads should be repeated at the end one
|
||||
// Hence we only catch and report the final ReadByte's error
|
||||
var err error
|
||||
switch size {
|
||||
case 0:
|
||||
var x byte
|
||||
x, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)
|
||||
case 1:
|
||||
var x, y byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<8 | uint32(y)
|
||||
case 2:
|
||||
var x, y, z byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, _ = b.r.ReadByte()
|
||||
z, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z)
|
||||
case 3:
|
||||
var x, y, z, zz byte
|
||||
x, _ = b.r.ReadByte()
|
||||
y, _ = b.r.ReadByte()
|
||||
z, _ = b.r.ReadByte()
|
||||
zz, err = b.r.ReadByte()
|
||||
b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz)
|
||||
}
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
// If we hit EOF here, we have found a partial group
|
||||
// We've return any valid entries we have read and return EOF once we run out
|
||||
b.capacity = index
|
||||
b.finished = true
|
||||
break
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
// Reset the pos pointer to the beginning of the read values
|
||||
b.pos = 0
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
|
||||
// Check if we have any more values to give out - if not, let's get them
|
||||
if b.pos == b.capacity {
|
||||
// If finished is set, there is nothing else to do
|
||||
if b.finished {
|
||||
return 0, io.EOF
|
||||
}
|
||||
err := b.getGroup()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
// Increment pointer and return the value stored at that point
|
||||
b.pos += 1
|
||||
return b.group[b.pos-1], nil
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type Base128Encoder struct {
|
||||
w io.Writer
|
||||
tmpBytes []byte
|
||||
}
|
||||
|
||||
func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
|
||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)}
|
||||
}
|
||||
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
|
||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) PutU32(x uint32) (int, error) {
|
||||
writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x))
|
||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) PutU64(x uint64) (int, error) {
|
||||
writtenBytes := binary.PutUvarint(b.tmpBytes, x)
|
||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
||||
}
|
||||
|
||||
func (b *Base128Encoder) Close() {
|
||||
}
|
||||
|
||||
///
|
||||
|
||||
type Base128Decoder struct {
|
||||
r io.ByteReader
|
||||
}
|
||||
|
||||
func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
||||
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
||||
|
||||
func (b *Base128Decoder) GetU32() (uint32, error) {
|
||||
v, err := binary.ReadUvarint(b.r)
|
||||
return uint32(v), err
|
||||
}
|
||||
|
||||
func (b *Base128Decoder) GetU64() (uint64, error) {
|
||||
return binary.ReadUvarint(b.r)
|
||||
}
|
|
@ -14,6 +14,22 @@
|
|||
|
||||
package analysis
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTokenLocation int
|
||||
var reflectStaticSizeTokenFreq int
|
||||
|
||||
func init() {
|
||||
var tl TokenLocation
|
||||
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
|
||||
var tf TokenFreq
|
||||
reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
|
||||
}
|
||||
|
||||
// TokenLocation represents one occurrence of a term at a particular location in
|
||||
// a field. Start, End and Position have the same meaning as in analysis.Token.
|
||||
// Field and ArrayPositions identify the field value in the source document.
|
||||
|
@ -26,6 +42,12 @@ type TokenLocation struct {
|
|||
Position int
|
||||
}
|
||||
|
||||
func (tl *TokenLocation) Size() int {
|
||||
rv := reflectStaticSizeTokenLocation
|
||||
rv += len(tl.ArrayPositions) * size.SizeOfUint64
|
||||
return rv
|
||||
}
|
||||
|
||||
// TokenFreq represents all the occurrences of a term in all fields of a
|
||||
// document.
|
||||
type TokenFreq struct {
|
||||
|
@ -34,6 +56,15 @@ type TokenFreq struct {
|
|||
frequency int
|
||||
}
|
||||
|
||||
func (tf *TokenFreq) Size() int {
|
||||
rv := reflectStaticSizeTokenFreq
|
||||
rv += len(tf.Term)
|
||||
for _, loc := range tf.Locations {
|
||||
rv += loc.Size()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (tf *TokenFreq) Frequency() int {
|
||||
return tf.frequency
|
||||
}
|
||||
|
@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int {
|
|||
// fields.
|
||||
type TokenFrequencies map[string]*TokenFreq
|
||||
|
||||
func (tfs TokenFrequencies) Size() int {
|
||||
rv := size.SizeOfMap
|
||||
rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
|
||||
for k, v := range tfs {
|
||||
rv += len(k)
|
||||
rv += v.Size()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
|
||||
// walk the new token frequencies
|
||||
for tfk, tf := range other {
|
||||
|
|
|
@ -46,11 +46,11 @@ type Parser struct {
|
|||
index int
|
||||
}
|
||||
|
||||
func NewParser(len, position, index int) *Parser {
|
||||
func NewParser(length, position, index int) *Parser {
|
||||
return &Parser{
|
||||
bufferLen: len,
|
||||
buffer: make([]rune, 0, len),
|
||||
tokens: make([]*analysis.Token, 0, len),
|
||||
bufferLen: length,
|
||||
buffer: make([]rune, 0, length),
|
||||
tokens: make([]*analysis.Token, 0, length),
|
||||
position: position,
|
||||
index: index,
|
||||
}
|
||||
|
|
|
@ -21,7 +21,7 @@ import (
|
|||
|
||||
const Name = "unique"
|
||||
|
||||
// UniqueTermFilter retains only the tokens which mark the first occurence of
|
||||
// UniqueTermFilter retains only the tokens which mark the first occurrence of
|
||||
// a term. Tokens whose term appears in a preceding token are dropped.
|
||||
type UniqueTermFilter struct{}
|
||||
|
||||
|
|
|
@ -14,7 +14,19 @@
|
|||
|
||||
package document
|
||||
|
||||
import "fmt"
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDocument int
|
||||
|
||||
func init() {
|
||||
var d Document
|
||||
reflectStaticSizeDocument = int(reflect.TypeOf(d).Size())
|
||||
}
|
||||
|
||||
type Document struct {
|
||||
ID string `json:"id"`
|
||||
|
@ -30,6 +42,21 @@ func NewDocument(id string) *Document {
|
|||
}
|
||||
}
|
||||
|
||||
func (d *Document) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
|
||||
len(d.ID)
|
||||
|
||||
for _, entry := range d.Fields {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range d.CompositeFields {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (d *Document) AddField(f Field) *Document {
|
||||
switch f := f.(type) {
|
||||
case *CompositeField:
|
||||
|
|
|
@ -36,4 +36,6 @@ type Field interface {
|
|||
// that this field represents - this is a common metric for tracking
|
||||
// the rate of indexing
|
||||
NumPlainTextBytes() uint64
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
|
|
@ -16,10 +16,19 @@ package document
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeBooleanField int
|
||||
|
||||
func init() {
|
||||
var f BooleanField
|
||||
reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
|
||||
|
||||
type BooleanField struct {
|
||||
|
@ -30,6 +39,13 @@ type BooleanField struct {
|
|||
numPlainTextBytes uint64
|
||||
}
|
||||
|
||||
func (b *BooleanField) Size() int {
|
||||
return reflectStaticSizeBooleanField + size.SizeOfPtr +
|
||||
len(b.name) +
|
||||
len(b.arrayPositions)*size.SizeOfUint64 +
|
||||
len(b.value)
|
||||
}
|
||||
|
||||
func (b *BooleanField) Name() string {
|
||||
return b.name
|
||||
}
|
||||
|
|
|
@ -15,9 +15,19 @@
|
|||
package document
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeCompositeField int
|
||||
|
||||
func init() {
|
||||
var cf CompositeField
|
||||
reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size())
|
||||
}
|
||||
|
||||
const DefaultCompositeIndexingOptions = IndexField
|
||||
|
||||
type CompositeField struct {
|
||||
|
@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl
|
|||
return rv
|
||||
}
|
||||
|
||||
func (c *CompositeField) Size() int {
|
||||
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
|
||||
len(c.name)
|
||||
|
||||
for k, _ := range c.includedFields {
|
||||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||
}
|
||||
|
||||
for k, _ := range c.excludedFields {
|
||||
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (c *CompositeField) Name() string {
|
||||
return c.name
|
||||
}
|
||||
|
|
|
@ -17,12 +17,21 @@ package document
|
|||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/numeric"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDateTimeField int
|
||||
|
||||
func init() {
|
||||
var f DateTimeField
|
||||
reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
|
||||
const DefaultDateTimePrecisionStep uint = 4
|
||||
|
||||
|
@ -37,6 +46,12 @@ type DateTimeField struct {
|
|||
numPlainTextBytes uint64
|
||||
}
|
||||
|
||||
func (n *DateTimeField) Size() int {
|
||||
return reflectStaticSizeDateTimeField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
func (n *DateTimeField) Name() string {
|
||||
return n.name
|
||||
}
|
||||
|
|
|
@ -16,12 +16,21 @@ package document
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/geo"
|
||||
"github.com/blevesearch/bleve/numeric"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeGeoPointField int
|
||||
|
||||
func init() {
|
||||
var f GeoPointField
|
||||
reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
var GeoPrecisionStep uint = 9
|
||||
|
||||
type GeoPointField struct {
|
||||
|
@ -32,6 +41,12 @@ type GeoPointField struct {
|
|||
numPlainTextBytes uint64
|
||||
}
|
||||
|
||||
func (n *GeoPointField) Size() int {
|
||||
return reflectStaticSizeGeoPointField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
func (n *GeoPointField) Name() string {
|
||||
return n.name
|
||||
}
|
||||
|
|
|
@ -16,11 +16,20 @@ package document
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/numeric"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeNumericField int
|
||||
|
||||
func init() {
|
||||
var f NumericField
|
||||
reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
|
||||
|
||||
const DefaultPrecisionStep uint = 4
|
||||
|
@ -33,6 +42,12 @@ type NumericField struct {
|
|||
numPlainTextBytes uint64
|
||||
}
|
||||
|
||||
func (n *NumericField) Size() int {
|
||||
return reflectStaticSizeNumericField + size.SizeOfPtr +
|
||||
len(n.name) +
|
||||
len(n.arrayPositions)*size.SizeOfPtr
|
||||
}
|
||||
|
||||
func (n *NumericField) Name() string {
|
||||
return n.name
|
||||
}
|
||||
|
|
|
@ -16,10 +16,19 @@ package document
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTextField int
|
||||
|
||||
func init() {
|
||||
var f TextField
|
||||
reflectStaticSizeTextField = int(reflect.TypeOf(f).Size())
|
||||
}
|
||||
|
||||
const DefaultTextIndexingOptions = IndexField | DocValues
|
||||
|
||||
type TextField struct {
|
||||
|
@ -31,6 +40,13 @@ type TextField struct {
|
|||
numPlainTextBytes uint64
|
||||
}
|
||||
|
||||
func (t *TextField) Size() int {
|
||||
return reflectStaticSizeTextField + size.SizeOfPtr +
|
||||
len(t.name) +
|
||||
len(t.arrayPositions)*size.SizeOfUint64 +
|
||||
len(t.value)
|
||||
}
|
||||
|
||||
func (t *TextField) Name() string {
|
||||
return t.name
|
||||
}
|
||||
|
|
|
@ -0,0 +1,174 @@
|
|||
// The code here was obtained from:
|
||||
// https://github.com/mmcloughlin/geohash
|
||||
|
||||
// The MIT License (MIT)
|
||||
// Copyright (c) 2015 Michael McLoughlin
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
package geo
|
||||
|
||||
import (
|
||||
"math"
|
||||
)
|
||||
|
||||
// encoding encapsulates an encoding defined by a given base32 alphabet.
|
||||
type encoding struct {
|
||||
enc string
|
||||
dec [256]byte
|
||||
}
|
||||
|
||||
// newEncoding constructs a new encoding defined by the given alphabet,
|
||||
// which must be a 32-byte string.
|
||||
func newEncoding(encoder string) *encoding {
|
||||
e := new(encoding)
|
||||
e.enc = encoder
|
||||
for i := 0; i < len(e.dec); i++ {
|
||||
e.dec[i] = 0xff
|
||||
}
|
||||
for i := 0; i < len(encoder); i++ {
|
||||
e.dec[encoder[i]] = byte(i)
|
||||
}
|
||||
return e
|
||||
}
|
||||
|
||||
// Decode string into bits of a 64-bit word. The string s may be at most 12
|
||||
// characters.
|
||||
func (e *encoding) decode(s string) uint64 {
|
||||
x := uint64(0)
|
||||
for i := 0; i < len(s); i++ {
|
||||
x = (x << 5) | uint64(e.dec[s[i]])
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
// Encode bits of 64-bit word into a string.
|
||||
func (e *encoding) encode(x uint64) string {
|
||||
b := [12]byte{}
|
||||
for i := 0; i < 12; i++ {
|
||||
b[11-i] = e.enc[x&0x1f]
|
||||
x >>= 5
|
||||
}
|
||||
return string(b[:])
|
||||
}
|
||||
|
||||
// Base32Encoding with the Geohash alphabet.
|
||||
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")
|
||||
|
||||
// BoundingBox returns the region encoded by the given string geohash.
|
||||
func geoBoundingBox(hash string) geoBox {
|
||||
bits := uint(5 * len(hash))
|
||||
inthash := base32encoding.decode(hash)
|
||||
return geoBoundingBoxIntWithPrecision(inthash, bits)
|
||||
}
|
||||
|
||||
// Box represents a rectangle in latitude/longitude space.
|
||||
type geoBox struct {
|
||||
minLat float64
|
||||
maxLat float64
|
||||
minLng float64
|
||||
maxLng float64
|
||||
}
|
||||
|
||||
// Round returns a point inside the box, making an effort to round to minimal
|
||||
// precision.
|
||||
func (b geoBox) round() (lat, lng float64) {
|
||||
x := maxDecimalPower(b.maxLat - b.minLat)
|
||||
lat = math.Ceil(b.minLat/x) * x
|
||||
x = maxDecimalPower(b.maxLng - b.minLng)
|
||||
lng = math.Ceil(b.minLng/x) * x
|
||||
return
|
||||
}
|
||||
|
||||
// precalculated for performance
|
||||
var exp232 = math.Exp2(32)
|
||||
|
||||
// errorWithPrecision returns the error range in latitude and longitude for in
|
||||
// integer geohash with bits of precision.
|
||||
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
|
||||
b := int(bits)
|
||||
latBits := b / 2
|
||||
lngBits := b - latBits
|
||||
latErr = math.Ldexp(180.0, -latBits)
|
||||
lngErr = math.Ldexp(360.0, -lngBits)
|
||||
return
|
||||
}
|
||||
|
||||
// minDecimalPlaces returns the minimum number of decimal places such that
|
||||
// there must exist an number with that many places within any range of width
|
||||
// r. This is intended for returning minimal precision coordinates inside a
|
||||
// box.
|
||||
func maxDecimalPower(r float64) float64 {
|
||||
m := int(math.Floor(math.Log10(r)))
|
||||
return math.Pow10(m)
|
||||
}
|
||||
|
||||
// Encode the position of x within the range -r to +r as a 32-bit integer.
|
||||
func encodeRange(x, r float64) uint32 {
|
||||
p := (x + r) / (2 * r)
|
||||
return uint32(p * exp232)
|
||||
}
|
||||
|
||||
// Decode the 32-bit range encoding X back to a value in the range -r to +r.
|
||||
func decodeRange(X uint32, r float64) float64 {
|
||||
p := float64(X) / exp232
|
||||
x := 2*r*p - r
|
||||
return x
|
||||
}
|
||||
|
||||
// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are
|
||||
// ignored, and may take any value.
|
||||
func squash(X uint64) uint32 {
|
||||
X &= 0x5555555555555555
|
||||
X = (X | (X >> 1)) & 0x3333333333333333
|
||||
X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f
|
||||
X = (X | (X >> 4)) & 0x00ff00ff00ff00ff
|
||||
X = (X | (X >> 8)) & 0x0000ffff0000ffff
|
||||
X = (X | (X >> 16)) & 0x00000000ffffffff
|
||||
return uint32(X)
|
||||
}
|
||||
|
||||
// Deinterleave the bits of X into 32-bit words containing the even and odd
|
||||
// bitlevels of X, respectively.
|
||||
func deinterleave(X uint64) (uint32, uint32) {
|
||||
return squash(X), squash(X >> 1)
|
||||
}
|
||||
|
||||
// BoundingBoxIntWithPrecision returns the region encoded by the integer
|
||||
// geohash with the specified precision.
|
||||
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
|
||||
fullHash := hash << (64 - bits)
|
||||
latInt, lngInt := deinterleave(fullHash)
|
||||
lat := decodeRange(latInt, 90)
|
||||
lng := decodeRange(lngInt, 180)
|
||||
latErr, lngErr := errorWithPrecision(bits)
|
||||
return geoBox{
|
||||
minLat: lat,
|
||||
maxLat: lat + latErr,
|
||||
minLng: lng,
|
||||
maxLng: lng + lngErr,
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Decode the string geohash to a (lat, lng) point.
|
||||
func GeoHashDecode(hash string) (lat, lng float64) {
|
||||
box := geoBoundingBox(hash)
|
||||
return box.round()
|
||||
}
|
|
@ -16,6 +16,7 @@ package geo
|
|||
|
||||
import (
|
||||
"reflect"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
|
@ -24,6 +25,8 @@ import (
|
|||
// Container:
|
||||
// slice length 2 (GeoJSON)
|
||||
// first element lon, second element lat
|
||||
// string (coordinates separated by comma, or a geohash)
|
||||
// first element lat, second element lon
|
||||
// map[string]interface{}
|
||||
// exact keys lat and lon or lng
|
||||
// struct
|
||||
|
@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
|||
var foundLon, foundLat bool
|
||||
|
||||
thingVal := reflect.ValueOf(thing)
|
||||
if !thingVal.IsValid() {
|
||||
return lon, lat, false
|
||||
}
|
||||
|
||||
thingTyp := thingVal.Type()
|
||||
|
||||
// is it a slice
|
||||
if thingVal.IsValid() && thingVal.Kind() == reflect.Slice {
|
||||
if thingVal.Kind() == reflect.Slice {
|
||||
// must be length 2
|
||||
if thingVal.Len() == 2 {
|
||||
first := thingVal.Index(0)
|
||||
|
@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
|||
}
|
||||
}
|
||||
|
||||
// is it a string
|
||||
if thingVal.Kind() == reflect.String {
|
||||
geoStr := thingVal.Interface().(string)
|
||||
if strings.Contains(geoStr, ",") {
|
||||
// geo point with coordinates split by comma
|
||||
points := strings.Split(geoStr, ",")
|
||||
for i, point := range points {
|
||||
// trim any leading or trailing white spaces
|
||||
points[i] = strings.TrimSpace(point)
|
||||
}
|
||||
if len(points) == 2 {
|
||||
var err error
|
||||
lat, err = strconv.ParseFloat(points[0], 64)
|
||||
if err == nil {
|
||||
foundLat = true
|
||||
}
|
||||
lon, err = strconv.ParseFloat(points[1], 64)
|
||||
if err == nil {
|
||||
foundLon = true
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// geohash
|
||||
lat, lon = GeoHashDecode(geoStr)
|
||||
foundLat = true
|
||||
foundLon = true
|
||||
}
|
||||
}
|
||||
|
||||
// is it a map
|
||||
if l, ok := thing.(map[string]interface{}); ok {
|
||||
if lval, ok := l["lon"]; ok {
|
||||
|
@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
|||
}
|
||||
|
||||
// now try reflection on struct fields
|
||||
if thingVal.IsValid() && thingVal.Kind() == reflect.Struct {
|
||||
if thingVal.Kind() == reflect.Struct {
|
||||
for i := 0; i < thingVal.NumField(); i++ {
|
||||
fieldName := thingTyp.Field(i).Name
|
||||
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
|
||||
|
@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
|||
// extract numeric value (if possible) and returns a float64
|
||||
func extractNumericVal(v interface{}) (float64, bool) {
|
||||
val := reflect.ValueOf(v)
|
||||
if !val.IsValid() {
|
||||
return 0, false
|
||||
}
|
||||
typ := val.Type()
|
||||
switch typ.Kind() {
|
||||
case reflect.Float32, reflect.Float64:
|
||||
|
|
|
@ -21,6 +21,7 @@ import (
|
|||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/mapping"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
// A Batch groups together multiple Index and Delete
|
||||
|
@ -32,6 +33,9 @@ import (
|
|||
type Batch struct {
|
||||
index Index
|
||||
internal *index.Batch
|
||||
|
||||
lastDocSize uint64
|
||||
totalSize uint64
|
||||
}
|
||||
|
||||
// Index adds the specified index operation to the
|
||||
|
@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error {
|
|||
return err
|
||||
}
|
||||
b.internal.Update(doc)
|
||||
|
||||
b.lastDocSize = uint64(doc.Size() +
|
||||
len(id) + size.SizeOfString) // overhead from internal
|
||||
b.totalSize += b.lastDocSize
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (b *Batch) LastDocSize() uint64 {
|
||||
return b.lastDocSize
|
||||
}
|
||||
|
||||
func (b *Batch) TotalDocsSize() uint64 {
|
||||
return b.totalSize
|
||||
}
|
||||
|
||||
// IndexAdvanced adds the specified index operation to the
|
||||
// batch which skips the mapping. NOTE: the bleve Index is not updated
|
||||
// until the batch is executed.
|
||||
|
@ -102,6 +119,24 @@ func (b *Batch) Reset() {
|
|||
b.internal.Reset()
|
||||
}
|
||||
|
||||
func (b *Batch) Merge(o *Batch) {
|
||||
if o != nil && o.internal != nil {
|
||||
b.internal.Merge(o.internal)
|
||||
if o.LastDocSize() > 0 {
|
||||
b.lastDocSize = o.LastDocSize()
|
||||
}
|
||||
b.totalSize = uint64(b.internal.TotalDocSize())
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Batch) SetPersistedCallback(f index.BatchCallback) {
|
||||
b.internal.SetPersistedCallback(f)
|
||||
}
|
||||
|
||||
func (b *Batch) PersistedCallback() index.BatchCallback {
|
||||
return b.internal.PersistedCallback()
|
||||
}
|
||||
|
||||
// An Index implements all the indexing and searching
|
||||
// capabilities of bleve. An Index can be created
|
||||
// using the New() and Open() methods.
|
||||
|
|
|
@ -15,10 +15,20 @@
|
|||
package index
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeAnalysisResult int
|
||||
|
||||
func init() {
|
||||
var ar AnalysisResult
|
||||
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size())
|
||||
}
|
||||
|
||||
type IndexRow interface {
|
||||
KeySize() int
|
||||
KeyTo([]byte) (int, error)
|
||||
|
@ -39,6 +49,15 @@ type AnalysisResult struct {
|
|||
Length []int
|
||||
}
|
||||
|
||||
func (a *AnalysisResult) Size() int {
|
||||
rv := reflectStaticSizeAnalysisResult
|
||||
for _, analyzedI := range a.Analyzed {
|
||||
rv += analyzedI.Size()
|
||||
}
|
||||
rv += len(a.Length) * size.SizeOfInt
|
||||
return rv
|
||||
}
|
||||
|
||||
type AnalysisWork struct {
|
||||
i Index
|
||||
d *document.Document
|
||||
|
|
|
@ -18,11 +18,23 @@ import (
|
|||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermFieldDoc int
|
||||
var reflectStaticSizeTermFieldVector int
|
||||
|
||||
func init() {
|
||||
var tfd TermFieldDoc
|
||||
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
|
||||
var tfv TermFieldVector
|
||||
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
|
||||
}
|
||||
|
||||
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
|
||||
|
||||
type Index interface {
|
||||
|
@ -68,6 +80,8 @@ type IndexReader interface {
|
|||
Document(id string) (*document.Document, error)
|
||||
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
|
||||
|
||||
DocValueReader(fields []string) (DocValueReader, error)
|
||||
|
||||
Fields() ([]string, error)
|
||||
|
||||
GetInternal(key []byte) ([]byte, error)
|
||||
|
@ -84,6 +98,29 @@ type IndexReader interface {
|
|||
Close() error
|
||||
}
|
||||
|
||||
// The Regexp interface defines the subset of the regexp.Regexp API
|
||||
// methods that are used by bleve indexes, allowing callers to pass in
|
||||
// alternate implementations.
|
||||
type Regexp interface {
|
||||
FindStringIndex(s string) (loc []int)
|
||||
|
||||
LiteralPrefix() (prefix string, complete bool)
|
||||
|
||||
String() string
|
||||
}
|
||||
|
||||
type IndexReaderRegexp interface {
|
||||
FieldDictRegexp(field string, regex string) (FieldDict, error)
|
||||
}
|
||||
|
||||
type IndexReaderFuzzy interface {
|
||||
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
|
||||
}
|
||||
|
||||
type IndexReaderOnly interface {
|
||||
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
|
||||
}
|
||||
|
||||
// FieldTerms contains the terms used by a document, keyed by field
|
||||
type FieldTerms map[string][]string
|
||||
|
||||
|
@ -115,6 +152,11 @@ type TermFieldVector struct {
|
|||
End uint64
|
||||
}
|
||||
|
||||
func (tfv *TermFieldVector) Size() int {
|
||||
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
|
||||
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
// IndexInternalID is an opaque document identifier interal to the index impl
|
||||
type IndexInternalID []byte
|
||||
|
||||
|
@ -134,14 +176,27 @@ type TermFieldDoc struct {
|
|||
Vectors []*TermFieldVector
|
||||
}
|
||||
|
||||
func (tfd *TermFieldDoc) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
|
||||
len(tfd.Term) + len(tfd.ID)
|
||||
|
||||
for _, entry := range tfd.Vectors {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
// Reset allows an already allocated TermFieldDoc to be reused
|
||||
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
|
||||
// remember the []byte used for the ID
|
||||
id := tfd.ID
|
||||
vectors := tfd.Vectors
|
||||
// idiom to copy over from empty TermFieldDoc (0 allocations)
|
||||
*tfd = TermFieldDoc{}
|
||||
// reuse the []byte already allocated (and reset len to 0)
|
||||
tfd.ID = id[:0]
|
||||
tfd.Vectors = vectors[:0]
|
||||
return tfd
|
||||
}
|
||||
|
||||
|
@ -161,6 +216,8 @@ type TermFieldReader interface {
|
|||
// Count returns the number of documents contains the term in this field.
|
||||
Count() uint64
|
||||
Close() error
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type DictEntry struct {
|
||||
|
@ -185,12 +242,18 @@ type DocIDReader interface {
|
|||
// will start there instead. If ID is greater than or equal to the end of
|
||||
// the range, Next() call will return io.EOF.
|
||||
Advance(ID IndexInternalID) (IndexInternalID, error)
|
||||
|
||||
Size() int
|
||||
|
||||
Close() error
|
||||
}
|
||||
|
||||
type BatchCallback func(error)
|
||||
|
||||
type Batch struct {
|
||||
IndexOps map[string]*document.Document
|
||||
InternalOps map[string][]byte
|
||||
persistedCallback BatchCallback
|
||||
}
|
||||
|
||||
func NewBatch() *Batch {
|
||||
|
@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) {
|
|||
b.InternalOps[string(key)] = nil
|
||||
}
|
||||
|
||||
func (b *Batch) SetPersistedCallback(f BatchCallback) {
|
||||
b.persistedCallback = f
|
||||
}
|
||||
|
||||
func (b *Batch) PersistedCallback() BatchCallback {
|
||||
return b.persistedCallback
|
||||
}
|
||||
|
||||
func (b *Batch) String() string {
|
||||
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
|
||||
for k, v := range b.IndexOps {
|
||||
|
@ -238,4 +309,53 @@ func (b *Batch) String() string {
|
|||
func (b *Batch) Reset() {
|
||||
b.IndexOps = make(map[string]*document.Document)
|
||||
b.InternalOps = make(map[string][]byte)
|
||||
b.persistedCallback = nil
|
||||
}
|
||||
|
||||
func (b *Batch) Merge(o *Batch) {
|
||||
for k, v := range o.IndexOps {
|
||||
b.IndexOps[k] = v
|
||||
}
|
||||
for k, v := range o.InternalOps {
|
||||
b.InternalOps[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Batch) TotalDocSize() int {
|
||||
var s int
|
||||
for k, v := range b.IndexOps {
|
||||
if v != nil {
|
||||
s += v.Size() + size.SizeOfString
|
||||
}
|
||||
s += len(k)
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
// Optimizable represents an optional interface that implementable by
|
||||
// optimizable resources (e.g., TermFieldReaders, Searchers). These
|
||||
// optimizable resources are provided the same OptimizableContext
|
||||
// instance, so that they can coordinate via dynamic interface
|
||||
// casting.
|
||||
type Optimizable interface {
|
||||
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
|
||||
}
|
||||
|
||||
// Represents a result of optimization -- see the Finish() method.
|
||||
type Optimized interface{}
|
||||
|
||||
type OptimizableContext interface {
|
||||
// Once all the optimzable resources have been provided the same
|
||||
// OptimizableContext instance, the optimization preparations are
|
||||
// finished or completed via the Finish() method.
|
||||
//
|
||||
// Depending on the optimization being performed, the Finish()
|
||||
// method might return a non-nil Optimized instance. For example,
|
||||
// the Optimized instance might represent an optimized
|
||||
// TermFieldReader instance.
|
||||
Finish() (Optimized, error)
|
||||
}
|
||||
|
||||
type DocValueReader interface {
|
||||
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
|
||||
}
|
||||
|
|
|
@ -19,7 +19,9 @@ import (
|
|||
"sync/atomic"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
)
|
||||
|
||||
type segmentIntroduction struct {
|
||||
|
@ -31,6 +33,12 @@ type segmentIntroduction struct {
|
|||
|
||||
applied chan error
|
||||
persisted chan error
|
||||
persistedCallback index.BatchCallback
|
||||
}
|
||||
|
||||
type persistIntroduction struct {
|
||||
persisted map[uint64]segment.Segment
|
||||
applied notificationChan
|
||||
}
|
||||
|
||||
type epochWatcher struct {
|
||||
|
@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() {
|
|||
var epochWatchers []*epochWatcher
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
@ -64,6 +74,9 @@ OUTER:
|
|||
continue OUTER
|
||||
}
|
||||
|
||||
case persist := <-s.persists:
|
||||
s.introducePersist(persist)
|
||||
|
||||
case revertTo := <-s.revertToSnapshots:
|
||||
err := s.revertToSnapshot(revertTo)
|
||||
if err != nil {
|
||||
|
@ -92,32 +105,38 @@ OUTER:
|
|||
}
|
||||
|
||||
func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||
// acquire lock
|
||||
s.rootLock.Lock()
|
||||
atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1)
|
||||
|
||||
nsegs := len(s.root.segment)
|
||||
s.rootLock.RLock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
nsegs := len(root.segment)
|
||||
|
||||
// prepare new index snapshot
|
||||
newSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
segment: make([]*SegmentSnapshot, 0, nsegs+1),
|
||||
offsets: make([]uint64, 0, nsegs+1),
|
||||
internal: make(map[string][]byte, len(s.root.internal)),
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
internal: make(map[string][]byte, len(root.internal)),
|
||||
refs: 1,
|
||||
creator: "introduceSegment",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
// iterate through current segments
|
||||
var running uint64
|
||||
for i := range s.root.segment {
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i := range root.segment {
|
||||
// see if optimistic work included this segment
|
||||
delta, ok := next.obsoletes[s.root.segment[i].id]
|
||||
delta, ok := next.obsoletes[root.segment[i].id]
|
||||
if !ok {
|
||||
var err error
|
||||
delta, err = s.root.segment[i].segment.DocNumbers(next.ids)
|
||||
delta, err = root.segment[i].segment.DocNumbers(next.ids)
|
||||
if err != nil {
|
||||
s.rootLock.Unlock()
|
||||
next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
|
||||
close(next.applied)
|
||||
_ = newSnapshot.DecRef()
|
||||
|
@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
}
|
||||
|
||||
newss := &SegmentSnapshot{
|
||||
id: s.root.segment[i].id,
|
||||
segment: s.root.segment[i].segment,
|
||||
cachedDocs: s.root.segment[i].cachedDocs,
|
||||
id: root.segment[i].id,
|
||||
segment: root.segment[i].segment,
|
||||
cachedDocs: root.segment[i].cachedDocs,
|
||||
creator: root.segment[i].creator,
|
||||
}
|
||||
|
||||
// apply new obsoletions
|
||||
if s.root.segment[i].deleted == nil {
|
||||
if root.segment[i].deleted == nil {
|
||||
newss.deleted = delta
|
||||
} else {
|
||||
newss.deleted = roaring.Or(s.root.segment[i].deleted, delta)
|
||||
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
|
||||
}
|
||||
if newss.deleted.IsEmpty() {
|
||||
newss.deleted = nil
|
||||
}
|
||||
|
||||
// check for live size before copying
|
||||
if newss.LiveSize() > 0 {
|
||||
newSnapshot.segment = append(newSnapshot.segment, newss)
|
||||
s.root.segment[i].segment.AddRef()
|
||||
root.segment[i].segment.AddRef()
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
running += s.root.segment[i].Count()
|
||||
running += newss.segment.Count()
|
||||
}
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
// append new segment, if any, to end of the new index snapshot
|
||||
if next.data != nil {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: next.id,
|
||||
segment: next.data, // take ownership of next.data's ref-count
|
||||
cachedDocs: &cachedDocs{cache: nil},
|
||||
creator: "introduceSegment",
|
||||
}
|
||||
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
|
||||
// increment numItemsIntroduced which tracks the number of items
|
||||
// queued for persistence.
|
||||
atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1)
|
||||
}
|
||||
// copy old values
|
||||
for key, oldVal := range s.root.internal {
|
||||
for key, oldVal := range root.internal {
|
||||
newSnapshot.internal[key] = oldVal
|
||||
}
|
||||
// set new values and apply deletes
|
||||
|
@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
delete(newSnapshot.internal, key)
|
||||
}
|
||||
}
|
||||
|
||||
newSnapshot.updateSize()
|
||||
s.rootLock.Lock()
|
||||
if next.persisted != nil {
|
||||
s.rootPersisted = append(s.rootPersisted, next.persisted)
|
||||
}
|
||||
if next.persistedCallback != nil {
|
||||
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback)
|
||||
}
|
||||
// swap in new index snapshot
|
||||
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||
// acquire lock
|
||||
func (s *Scorch) introducePersist(persist *persistIntroduction) {
|
||||
atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1)
|
||||
|
||||
s.rootLock.Lock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
nextSnapshotEpoch := s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
s.rootLock.Unlock()
|
||||
|
||||
// prepare new index snapshot
|
||||
currSize := len(s.root.segment)
|
||||
newSize := currSize + 1 - len(nextMerge.old)
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
// empty segments deletion
|
||||
if nextMerge.new == nil {
|
||||
newSize--
|
||||
newIndexSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
epoch: nextSnapshotEpoch,
|
||||
segment: make([]*SegmentSnapshot, len(root.segment)),
|
||||
offsets: make([]uint64, len(root.offsets)),
|
||||
internal: make(map[string][]byte, len(root.internal)),
|
||||
refs: 1,
|
||||
creator: "introducePersist",
|
||||
}
|
||||
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i, segmentSnapshot := range root.segment {
|
||||
// see if this segment has been replaced
|
||||
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: segmentSnapshot.id,
|
||||
segment: replacement,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
creator: "introducePersist",
|
||||
}
|
||||
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
||||
delete(persist.persisted, segmentSnapshot.id)
|
||||
|
||||
// update items persisted incase of a new segment snapshot
|
||||
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
|
||||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||
fileSegments++
|
||||
} else {
|
||||
newIndexSnapshot.segment[i] = root.segment[i]
|
||||
newIndexSnapshot.segment[i].segment.AddRef()
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
newIndexSnapshot.offsets[i] = root.offsets[i]
|
||||
}
|
||||
|
||||
for k, v := range root.internal {
|
||||
newIndexSnapshot.internal[k] = v
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
newIndexSnapshot.updateSize()
|
||||
s.rootLock.Lock()
|
||||
rootPrev := s.root
|
||||
s.root = newIndexSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
}
|
||||
|
||||
close(persist.applied)
|
||||
}
|
||||
|
||||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
|
||||
|
||||
s.rootLock.RLock()
|
||||
root := s.root
|
||||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
newSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
segment: make([]*SegmentSnapshot, 0, newSize),
|
||||
offsets: make([]uint64, 0, newSize),
|
||||
internal: s.root.internal,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
internal: root.internal,
|
||||
refs: 1,
|
||||
creator: "introduceMerge",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
// iterate through current segments
|
||||
newSegmentDeleted := roaring.NewBitmap()
|
||||
var running uint64
|
||||
for i := range s.root.segment {
|
||||
segmentID := s.root.segment[i].id
|
||||
var running, docsToPersistCount, memSegments, fileSegments uint64
|
||||
for i := range root.segment {
|
||||
segmentID := root.segment[i].id
|
||||
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
|
||||
// this segment is going away, see if anything else was deleted since we started the merge
|
||||
if segSnapAtMerge != nil && s.root.segment[i].deleted != nil {
|
||||
if segSnapAtMerge != nil && root.segment[i].deleted != nil {
|
||||
// assume all these deletes are new
|
||||
deletedSince := s.root.segment[i].deleted
|
||||
deletedSince := root.segment[i].deleted
|
||||
// if we already knew about some of them, remove
|
||||
if segSnapAtMerge.deleted != nil {
|
||||
deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted)
|
||||
deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted)
|
||||
}
|
||||
deletedSinceItr := deletedSince.Iterator()
|
||||
for deletedSinceItr.HasNext() {
|
||||
|
@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
// segments left behind in old map after processing
|
||||
// the root segments would be the obsolete segment set
|
||||
delete(nextMerge.old, segmentID)
|
||||
|
||||
} else if s.root.segment[i].LiveSize() > 0 {
|
||||
} else if root.segment[i].LiveSize() > 0 {
|
||||
// this segment is staying
|
||||
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
|
||||
id: s.root.segment[i].id,
|
||||
segment: s.root.segment[i].segment,
|
||||
deleted: s.root.segment[i].deleted,
|
||||
cachedDocs: s.root.segment[i].cachedDocs,
|
||||
id: root.segment[i].id,
|
||||
segment: root.segment[i].segment,
|
||||
deleted: root.segment[i].deleted,
|
||||
cachedDocs: root.segment[i].cachedDocs,
|
||||
creator: root.segment[i].creator,
|
||||
})
|
||||
s.root.segment[i].segment.AddRef()
|
||||
root.segment[i].segment.AddRef()
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
running += s.root.segment[i].Count()
|
||||
running += root.segment[i].segment.Count()
|
||||
|
||||
if isMemorySegment(root.segment[i]) {
|
||||
docsToPersistCount += root.segment[i].Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// In case where all the docs in the newly merged segment getting
|
||||
// deleted by the time we reach here, can skip the introduction.
|
||||
if nextMerge.new != nil &&
|
||||
|
@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
|
||||
deleted: newSegmentDeleted,
|
||||
cachedDocs: &cachedDocs{cache: nil},
|
||||
creator: "introduceMerge",
|
||||
})
|
||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
|
||||
|
||||
switch nextMerge.new.(type) {
|
||||
case *zap.SegmentBase:
|
||||
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality()
|
||||
memSegments++
|
||||
case *zap.Segment:
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
|
||||
|
||||
// swap in new segment
|
||||
newSnapshot.updateSize()
|
||||
|
||||
s.rootLock.Lock()
|
||||
// swap in new index snapshot
|
||||
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||
s.nextSnapshotEpoch++
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
|||
}
|
||||
|
||||
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
|
||||
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
|
||||
|
||||
if revertTo.snapshot == nil {
|
||||
err := fmt.Errorf("Cannot revert to a nil snapshot")
|
||||
revertTo.applied <- err
|
||||
|
@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
internal: revertTo.snapshot.internal,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
refs: 1,
|
||||
creator: "revertToSnapshot",
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
|
||||
var docsToPersistCount, memSegments, fileSegments uint64
|
||||
// iterate through segments
|
||||
for i, segmentSnapshot := range revertTo.snapshot.segment {
|
||||
newSnapshot.segment[i] = &SegmentSnapshot{
|
||||
|
@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
segment: segmentSnapshot.segment,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
creator: segmentSnapshot.creator,
|
||||
}
|
||||
newSnapshot.segment[i].segment.AddRef()
|
||||
|
||||
// remove segment from ineligibleForRemoval map
|
||||
filename := zapFileName(segmentSnapshot.id)
|
||||
delete(s.ineligibleForRemoval, filename)
|
||||
|
||||
if isMemorySegment(segmentSnapshot) {
|
||||
docsToPersistCount += segmentSnapshot.Count()
|
||||
memSegments++
|
||||
} else {
|
||||
fileSegments++
|
||||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||
|
||||
if revertTo.persisted != nil {
|
||||
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
|
||||
}
|
||||
|
||||
newSnapshot.updateSize()
|
||||
|
||||
// swap in new snapshot
|
||||
rootPrev := s.root
|
||||
s.root = newSnapshot
|
||||
|
||||
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||
// release lock
|
||||
s.rootLock.Unlock()
|
||||
|
||||
|
@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
func isMemorySegment(s *SegmentSnapshot) bool {
|
||||
switch s.segment.(type) {
|
||||
case *zap.SegmentBase:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,9 +15,7 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
|
||||
"fmt"
|
||||
"os"
|
||||
"sync/atomic"
|
||||
|
@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() {
|
|||
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
||||
default:
|
||||
// check to see if there is a new snapshot to persist
|
||||
s.rootLock.RLock()
|
||||
s.rootLock.Lock()
|
||||
ourSnapshot := s.root
|
||||
ourSnapshot.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
|
||||
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if ourSnapshot.epoch != lastEpochMergePlanned {
|
||||
startTime := time.Now()
|
||||
|
@ -57,12 +59,21 @@ OUTER:
|
|||
// lets get started
|
||||
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
|
||||
if err != nil {
|
||||
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
|
||||
if err == segment.ErrClosed {
|
||||
// index has been closed
|
||||
_ = ourSnapshot.DecRef()
|
||||
break OUTER
|
||||
}
|
||||
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
|
||||
_ = ourSnapshot.DecRef()
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1)
|
||||
continue OUTER
|
||||
}
|
||||
lastEpochMergePlanned = ourSnapshot.epoch
|
||||
|
||||
atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch)
|
||||
|
||||
s.fireEvent(EventKindMergerProgress, time.Since(startTime))
|
||||
}
|
||||
_ = ourSnapshot.DecRef()
|
||||
|
@ -88,7 +99,10 @@ OUTER:
|
|||
case <-ew.notifyCh:
|
||||
}
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1)
|
||||
}
|
||||
|
||||
s.asyncTasks.Done()
|
||||
}
|
||||
|
||||
|
@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
|
|||
if err != nil {
|
||||
return &mergePlannerOptions, err
|
||||
}
|
||||
|
||||
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return &mergePlannerOptions, nil
|
||||
}
|
||||
|
@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
}
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
|
||||
|
||||
// give this list to the planner
|
||||
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
|
||||
if err != nil {
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
|
||||
return fmt.Errorf("merge planning err: %v", err)
|
||||
}
|
||||
if resultMergePlan == nil {
|
||||
// nothing to do
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
|
||||
return nil
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
|
||||
|
||||
// process tasks in serial for now
|
||||
var notifications []chan *IndexSnapshot
|
||||
for _, task := range resultMergePlan.Tasks {
|
||||
if len(task.Segments) == 0 {
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
|
||||
|
||||
oldMap := make(map[uint64]*SegmentSnapshot)
|
||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
|
||||
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
|
||||
|
||||
for _, planSegment := range task.Segments {
|
||||
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
|
||||
oldMap[segSnapshot.id] = segSnapshot
|
||||
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
|
||||
if segSnapshot.LiveSize() == 0 {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
|
||||
oldMap[segSnapshot.id] = nil
|
||||
} else {
|
||||
segmentsToMerge = append(segmentsToMerge, zapSeg)
|
||||
|
@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
}
|
||||
|
||||
var oldNewDocNums map[uint64][]uint64
|
||||
var segment segment.Segment
|
||||
var seg segment.Segment
|
||||
if len(segmentsToMerge) > 0 {
|
||||
filename := zapFileName(newSegmentID)
|
||||
s.markIneligibleForRemoval(filename)
|
||||
path := s.path + string(os.PathSeparator) + filename
|
||||
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024)
|
||||
|
||||
fileMergeZapStartTime := time.Now()
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
|
||||
newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path,
|
||||
DefaultChunkFactor, s.closeCh, s)
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
|
||||
|
||||
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
|
||||
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime {
|
||||
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
s.unmarkIneligibleForRemoval(filename)
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||
if err == segment.ErrClosed {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("merging failed: %v", err)
|
||||
}
|
||||
segment, err = zap.Open(path)
|
||||
|
||||
seg, err = zap.Open(path)
|
||||
if err != nil {
|
||||
s.unmarkIneligibleForRemoval(filename)
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||
return err
|
||||
}
|
||||
oldNewDocNums = make(map[uint64][]uint64)
|
||||
for i, segNewDocNums := range newDocNums {
|
||||
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge)))
|
||||
}
|
||||
|
||||
sm := &segmentMerge{
|
||||
id: newSegmentID,
|
||||
old: oldMap,
|
||||
oldNewDocNums: oldNewDocNums,
|
||||
new: segment,
|
||||
new: seg,
|
||||
notify: make(chan *IndexSnapshot, 1),
|
||||
}
|
||||
notifications = append(notifications, sm.notify)
|
||||
|
@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
|||
// give it to the introducer
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
_ = segment.Close()
|
||||
return nil
|
||||
_ = seg.Close()
|
||||
return segment.ErrClosed
|
||||
case s.merges <- sm:
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
|
||||
}
|
||||
|
||||
for _, notification := range notifications {
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return nil
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1)
|
||||
return segment.ErrClosed
|
||||
case newSnapshot := <-notification:
|
||||
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
|
||||
if newSnapshot != nil {
|
||||
_ = newSnapshot.DecRef()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -219,44 +279,48 @@ type segmentMerge struct {
|
|||
// into the root
|
||||
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
||||
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
|
||||
chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) {
|
||||
var br bytes.Buffer
|
||||
chunkFactor uint32) (*IndexSnapshot, uint64, error) {
|
||||
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
|
||||
|
||||
cr := zap.NewCountHashWriter(&br)
|
||||
memMergeZapStartTime := time.Now()
|
||||
|
||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, dictLocs, fieldsInv, fieldsMap, err :=
|
||||
zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
}
|
||||
|
||||
sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
||||
fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, dictLocs)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
}
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1)
|
||||
|
||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||
|
||||
filename := zapFileName(newSegmentID)
|
||||
path := s.path + string(os.PathSeparator) + filename
|
||||
err = zap.PersistSegmentBase(sb, path)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
|
||||
newDocNums, _, err :=
|
||||
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s)
|
||||
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
|
||||
|
||||
memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
|
||||
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime {
|
||||
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime)
|
||||
}
|
||||
|
||||
segment, err := zap.Open(path)
|
||||
if err != nil {
|
||||
return 0, nil, 0, err
|
||||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
seg, err := zap.Open(path)
|
||||
if err != nil {
|
||||
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// update persisted stats
|
||||
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())
|
||||
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||
|
||||
sm := &segmentMerge{
|
||||
id: newSegmentID,
|
||||
old: make(map[uint64]*SegmentSnapshot),
|
||||
oldNewDocNums: make(map[uint64][]uint64),
|
||||
new: segment,
|
||||
new: seg,
|
||||
notify: make(chan *IndexSnapshot, 1),
|
||||
}
|
||||
|
||||
|
@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
|||
|
||||
select { // send to introducer
|
||||
case <-s.closeCh:
|
||||
_ = segment.DecRef()
|
||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
||||
_ = seg.DecRef()
|
||||
return nil, 0, segment.ErrClosed
|
||||
case s.merges <- sm:
|
||||
}
|
||||
|
||||
select { // wait for introduction to complete
|
||||
case <-s.closeCh:
|
||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
||||
return nil, 0, segment.ErrClosed
|
||||
case newSnapshot := <-sm.notify:
|
||||
return numDocs, newSnapshot, newSegmentID, nil
|
||||
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
|
||||
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
|
||||
return newSnapshot, newSegmentID, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) {
|
||||
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten)
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package mergeplan
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 {
|
|||
return o.FloorSegmentSize
|
||||
}
|
||||
|
||||
// Suggested default options.
|
||||
// MaxSegmentSizeLimit represents the maximum size of a segment,
|
||||
// this limit comes with hit-1 optimisation/max encoding limit uint31.
|
||||
const MaxSegmentSizeLimit = 1<<31 - 1
|
||||
|
||||
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment
|
||||
// exceeds the MaxSegmentSizeLimit
|
||||
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit")
|
||||
|
||||
// DefaultMergePlanOptions suggests the default options.
|
||||
var DefaultMergePlanOptions = MergePlanOptions{
|
||||
MaxSegmentsPerTier: 10,
|
||||
MaxSegmentSize: 5000000,
|
||||
|
@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
|
|||
if len(roster) > 0 {
|
||||
rosterScore := scoreSegments(roster, o)
|
||||
|
||||
if len(bestRoster) <= 0 || rosterScore < bestRosterScore {
|
||||
if len(bestRoster) == 0 || rosterScore < bestRosterScore {
|
||||
bestRoster = roster
|
||||
bestRosterScore = rosterScore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(bestRoster) <= 0 {
|
||||
if len(bestRoster) == 0 {
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
|
@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan)
|
|||
|
||||
return strings.Join(rv, "\n")
|
||||
}
|
||||
|
||||
// ValidateMergePlannerOptions validates the merge planner options
|
||||
func ValidateMergePlannerOptions(options *MergePlanOptions) error {
|
||||
if options.MaxSegmentSize > MaxSegmentSizeLimit {
|
||||
return ErrMaxSegmentSizeTooLarge
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,420 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
)
|
||||
|
||||
var OptimizeConjunction = true
|
||||
var OptimizeConjunctionUnadorned = true
|
||||
var OptimizeDisjunctionUnadorned = true
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if OptimizeConjunction && kind == "conjunction" {
|
||||
return s.optimizeConjunction(octx)
|
||||
}
|
||||
|
||||
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
|
||||
return s.optimizeConjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
|
||||
return s.optimizeDisjunctionUnadorned(octx)
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunction)
|
||||
if !ok {
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunction struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr0.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr1.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
|
||||
|
||||
for _, tfr := range o.tfrs[2:] {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok || itr.ActualBM == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
bm.And(itr.ActualBM)
|
||||
}
|
||||
|
||||
// in this conjunction optimization, the postings iterators
|
||||
// will all share the same AND'ed together actual bitmap. The
|
||||
// regular conjunction searcher machinery will still be used,
|
||||
// but the underlying bitmap will be smaller.
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if ok && itr.ActualBM != nil {
|
||||
itr.ActualBM = bm
|
||||
itr.Actual = bm.Iterator()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" conjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRConjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRConjunctionUnadornedTerm = []byte("<conjunction:unadorned>")
|
||||
var OptimizeTFRConjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned conjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps AND'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRConjunctionUnadornedTerm,
|
||||
field: OptimizeTFRConjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
OUTER:
|
||||
for i := range o.snapshot.segment {
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
var docNum1HitLast uint64
|
||||
var docNum1HitLastOk bool
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
|
||||
// An empty postings iterator means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
// We optimize zap postings iterators only.
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// If the postings iterator is "1-hit" optimized, then we
|
||||
// can perform several optimizations up-front here.
|
||||
docNum1Hit, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
if docNum1Hit == zap.DocNum1HitFinished {
|
||||
// An empty docNum here means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
|
||||
// The docNum1Hit doesn't match the previous
|
||||
// docNum1HitLast, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
docNum1HitLast = docNum1Hit
|
||||
docNum1HitLastOk = true
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM == nil {
|
||||
// An empty actual bitmap means the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Collect the actual bitmap for more processing later.
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
|
||||
if docNum1HitLastOk {
|
||||
// We reach here if all the 1-hit optimized posting
|
||||
// iterators had the same 1-hit docNum, so we can check if
|
||||
// our collected actual bitmaps also have that docNum.
|
||||
for _, bm := range actualBMs {
|
||||
if !bm.Contains(uint32(docNum1HitLast)) {
|
||||
// The docNum1Hit isn't in one of our actual
|
||||
// bitmaps, so the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
||||
// The actual bitmaps and docNum1Hits all contain or have
|
||||
// the same 1-hit docNum, so that's our AND'ed result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
|
||||
docNum1HitLast, zap.NormBits1Hit, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 0 {
|
||||
// If we've collected no actual bitmaps at this point,
|
||||
// then the entire AND is empty.
|
||||
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
if len(actualBMs) == 1 {
|
||||
// If we've only 1 actual bitmap, then that's our result.
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
actualBMs[0], false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
continue OUTER
|
||||
}
|
||||
|
||||
// Else, AND together our collected bitmaps as our result.
|
||||
bm := roaring.And(actualBMs[0], actualBMs[1])
|
||||
|
||||
for _, actualBM := range actualBMs[2:] {
|
||||
bm.And(actualBM)
|
||||
}
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||
bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// An "unadorned" disjunction optimization is appropriate when
|
||||
// additional or subsidiary information like freq-norm's and
|
||||
// term-vectors are not required, and instead only the internal-id's
|
||||
// are needed.
|
||||
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
|
||||
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||
if octx == nil {
|
||||
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
|
||||
}
|
||||
|
||||
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if o.snapshot != s.snapshot {
|
||||
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
|
||||
}
|
||||
|
||||
o.tfrs = append(o.tfrs, s)
|
||||
|
||||
return o, nil
|
||||
}
|
||||
|
||||
type OptimizeTFRDisjunctionUnadorned struct {
|
||||
snapshot *IndexSnapshot
|
||||
|
||||
tfrs []*IndexSnapshotTermFieldReader
|
||||
}
|
||||
|
||||
var OptimizeTFRDisjunctionUnadornedTerm = []byte("<disjunction:unadorned>")
|
||||
var OptimizeTFRDisjunctionUnadornedField = "*"
|
||||
|
||||
// Finish of an unadorned disjunction optimization will compute a
|
||||
// termFieldReader with an "actual" bitmap that represents the
|
||||
// constituent bitmaps OR'ed together. This termFieldReader cannot
|
||||
// provide any freq-norm or termVector associated information.
|
||||
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||
if len(o.tfrs) <= 1 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
var cMax uint64
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
c := itr.ActualBM.GetCardinality()
|
||||
if cMax < c {
|
||||
cMax = c
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic to skip the optimization if all the constituent
|
||||
// bitmaps are too small, where the processing & resource
|
||||
// overhead to create the OR'ed bitmap outweighs the benefit.
|
||||
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
// We use an artificial term and field because the optimized
|
||||
// termFieldReader can represent multiple terms and fields.
|
||||
oTFR := &IndexSnapshotTermFieldReader{
|
||||
term: OptimizeTFRDisjunctionUnadornedTerm,
|
||||
field: OptimizeTFRDisjunctionUnadornedField,
|
||||
snapshot: o.snapshot,
|
||||
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||
segmentOffset: 0,
|
||||
includeFreq: false,
|
||||
includeNorm: false,
|
||||
includeTermVectors: false,
|
||||
}
|
||||
|
||||
var docNums []uint32 // Collected docNum's from 1-hit posting lists.
|
||||
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||
|
||||
for i := range o.snapshot.segment {
|
||||
docNums = docNums[:0]
|
||||
actualBMs = actualBMs[:0]
|
||||
|
||||
for _, tfr := range o.tfrs {
|
||||
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
docNum, ok := itr.DocNum1Hit()
|
||||
if ok {
|
||||
docNums = append(docNums, uint32(docNum))
|
||||
continue
|
||||
}
|
||||
|
||||
if itr.ActualBM != nil {
|
||||
actualBMs = append(actualBMs, itr.ActualBM)
|
||||
}
|
||||
}
|
||||
|
||||
var bm *roaring.Bitmap
|
||||
if len(actualBMs) > 2 {
|
||||
bm = roaring.HeapOr(actualBMs...)
|
||||
} else if len(actualBMs) == 2 {
|
||||
bm = roaring.Or(actualBMs[0], actualBMs[1])
|
||||
} else if len(actualBMs) == 1 {
|
||||
bm = actualBMs[0].Clone()
|
||||
}
|
||||
|
||||
if bm == nil {
|
||||
bm = roaring.New()
|
||||
}
|
||||
|
||||
bm.AddMany(docNums)
|
||||
|
||||
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
|
||||
if err != nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
return oTFR, nil
|
||||
}
|
|
@ -16,9 +16,12 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
@ -27,23 +30,57 @@ import (
|
|||
"time"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
var DefaultChunkFactor uint32 = 1024
|
||||
|
||||
// Arbitrary number, need to make it configurable.
|
||||
// Lower values like 10/making persister really slow
|
||||
// doesn't work well as it is creating more files to
|
||||
// persist for in next persist iteration and spikes the # FDs.
|
||||
// Ideal value should let persister also proceed at
|
||||
// an optimum pace so that the merger can skip
|
||||
// many intermediate snapshots.
|
||||
// This needs to be based on empirical data.
|
||||
// TODO - may need to revisit this approach/value.
|
||||
var epochDistance = uint64(5)
|
||||
// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct
|
||||
// persistence of segments with the default safe batch option.
|
||||
// If the default safe batch option results in high number of
|
||||
// files on disk, then users may initialise this configuration parameter
|
||||
// with higher values so that the persister will nap a bit within it's
|
||||
// work loop to favour better in-memory merging of segments to result
|
||||
// in fewer segment files on disk. But that may come with an indexing
|
||||
// performance overhead.
|
||||
// Unsafe batch users are advised to override this to higher value
|
||||
// for better performance especially with high data density.
|
||||
var DefaultPersisterNapTimeMSec int = 0 // ms
|
||||
|
||||
// DefaultPersisterNapUnderNumFiles helps in controlling the pace of
|
||||
// persister. At times of a slow merger progress with heavy file merging
|
||||
// operations, its better to pace down the persister for letting the merger
|
||||
// to catch up within a range defined by this parameter.
|
||||
// Fewer files on disk (as per the merge plan) would result in keeping the
|
||||
// file handle usage under limit, faster disk merger and a healthier index.
|
||||
// Its been observed that such a loosely sync'ed introducer-persister-merger
|
||||
// trio results in better overall performance.
|
||||
var DefaultPersisterNapUnderNumFiles int = 1000
|
||||
|
||||
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
|
||||
|
||||
type persisterOptions struct {
|
||||
// PersisterNapTimeMSec controls the wait/delay injected into
|
||||
// persistence workloop to improve the chances for
|
||||
// a healthier and heavier in-memory merging
|
||||
PersisterNapTimeMSec int
|
||||
|
||||
// PersisterNapTimeMSec > 0, and the number of files is less than
|
||||
// PersisterNapUnderNumFiles, then the persister will sleep
|
||||
// PersisterNapTimeMSec amount of time to improve the chances for
|
||||
// a healthier and heavier in-memory merging
|
||||
PersisterNapUnderNumFiles int
|
||||
|
||||
// MemoryPressurePauseThreshold let persister to have a better leeway
|
||||
// for prudently performing the memory merge of segments on a memory
|
||||
// pressure situation. Here the config value is an upper threshold
|
||||
// for the number of paused application threads. The default value would
|
||||
// be a very high number to always favour the merging of memory segments.
|
||||
MemoryPressurePauseThreshold uint64
|
||||
}
|
||||
|
||||
type notificationChan chan struct{}
|
||||
|
||||
|
@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() {
|
|||
var persistWatchers []*epochWatcher
|
||||
var lastPersistedEpoch, lastMergedEpoch uint64
|
||||
var ew *epochWatcher
|
||||
po, err := s.parsePersisterOptions()
|
||||
if err != nil {
|
||||
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
|
||||
s.asyncTasks.Done()
|
||||
return
|
||||
}
|
||||
|
||||
OUTER:
|
||||
for {
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
|
@ -65,11 +111,13 @@ OUTER:
|
|||
if ew != nil && ew.epoch > lastMergedEpoch {
|
||||
lastMergedEpoch = ew.epoch
|
||||
}
|
||||
persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
||||
&lastMergedEpoch, persistWatchers)
|
||||
|
||||
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
||||
lastMergedEpoch, persistWatchers, po)
|
||||
|
||||
var ourSnapshot *IndexSnapshot
|
||||
var ourPersisted []chan error
|
||||
var ourPersistedCallbacks []index.BatchCallback
|
||||
|
||||
// check to see if there is a new snapshot to persist
|
||||
s.rootLock.Lock()
|
||||
|
@ -78,13 +126,17 @@ OUTER:
|
|||
ourSnapshot.AddRef()
|
||||
ourPersisted = s.rootPersisted
|
||||
s.rootPersisted = nil
|
||||
ourPersistedCallbacks = s.persistedCallbacks
|
||||
s.persistedCallbacks = nil
|
||||
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
|
||||
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
|
||||
}
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if ourSnapshot != nil {
|
||||
startTime := time.Now()
|
||||
|
||||
err := s.persistSnapshot(ourSnapshot)
|
||||
err := s.persistSnapshot(ourSnapshot, po)
|
||||
for _, ch := range ourPersisted {
|
||||
if err != nil {
|
||||
ch <- err
|
||||
|
@ -92,10 +144,22 @@ OUTER:
|
|||
close(ch)
|
||||
}
|
||||
if err != nil {
|
||||
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
|
||||
if err == segment.ErrClosed {
|
||||
// index has been closed
|
||||
_ = ourSnapshot.DecRef()
|
||||
break OUTER
|
||||
}
|
||||
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
|
||||
_ = ourSnapshot.DecRef()
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
|
||||
continue OUTER
|
||||
}
|
||||
for i := range ourPersistedCallbacks {
|
||||
ourPersistedCallbacks[i](err)
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
|
||||
|
||||
lastPersistedEpoch = ourSnapshot.epoch
|
||||
for _, ew := range persistWatchers {
|
||||
|
@ -115,6 +179,8 @@ OUTER:
|
|||
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
|
||||
|
||||
if changed {
|
||||
s.removeOldData()
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
|
@ -133,17 +199,21 @@ OUTER:
|
|||
|
||||
s.removeOldData() // might as well cleanup while waiting
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
case <-w.notifyCh:
|
||||
// woken up, next loop should pick up work
|
||||
continue OUTER
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
|
||||
case ew = <-s.persisterNotifier:
|
||||
// if the watchers are already caught up then let them wait,
|
||||
// else let them continue to do the catch up
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,32 +230,88 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
|
|||
return watchersNext
|
||||
}
|
||||
|
||||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64,
|
||||
persistWatchers []*epochWatcher) []*epochWatcher {
|
||||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
|
||||
persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) {
|
||||
|
||||
// first, let the watchers proceed if they lag behind
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
|
||||
// check the merger lag by counting the segment files on disk,
|
||||
// On finding fewer files on disk, persister takes a short pause
|
||||
// for sufficient in-memory segments to pile up for the next
|
||||
// memory merge cum persist loop.
|
||||
// On finding too many files on disk, persister pause until the merger
|
||||
// catches up to reduce the segment file count under the threshold.
|
||||
// But if there is memory pressure, then skip this sleep maneuvers.
|
||||
numFilesOnDisk, _ := s.diskFileStats()
|
||||
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
|
||||
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
|
||||
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
|
||||
|
||||
case ew := <-s.persisterNotifier:
|
||||
// unblock the merger in meantime
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
lastMergedEpoch = ew.epoch
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
|
||||
}
|
||||
return lastMergedEpoch, persistWatchers
|
||||
}
|
||||
|
||||
OUTER:
|
||||
// check for slow merger and await until the merger catch up
|
||||
for lastPersistedEpoch > *lastMergedEpoch+epochDistance {
|
||||
for po.PersisterNapUnderNumFiles > 0 &&
|
||||
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
|
||||
lastMergedEpoch < lastPersistedEpoch {
|
||||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
break OUTER
|
||||
case ew := <-s.persisterNotifier:
|
||||
persistWatchers = append(persistWatchers, ew)
|
||||
*lastMergedEpoch = ew.epoch
|
||||
lastMergedEpoch = ew.epoch
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
|
||||
|
||||
// let the watchers proceed if they lag behind
|
||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||
|
||||
numFilesOnDisk, _ = s.diskFileStats()
|
||||
}
|
||||
|
||||
return persistWatchers
|
||||
return lastMergedEpoch, persistWatchers
|
||||
}
|
||||
|
||||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
|
||||
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
|
||||
po := persisterOptions{
|
||||
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
|
||||
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
|
||||
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
|
||||
}
|
||||
if v, ok := s.config["scorchPersisterOptions"]; ok {
|
||||
b, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return &po, err
|
||||
}
|
||||
|
||||
err = json.Unmarshal(b, &po)
|
||||
if err != nil {
|
||||
return &po, err
|
||||
}
|
||||
}
|
||||
return &po, nil
|
||||
}
|
||||
|
||||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
|
||||
po *persisterOptions) error {
|
||||
// Perform in-memory segment merging only when the memory pressure is
|
||||
// below the configured threshold, else the persister performs the
|
||||
// direct persistence of segments.
|
||||
if s.paused() < po.MemoryPressurePauseThreshold {
|
||||
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -193,6 +319,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
|
|||
if persisted {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
return s.persistSnapshotDirect(snapshot)
|
||||
}
|
||||
|
@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
|||
return false, nil
|
||||
}
|
||||
|
||||
_, newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
||||
newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
||||
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
|
||||
if err != nil {
|
||||
return false, err
|
||||
|
@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
|||
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
|
||||
internal: snapshot.internal,
|
||||
epoch: snapshot.epoch,
|
||||
creator: "persistSnapshotMaybeMerge",
|
||||
}
|
||||
|
||||
// copy to the equiv the segments that weren't replaced
|
||||
|
@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
|||
return err
|
||||
}
|
||||
|
||||
// persist meta values
|
||||
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
buf := make([]byte, binary.MaxVarintLen32)
|
||||
binary.BigEndian.PutUint32(buf, zap.Version)
|
||||
err = metaBucket.Put([]byte("version"), buf)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// persist internal values
|
||||
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
|
||||
if err != nil {
|
||||
|
@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
|||
}
|
||||
}
|
||||
|
||||
s.rootLock.Lock()
|
||||
newIndexSnapshot := &IndexSnapshot{
|
||||
parent: s,
|
||||
epoch: s.nextSnapshotEpoch,
|
||||
segment: make([]*SegmentSnapshot, len(s.root.segment)),
|
||||
offsets: make([]uint64, len(s.root.offsets)),
|
||||
internal: make(map[string][]byte, len(s.root.internal)),
|
||||
refs: 1,
|
||||
}
|
||||
s.nextSnapshotEpoch++
|
||||
for i, segmentSnapshot := range s.root.segment {
|
||||
// see if this segment has been replaced
|
||||
if replacement, ok := newSegments[segmentSnapshot.id]; ok {
|
||||
newSegmentSnapshot := &SegmentSnapshot{
|
||||
id: segmentSnapshot.id,
|
||||
segment: replacement,
|
||||
deleted: segmentSnapshot.deleted,
|
||||
cachedDocs: segmentSnapshot.cachedDocs,
|
||||
}
|
||||
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
||||
delete(newSegments, segmentSnapshot.id)
|
||||
// update items persisted incase of a new segment snapshot
|
||||
atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count())
|
||||
} else {
|
||||
newIndexSnapshot.segment[i] = s.root.segment[i]
|
||||
newIndexSnapshot.segment[i].segment.AddRef()
|
||||
}
|
||||
newIndexSnapshot.offsets[i] = s.root.offsets[i]
|
||||
}
|
||||
for k, v := range s.root.internal {
|
||||
newIndexSnapshot.internal[k] = v
|
||||
persist := &persistIntroduction{
|
||||
persisted: newSegments,
|
||||
applied: make(notificationChan),
|
||||
}
|
||||
|
||||
rootPrev := s.root
|
||||
s.root = newIndexSnapshot
|
||||
s.rootLock.Unlock()
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return segment.ErrClosed
|
||||
case s.persists <- persist:
|
||||
}
|
||||
|
||||
select {
|
||||
case <-s.closeCh:
|
||||
return segment.ErrClosed
|
||||
case <-persist.applied:
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'}
|
|||
var boltPathKey = []byte{'p'}
|
||||
var boltDeletedKey = []byte{'d'}
|
||||
var boltInternalKey = []byte{'i'}
|
||||
var boltMetaDataKey = []byte{'m'}
|
||||
|
||||
func (s *Scorch) loadFromBolt() error {
|
||||
return s.rootBolt.View(func(tx *bolt.Tx) error {
|
||||
|
@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error {
|
|||
continue
|
||||
}
|
||||
if foundRoot {
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
snapshot := snapshots.Bucket(k)
|
||||
if snapshot == nil {
|
||||
log.Printf("snapshot key, but bucket missing %x, continuing", k)
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
indexSnapshot, err := s.loadSnapshot(snapshot)
|
||||
if err != nil {
|
||||
log.Printf("unable to load snapshot, %v, continuing", err)
|
||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
||||
s.AddEligibleForRemoval(snapshotEpoch)
|
||||
continue
|
||||
}
|
||||
indexSnapshot.epoch = snapshotEpoch
|
||||
|
@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error {
|
|||
return err
|
||||
}
|
||||
s.nextSegmentID++
|
||||
s.nextSnapshotEpoch = snapshotEpoch + 1
|
||||
s.rootLock.Lock()
|
||||
if s.root != nil {
|
||||
_ = s.root.DecRef()
|
||||
}
|
||||
s.nextSnapshotEpoch = snapshotEpoch + 1
|
||||
rootPrev := s.root
|
||||
s.root = indexSnapshot
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if rootPrev != nil {
|
||||
_ = rootPrev.DecRef()
|
||||
}
|
||||
|
||||
foundRoot = true
|
||||
}
|
||||
return nil
|
||||
|
@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
|||
snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
|
||||
snapshot := snapshots.Bucket(snapshotKey)
|
||||
if snapshot == nil {
|
||||
return nil
|
||||
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
|
||||
}
|
||||
rv, err = s.loadSnapshot(snapshot)
|
||||
return err
|
||||
|
@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
|||
}
|
||||
|
||||
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||
|
||||
rv := &IndexSnapshot{
|
||||
parent: s,
|
||||
internal: make(map[string][]byte),
|
||||
refs: 1,
|
||||
creator: "loadSnapshot",
|
||||
}
|
||||
|
||||
var running uint64
|
||||
c := snapshot.Cursor()
|
||||
for k, _ := c.First(); k != nil; k, _ = c.Next() {
|
||||
|
@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
|||
_ = rv.DecRef()
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
} else if k[0] != boltMetaDataKey[0] {
|
||||
segmentBucket := snapshot.Bucket(k)
|
||||
if segmentBucket == nil {
|
||||
_ = rv.DecRef()
|
||||
|
@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
|||
running += segmentSnapshot.segment.Count()
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
|
@ -604,8 +731,10 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
|
|||
_ = segment.Close()
|
||||
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
|
||||
}
|
||||
if !deletedBitmap.IsEmpty() {
|
||||
rv.deleted = deletedBitmap
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
|||
return 0, err
|
||||
}
|
||||
|
||||
if len(persistedEpochs) <= NumSnapshotsToKeep {
|
||||
if len(persistedEpochs) <= s.numSnapshotsToKeep {
|
||||
// we need to keep everything
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// make a map of epochs to protect from deletion
|
||||
protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep)
|
||||
for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] {
|
||||
protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
|
||||
for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
|
||||
protectedEpochs[epoch] = struct{}{}
|
||||
}
|
||||
|
||||
|
@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
|||
s.eligibleForRemoval = newEligible
|
||||
s.rootLock.Unlock()
|
||||
|
||||
if len(epochsToRemove) <= 0 {
|
||||
if len(epochsToRemove) == 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -1,110 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package scorch
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
|
||||
}
|
||||
|
||||
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
|
||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
|
||||
}
|
||||
|
||||
// DocIDReader returns an iterator over all doc ids
|
||||
// The caller must close returned instance to release associated resources.
|
||||
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderAll()
|
||||
}
|
||||
|
||||
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
|
||||
return r.root.DocIDReaderOnly(ids)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
|
||||
return r.root.FieldDict(field)
|
||||
}
|
||||
|
||||
// FieldDictRange is currently defined to include the start and end terms
|
||||
func (r *Reader) FieldDictRange(field string, startTerm []byte,
|
||||
endTerm []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictRange(field, startTerm, endTerm)
|
||||
}
|
||||
|
||||
func (r *Reader) FieldDictPrefix(field string,
|
||||
termPrefix []byte) (index.FieldDict, error) {
|
||||
return r.root.FieldDictPrefix(field, termPrefix)
|
||||
}
|
||||
|
||||
func (r *Reader) Document(id string) (*document.Document, error) {
|
||||
return r.root.Document(id)
|
||||
}
|
||||
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
return r.root.DocumentVisitFieldTerms(id, fields, visitor)
|
||||
}
|
||||
|
||||
func (r *Reader) Fields() ([]string, error) {
|
||||
return r.root.Fields()
|
||||
}
|
||||
|
||||
func (r *Reader) GetInternal(key []byte) ([]byte, error) {
|
||||
return r.root.GetInternal(key)
|
||||
}
|
||||
|
||||
func (r *Reader) DocCount() (uint64, error) {
|
||||
return r.root.DocCount()
|
||||
}
|
||||
|
||||
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
|
||||
return r.root.ExternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
|
||||
return r.root.InternalID(id)
|
||||
}
|
||||
|
||||
func (r *Reader) DumpAll() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpDoc(id string) chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) DumpFields() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (r *Reader) Close() error {
|
||||
return r.root.DecRef()
|
||||
}
|
|
@ -17,6 +17,7 @@ package scorch
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
@ -27,23 +28,24 @@ import (
|
|||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
const Name = "scorch"
|
||||
|
||||
const Version uint8 = 1
|
||||
const Version uint8 = 2
|
||||
|
||||
var ErrClosed = fmt.Errorf("scorch closed")
|
||||
|
||||
type Scorch struct {
|
||||
readOnly bool
|
||||
version uint8
|
||||
config map[string]interface{}
|
||||
analysisQueue *index.AnalysisQueue
|
||||
stats *Stats
|
||||
stats Stats
|
||||
nextSegmentID uint64
|
||||
path string
|
||||
|
||||
|
@ -52,12 +54,15 @@ type Scorch struct {
|
|||
rootLock sync.RWMutex
|
||||
root *IndexSnapshot // holds 1 ref-count on the root
|
||||
rootPersisted []chan error // closed when root is persisted
|
||||
persistedCallbacks []index.BatchCallback
|
||||
nextSnapshotEpoch uint64
|
||||
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
|
||||
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
|
||||
|
||||
numSnapshotsToKeep int
|
||||
closeCh chan struct{}
|
||||
introductions chan *segmentIntroduction
|
||||
persists chan *persistIntroduction
|
||||
merges chan *segmentMerge
|
||||
introducerNotifier chan *epochWatcher
|
||||
revertToSnapshots chan *snapshotReversion
|
||||
|
@ -67,6 +72,23 @@ type Scorch struct {
|
|||
|
||||
onEvent func(event Event)
|
||||
onAsyncError func(err error)
|
||||
|
||||
iStats internalStats
|
||||
|
||||
pauseLock sync.RWMutex
|
||||
|
||||
pauseCount uint64
|
||||
}
|
||||
|
||||
type internalStats struct {
|
||||
persistEpoch uint64
|
||||
persistSnapshotSize uint64
|
||||
mergeEpoch uint64
|
||||
mergeSnapshotSize uint64
|
||||
newSegBufBytesAdded uint64
|
||||
newSegBufBytesRemoved uint64
|
||||
analysisBytesAdded uint64
|
||||
analysisBytesRemoved uint64
|
||||
}
|
||||
|
||||
func NewScorch(storeName string,
|
||||
|
@ -80,8 +102,7 @@ func NewScorch(storeName string,
|
|||
closeCh: make(chan struct{}),
|
||||
ineligibleForRemoval: map[string]bool{},
|
||||
}
|
||||
rv.stats = &Stats{i: rv}
|
||||
rv.root = &IndexSnapshot{parent: rv, refs: 1}
|
||||
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
|
||||
ro, ok := config["read_only"].(bool)
|
||||
if ok {
|
||||
rv.readOnly = ro
|
||||
|
@ -101,9 +122,30 @@ func NewScorch(storeName string,
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *Scorch) paused() uint64 {
|
||||
s.pauseLock.Lock()
|
||||
pc := s.pauseCount
|
||||
s.pauseLock.Unlock()
|
||||
return pc
|
||||
}
|
||||
|
||||
func (s *Scorch) incrPause() {
|
||||
s.pauseLock.Lock()
|
||||
s.pauseCount++
|
||||
s.pauseLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) decrPause() {
|
||||
s.pauseLock.Lock()
|
||||
s.pauseCount--
|
||||
s.pauseLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
|
||||
if s.onEvent != nil {
|
||||
s.incrPause()
|
||||
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
||||
s.decrPause()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) {
|
|||
if s.onAsyncError != nil {
|
||||
s.onAsyncError(err)
|
||||
}
|
||||
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||
}
|
||||
|
||||
func (s *Scorch) Open() error {
|
||||
|
@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error {
|
|||
}
|
||||
}
|
||||
|
||||
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment)))
|
||||
|
||||
s.introductions = make(chan *segmentIntroduction)
|
||||
s.persists = make(chan *persistIntroduction)
|
||||
s.merges = make(chan *segmentMerge)
|
||||
s.introducerNotifier = make(chan *epochWatcher, 1)
|
||||
s.revertToSnapshots = make(chan *snapshotReversion)
|
||||
|
@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error {
|
|||
}
|
||||
}
|
||||
|
||||
s.numSnapshotsToKeep = NumSnapshotsToKeep
|
||||
if v, ok := s.config["numSnapshotsToKeep"]; ok {
|
||||
var t int
|
||||
if t, err = parseToInteger(v); err != nil {
|
||||
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err)
|
||||
}
|
||||
if t > 0 {
|
||||
s.numSnapshotsToKeep = t
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -255,6 +312,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
|
|||
|
||||
// FIXME could sort ids list concurrent with analysis?
|
||||
|
||||
if len(batch.IndexOps) > 0 {
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
|
@ -264,47 +322,63 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
|
|||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// wait for analysis result
|
||||
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
|
||||
var itemsDeQueued uint64
|
||||
var totalAnalysisSize int
|
||||
for itemsDeQueued < numUpdates {
|
||||
result := <-resultChan
|
||||
resultSize := result.Size()
|
||||
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
|
||||
totalAnalysisSize += resultSize
|
||||
analysisResults[itemsDeQueued] = result
|
||||
itemsDeQueued++
|
||||
}
|
||||
close(resultChan)
|
||||
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
|
||||
|
||||
atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start)))
|
||||
atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start)))
|
||||
|
||||
indexStart := time.Now()
|
||||
|
||||
// notify handlers that we're about to introduce a segment
|
||||
s.fireEvent(EventKindBatchIntroductionStart, 0)
|
||||
|
||||
var newSegment segment.Segment
|
||||
var bufBytes uint64
|
||||
if len(analysisResults) > 0 {
|
||||
newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor)
|
||||
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
|
||||
} else {
|
||||
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
|
||||
}
|
||||
|
||||
err = s.prepareSegment(newSegment, ids, batch.InternalOps)
|
||||
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
|
||||
if err != nil {
|
||||
if newSegment != nil {
|
||||
_ = newSegment.Close()
|
||||
}
|
||||
atomic.AddUint64(&s.stats.errors, 1)
|
||||
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||
} else {
|
||||
atomic.AddUint64(&s.stats.updates, numUpdates)
|
||||
atomic.AddUint64(&s.stats.deletes, numDeletes)
|
||||
atomic.AddUint64(&s.stats.batches, 1)
|
||||
atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes)
|
||||
atomic.AddUint64(&s.stats.TotUpdates, numUpdates)
|
||||
atomic.AddUint64(&s.stats.TotDeletes, numDeletes)
|
||||
atomic.AddUint64(&s.stats.TotBatches, 1)
|
||||
atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes)
|
||||
}
|
||||
|
||||
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes)
|
||||
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart)))
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||
internalOps map[string][]byte) error {
|
||||
internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
|
||||
|
||||
// new introduction
|
||||
introduction := &segmentIntroduction{
|
||||
|
@ -314,6 +388,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
obsoletes: make(map[uint64]*roaring.Bitmap),
|
||||
internal: internalOps,
|
||||
applied: make(chan error),
|
||||
persistedCallback: persistedCallback,
|
||||
}
|
||||
|
||||
if !s.unsafeBatch {
|
||||
|
@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
root.AddRef()
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
defer func() { _ = root.DecRef() }()
|
||||
|
||||
for _, seg := range root.segment {
|
||||
delta, err := seg.segment.DocNumbers(ids)
|
||||
if err != nil {
|
||||
|
@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
introduction.obsoletes[seg.id] = delta
|
||||
}
|
||||
|
||||
_ = root.DecRef()
|
||||
introStartTime := time.Now()
|
||||
|
||||
s.introductions <- introduction
|
||||
|
||||
|
@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
|||
err = <-introduction.persisted
|
||||
}
|
||||
|
||||
introTime := uint64(time.Since(introStartTime))
|
||||
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime)
|
||||
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime {
|
||||
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime)
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error {
|
|||
// Reader returns a low-level accessor on the index data. Close it to
|
||||
// release associated resources.
|
||||
func (s *Scorch) Reader() (index.IndexReader, error) {
|
||||
return s.currentSnapshot(), nil
|
||||
}
|
||||
|
||||
func (s *Scorch) currentSnapshot() *IndexSnapshot {
|
||||
s.rootLock.RLock()
|
||||
rv := &Reader{root: s.root}
|
||||
rv.root.AddRef()
|
||||
rv := s.root
|
||||
if rv != nil {
|
||||
rv.AddRef()
|
||||
}
|
||||
s.rootLock.RUnlock()
|
||||
return rv, nil
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *Scorch) Stats() json.Marshaler {
|
||||
return s.stats
|
||||
return &s.stats
|
||||
}
|
||||
|
||||
func (s *Scorch) diskFileStats() (uint64, uint64) {
|
||||
var numFilesOnDisk, numBytesUsedDisk uint64
|
||||
if s.path != "" {
|
||||
finfos, err := ioutil.ReadDir(s.path)
|
||||
if err == nil {
|
||||
for _, finfo := range finfos {
|
||||
if !finfo.IsDir() {
|
||||
numBytesUsedDisk += uint64(finfo.Size())
|
||||
numFilesOnDisk++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return numFilesOnDisk, numBytesUsedDisk
|
||||
}
|
||||
|
||||
func (s *Scorch) StatsMap() map[string]interface{} {
|
||||
m, _ := s.stats.statsMap()
|
||||
m := s.stats.ToMap()
|
||||
|
||||
numFilesOnDisk, numBytesUsedDisk := s.diskFileStats()
|
||||
|
||||
m["CurOnDiskBytes"] = numBytesUsedDisk
|
||||
m["CurOnDiskFiles"] = numFilesOnDisk
|
||||
|
||||
// TODO: consider one day removing these backwards compatible
|
||||
// names for apps using the old names
|
||||
m["updates"] = m["TotUpdates"]
|
||||
m["deletes"] = m["TotDeletes"]
|
||||
m["batches"] = m["TotBatches"]
|
||||
m["errors"] = m["TotOnErrors"]
|
||||
m["analysis_time"] = m["TotAnalysisTime"]
|
||||
m["index_time"] = m["TotIndexTime"]
|
||||
m["term_searchers_started"] = m["TotTermSearchersStarted"]
|
||||
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
|
||||
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
|
||||
m["num_items_introduced"] = m["TotIntroducedItems"]
|
||||
m["num_items_persisted"] = m["TotPersistedItems"]
|
||||
m["num_recs_to_persist"] = m["TotItemsToPersist"]
|
||||
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
|
||||
m["num_files_on_disk"] = m["CurOnDiskFiles"]
|
||||
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
|
||||
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]
|
||||
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"]
|
||||
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
|
||||
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
|
@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
|
|||
rv.Analyzed[i] = tokenFreqs
|
||||
rv.Length[i] = fieldLength
|
||||
|
||||
if len(d.CompositeFields) > 0 {
|
||||
if len(d.CompositeFields) > 0 && field.Name() != "_id" {
|
||||
// see if any of the composite fields need this
|
||||
for _, compositeField := range d.CompositeFields {
|
||||
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
|
||||
|
@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
|
|||
s.rootLock.Unlock()
|
||||
}
|
||||
|
||||
func (s *Scorch) MemoryUsed() uint64 {
|
||||
var memUsed uint64
|
||||
s.rootLock.RLock()
|
||||
if s.root != nil {
|
||||
for _, segmentSnapshot := range s.root.segment {
|
||||
memUsed += 8 /* size of id -> uint64 */ +
|
||||
segmentSnapshot.segment.SizeInBytes()
|
||||
if segmentSnapshot.deleted != nil {
|
||||
memUsed += segmentSnapshot.deleted.GetSizeInBytes()
|
||||
func (s *Scorch) MemoryUsed() (memUsed uint64) {
|
||||
indexSnapshot := s.currentSnapshot()
|
||||
if indexSnapshot == nil {
|
||||
return
|
||||
}
|
||||
memUsed += segmentSnapshot.cachedDocs.sizeInBytes()
|
||||
|
||||
defer func() {
|
||||
_ = indexSnapshot.Close()
|
||||
}()
|
||||
|
||||
// Account for current root snapshot overhead
|
||||
memUsed += uint64(indexSnapshot.Size())
|
||||
|
||||
// Account for snapshot that the persister may be working on
|
||||
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch)
|
||||
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize)
|
||||
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch {
|
||||
// the snapshot that the persister is working on isn't the same as
|
||||
// the current snapshot
|
||||
memUsed += persistSnapshotSize
|
||||
}
|
||||
|
||||
// Account for snapshot that the merger may be working on
|
||||
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch)
|
||||
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize)
|
||||
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch {
|
||||
// the snapshot that the merger is working on isn't the same as
|
||||
// the current snapshot
|
||||
memUsed += mergeSnapshotSize
|
||||
}
|
||||
s.rootLock.RUnlock()
|
||||
|
||||
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) -
|
||||
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved))
|
||||
|
||||
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) -
|
||||
atomic.LoadUint64(&s.iStats.analysisBytesRemoved))
|
||||
|
||||
return memUsed
|
||||
}
|
||||
|
||||
|
@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) {
|
|||
func init() {
|
||||
registry.RegisterIndexType(Name, NewScorch)
|
||||
}
|
||||
|
||||
func parseToInteger(i interface{}) (int, error) {
|
||||
switch v := i.(type) {
|
||||
case float64:
|
||||
return int(v), nil
|
||||
case int:
|
||||
return v, nil
|
||||
|
||||
default:
|
||||
return 0, fmt.Errorf("expects int or float64 value")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ package segment
|
|||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
)
|
||||
|
||||
type EmptySegment struct{}
|
||||
|
@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) DocID(num uint64) ([]byte, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) Count() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (e *EmptySegment) Size() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func (e *EmptySegment) AddRef() {
|
||||
}
|
||||
|
||||
|
@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error {
|
|||
|
||||
type EmptyDictionary struct{}
|
||||
|
||||
func (e *EmptyDictionary) PostingsList(term string,
|
||||
except *roaring.Bitmap) (PostingsList, error) {
|
||||
func (e *EmptyDictionary) PostingsList(term []byte,
|
||||
except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) {
|
||||
return &EmptyPostingsList{}, nil
|
||||
}
|
||||
|
||||
|
@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
|
|||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
|
||||
includeCount bool) DictionaryIterator {
|
||||
return &EmptyDictionaryIterator{}
|
||||
}
|
||||
|
||||
type EmptyDictionaryIterator struct{}
|
||||
|
||||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type EmptyPostingsList struct{}
|
||||
|
||||
func (e *EmptyPostingsList) Iterator() PostingsIterator {
|
||||
func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool,
|
||||
prealloc PostingsIterator) PostingsIterator {
|
||||
return &EmptyPostingsIterator{}
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsList) Size() int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsList) Count() uint64 {
|
||||
return 0
|
||||
}
|
||||
|
@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{}
|
|||
func (e *EmptyPostingsIterator) Next() (Posting, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (e *EmptyPostingsIterator) Size() int {
|
||||
return 0
|
||||
}
|
||||
|
||||
var AnEmptyPostingsIterator = &EmptyPostingsIterator{}
|
||||
|
|
|
@ -1,321 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
)
|
||||
|
||||
// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
|
||||
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
|
||||
s := New()
|
||||
|
||||
// ensure that _id field get fieldID 0
|
||||
s.getOrDefineField("_id")
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory
|
||||
s.initializeDict(results)
|
||||
|
||||
// walk each doc
|
||||
for _, result := range results {
|
||||
s.processDocument(result)
|
||||
}
|
||||
|
||||
// go back and sort the dictKeys
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
// compute memory usage of segment
|
||||
s.updateSizeInBytes()
|
||||
|
||||
// professional debugging
|
||||
//
|
||||
// log.Printf("fields: %v\n", s.FieldsMap)
|
||||
// log.Printf("fieldsInv: %v\n", s.FieldsInv)
|
||||
// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
|
||||
// log.Printf("dicts: %v\n", s.Dicts)
|
||||
// log.Printf("dict keys: %v\n", s.DictKeys)
|
||||
// for i, posting := range s.Postings {
|
||||
// log.Printf("posting %d: %v\n", i, posting)
|
||||
// }
|
||||
// for i, freq := range s.Freqs {
|
||||
// log.Printf("freq %d: %v\n", i, freq)
|
||||
// }
|
||||
// for i, norm := range s.Norms {
|
||||
// log.Printf("norm %d: %v\n", i, norm)
|
||||
// }
|
||||
// for i, field := range s.Locfields {
|
||||
// log.Printf("field %d: %v\n", i, field)
|
||||
// }
|
||||
// for i, start := range s.Locstarts {
|
||||
// log.Printf("start %d: %v\n", i, start)
|
||||
// }
|
||||
// for i, end := range s.Locends {
|
||||
// log.Printf("end %d: %v\n", i, end)
|
||||
// }
|
||||
// for i, pos := range s.Locpos {
|
||||
// log.Printf("pos %d: %v\n", i, pos)
|
||||
// }
|
||||
// for i, apos := range s.Locarraypos {
|
||||
// log.Printf("apos %d: %v\n", i, apos)
|
||||
// }
|
||||
// log.Printf("stored: %v\n", s.Stored)
|
||||
// log.Printf("stored types: %v\n", s.StoredTypes)
|
||||
// log.Printf("stored pos: %v\n", s.StoredPos)
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// fill Dicts/DictKeys and preallocate memory for postings
|
||||
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
|
||||
var numPostingsLists int
|
||||
|
||||
numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
||||
|
||||
var numTokenFrequencies int
|
||||
var totLocs int
|
||||
|
||||
// initial scan for all fieldID's to sort them
|
||||
for _, result := range results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := s.Dicts[fieldID][term]
|
||||
if !exists {
|
||||
numPostingsLists++
|
||||
pidPlus1 = uint64(numPostingsLists)
|
||||
s.Dicts[fieldID][term] = pidPlus1
|
||||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
|
||||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
|
||||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
|
||||
}
|
||||
pid := pidPlus1 - 1
|
||||
numTermsPerPostingsList[pid] += 1
|
||||
numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
numTokenFrequencies += len(tfs)
|
||||
}
|
||||
|
||||
for _, result := range results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
s.Postings = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.Postings[i] = roaring.New()
|
||||
}
|
||||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
s.PostingsLocs[i] = roaring.New()
|
||||
}
|
||||
|
||||
// Preallocate big, contiguous backing arrays.
|
||||
auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
|
||||
uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
|
||||
float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms.
|
||||
uint16Backing := make([]uint16, totLocs) // For sub-Locfields.
|
||||
|
||||
// Point top-level slices to the backing arrays.
|
||||
s.Freqs = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Norms = make([][]float32, numPostingsLists)
|
||||
|
||||
s.Locfields = make([][]uint16, numPostingsLists)
|
||||
|
||||
s.Locstarts = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locends = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locpos = auint64Backing[0:numPostingsLists]
|
||||
auint64Backing = auint64Backing[numPostingsLists:]
|
||||
|
||||
s.Locarraypos = make([][][]uint64, numPostingsLists)
|
||||
|
||||
// Point sub-slices to the backing arrays.
|
||||
for pid, numTerms := range numTermsPerPostingsList {
|
||||
s.Freqs[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numTerms:]
|
||||
|
||||
s.Norms[pid] = float32Backing[0:0]
|
||||
float32Backing = float32Backing[numTerms:]
|
||||
}
|
||||
|
||||
for pid, numLocs := range numLocsPerPostingsList {
|
||||
s.Locfields[pid] = uint16Backing[0:0]
|
||||
uint16Backing = uint16Backing[numLocs:]
|
||||
|
||||
s.Locstarts[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locends[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locpos[pid] = uint64Backing[0:0]
|
||||
uint64Backing = uint64Backing[numLocs:]
|
||||
|
||||
s.Locarraypos[pid] = auint64Backing[0:0]
|
||||
auint64Backing = auint64Backing[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) processDocument(result *index.AnalysisResult) {
|
||||
// used to collate information across fields
|
||||
docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
|
||||
fieldLens := make(map[uint16]int, len(s.FieldsMap))
|
||||
|
||||
docNum := uint64(s.addDocument())
|
||||
|
||||
processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[field] += l
|
||||
if existingFreqs, ok := docMap[field]; ok {
|
||||
existingFreqs.MergeAll(name, tf)
|
||||
} else {
|
||||
docMap[field] = tf
|
||||
}
|
||||
}
|
||||
|
||||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
|
||||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
|
||||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
|
||||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l, tf := field.Analyze()
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
l := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
processField(fieldID, field.Name(), l, tf)
|
||||
if field.Options().IsStored() {
|
||||
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
|
||||
}
|
||||
|
||||
if field.Options().IncludeDocValues() {
|
||||
s.DocValueFields[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
// now that its been rolled up into docMap, walk that
|
||||
for fieldID, tokenFrequencies := range docMap {
|
||||
for term, tokenFreq := range tokenFrequencies {
|
||||
pid := s.Dicts[fieldID][term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.AddInt(int(docNum))
|
||||
s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
|
||||
s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
||||
locationBS := s.PostingsLocs[pid]
|
||||
if len(tokenFreq.Locations) > 0 {
|
||||
locationBS.AddInt(int(docNum))
|
||||
for _, loc := range tokenFreq.Locations {
|
||||
var locf = fieldID
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
s.Locfields[pid] = append(s.Locfields[pid], locf)
|
||||
s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
|
||||
s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
|
||||
s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
|
||||
} else {
|
||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) getOrDefineField(name string) int {
|
||||
fieldIDPlus1, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[name] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, name)
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
||||
}
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
func (s *Segment) addDocument() int {
|
||||
docNum := len(s.Stored)
|
||||
s.Stored = append(s.Stored, map[uint16][][]byte{})
|
||||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
|
||||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
|
||||
return docNum
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
|
@ -1,103 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// Dictionary is the in-memory representation of the term dictionary
|
||||
type Dictionary struct {
|
||||
segment *Segment
|
||||
field string
|
||||
fieldID uint16
|
||||
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
|
||||
func (d *Dictionary) PostingsList(term string,
|
||||
except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
return &PostingsList{
|
||||
dictionary: d,
|
||||
term: term,
|
||||
postingsID: d.segment.Dicts[d.fieldID][term],
|
||||
except: except,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this dictionary
|
||||
func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
}
|
||||
|
||||
// PrefixIterator returns an iterator which only visits terms having the
|
||||
// the specified prefix
|
||||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
prefix: prefix,
|
||||
offset: offset,
|
||||
}
|
||||
}
|
||||
|
||||
// RangeIterator returns an iterator which only visits terms between the
|
||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
|
||||
return &DictionaryIterator{
|
||||
d: d,
|
||||
offset: offset,
|
||||
end: end,
|
||||
}
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
prefix string
|
||||
end string
|
||||
offset int
|
||||
|
||||
dictEntry index.DictEntry // reused across Next()'s
|
||||
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
|
||||
return nil, nil
|
||||
}
|
||||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
|
||||
// check prefix
|
||||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
|
||||
return nil, nil
|
||||
}
|
||||
// check end (bleve.index API demands inclusive end)
|
||||
if d.end != "" && next > d.end {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
d.offset++
|
||||
postingID := d.d.segment.Dicts[d.d.fieldID][next]
|
||||
d.dictEntry.Term = next
|
||||
d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
|
||||
return &d.dictEntry, nil
|
||||
}
|
|
@ -1,178 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// PostingsList is an in-memory represenation of a postings list
|
||||
type PostingsList struct {
|
||||
dictionary *Dictionary
|
||||
term string
|
||||
postingsID uint64
|
||||
except *roaring.Bitmap
|
||||
}
|
||||
|
||||
// Count returns the number of items on this postings list
|
||||
func (p *PostingsList) Count() uint64 {
|
||||
var rv uint64
|
||||
if p.postingsID > 0 {
|
||||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
|
||||
if p.except != nil {
|
||||
except := p.except.GetCardinality()
|
||||
if except > rv {
|
||||
// avoid underflow
|
||||
except = rv
|
||||
}
|
||||
rv -= except
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Iterator returns an iterator for this postings list
|
||||
func (p *PostingsList) Iterator() segment.PostingsIterator {
|
||||
rv := &PostingsIterator{
|
||||
postings: p,
|
||||
}
|
||||
if p.postingsID > 0 {
|
||||
allbits := p.dictionary.segment.Postings[p.postingsID-1]
|
||||
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
|
||||
rv.all = allbits.Iterator()
|
||||
if p.except != nil {
|
||||
allExcept := allbits.Clone()
|
||||
allExcept.AndNot(p.except)
|
||||
rv.actual = allExcept.Iterator()
|
||||
} else {
|
||||
rv.actual = allbits.Iterator()
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// PostingsIterator provides a way to iterate through the postings list
|
||||
type PostingsIterator struct {
|
||||
postings *PostingsList
|
||||
all roaring.IntIterable
|
||||
locations *roaring.Bitmap
|
||||
offset int
|
||||
locoffset int
|
||||
actual roaring.IntIterable
|
||||
}
|
||||
|
||||
// Next returns the next posting on the postings list, or nil at the end
|
||||
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
||||
if i.actual == nil || !i.actual.HasNext() {
|
||||
return nil, nil
|
||||
}
|
||||
n := i.actual.Next()
|
||||
allN := i.all.Next()
|
||||
|
||||
// n is the next actual hit (excluding some postings)
|
||||
// allN is the next hit in the full postings
|
||||
// if they don't match, adjust offsets to factor in item we're skipping over
|
||||
// incr the all iterator, and check again
|
||||
for allN != n {
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
allN = i.all.Next()
|
||||
}
|
||||
rv := &Posting{
|
||||
iterator: i,
|
||||
docNum: uint64(n),
|
||||
offset: i.offset,
|
||||
locoffset: i.locoffset,
|
||||
hasLoc: i.locations.Contains(n),
|
||||
}
|
||||
|
||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
||||
i.offset++
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Posting is a single entry in a postings list
|
||||
type Posting struct {
|
||||
iterator *PostingsIterator
|
||||
docNum uint64
|
||||
offset int
|
||||
locoffset int
|
||||
hasLoc bool
|
||||
}
|
||||
|
||||
// Number returns the document number of this posting in this segment
|
||||
func (p *Posting) Number() uint64 {
|
||||
return p.docNum
|
||||
}
|
||||
|
||||
// Frequency returns the frequence of occurance of this term in this doc/field
|
||||
func (p *Posting) Frequency() uint64 {
|
||||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
|
||||
}
|
||||
|
||||
// Norm returns the normalization factor for this posting
|
||||
func (p *Posting) Norm() float64 {
|
||||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
|
||||
}
|
||||
|
||||
// Locations returns the location information for each occurance
|
||||
func (p *Posting) Locations() []segment.Location {
|
||||
if !p.hasLoc {
|
||||
return nil
|
||||
}
|
||||
freq := int(p.Frequency())
|
||||
rv := make([]segment.Location, freq)
|
||||
for i := 0; i < freq; i++ {
|
||||
rv[i] = &Location{
|
||||
p: p,
|
||||
offset: p.locoffset + i,
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// Location represents the location of a single occurance
|
||||
type Location struct {
|
||||
p *Posting
|
||||
offset int
|
||||
}
|
||||
|
||||
// Field returns the name of the field (useful in composite fields to know
|
||||
// which original field the value came from)
|
||||
func (l *Location) Field() string {
|
||||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
||||
}
|
||||
|
||||
// Start returns the start byte offset of this occurance
|
||||
func (l *Location) Start() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// End returns the end byte offset of this occurance
|
||||
func (l *Location) End() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// Pos returns the 1-based phrase position of this occurance
|
||||
func (l *Location) Pos() uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
||||
|
||||
// ArrayPositions returns the array position vector associated with this occurance
|
||||
func (l *Location) ArrayPositions() []uint64 {
|
||||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
||||
}
|
|
@ -1,289 +0,0 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package mem
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// _id field is always guaranteed to have fieldID of 0
|
||||
const idFieldID uint16 = 0
|
||||
|
||||
// KNOWN ISSUES
|
||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
||||
// at the segment level, based on the first definition of a
|
||||
// field we see. in normal bleve usage this is fine, all
|
||||
// instances of a field definition will be the same. however,
|
||||
// advanced users may violate this and provide unique field
|
||||
// definitions with each document. this segment does not
|
||||
// support this usage.
|
||||
|
||||
// TODO
|
||||
// - need better testing of multiple docs, iterating freqs, locations and
|
||||
// and verifying the correct results are returned
|
||||
|
||||
// Segment is an in memory implementation of scorch.Segment
|
||||
type Segment struct {
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Postings list
|
||||
// postings list id -> bitmap by docNum
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// Postings list has locations
|
||||
PostingsLocs []*roaring.Bitmap
|
||||
|
||||
// Term frequencies
|
||||
// postings list id -> Freqs (one for each hit in bitmap)
|
||||
Freqs [][]uint64
|
||||
|
||||
// Field norms
|
||||
// postings list id -> Norms (one for each hit in bitmap)
|
||||
Norms [][]float32
|
||||
|
||||
// Field/start/end/pos/locarraypos
|
||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
||||
Locfields [][]uint16
|
||||
Locstarts [][]uint64
|
||||
Locends [][]uint64
|
||||
Locpos [][]uint64
|
||||
Locarraypos [][][]uint64
|
||||
|
||||
// Stored field values
|
||||
// docNum -> field id -> slice of values (each value []byte)
|
||||
Stored []map[uint16][][]byte
|
||||
|
||||
// Stored field types
|
||||
// docNum -> field id -> slice of types (each type byte)
|
||||
StoredTypes []map[uint16][]byte
|
||||
|
||||
// Stored field array positions
|
||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
||||
StoredPos []map[uint16][][]uint64
|
||||
|
||||
// For storing the docValue persisted fields
|
||||
DocValueFields map[uint16]bool
|
||||
|
||||
// Footprint of the segment, updated when analyzed document mutations
|
||||
// are added into the segment
|
||||
sizeInBytes uint64
|
||||
}
|
||||
|
||||
// New builds a new empty Segment
|
||||
func New() *Segment {
|
||||
return &Segment{
|
||||
FieldsMap: map[string]uint16{},
|
||||
DocValueFields: map[uint16]bool{},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Segment) updateSizeInBytes() {
|
||||
var sizeInBytes uint64
|
||||
|
||||
// FieldsMap, FieldsInv
|
||||
for k, _ := range s.FieldsMap {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
2 /* size of uint16 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
|
||||
// Dicts, DictKeys
|
||||
for _, entry := range s.Dicts {
|
||||
for k, _ := range entry {
|
||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
||||
8 /* size of uint64 */)
|
||||
}
|
||||
// overhead from the data structures
|
||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Postings, PostingsLocs
|
||||
for i := 0; i < len(s.Postings); i++ {
|
||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
|
||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Freqs, Norms
|
||||
for i := 0; i < len(s.Freqs); i++ {
|
||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
|
||||
len(s.Norms[i])*4 /* size of float32 */) +
|
||||
(segment.SizeOfSlice * 2)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
||||
|
||||
// Location data
|
||||
for i := 0; i < len(s.Locfields); i++ {
|
||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
|
||||
len(s.Locstarts[i])*8 /* size of uint64 */ +
|
||||
len(s.Locends[i])*8 /* size of uint64 */ +
|
||||
len(s.Locpos[i])*8 /* size of uint64 */)
|
||||
|
||||
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
}
|
||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
||||
|
||||
// Stored data
|
||||
for i := 0; i < len(s.Stored); i++ {
|
||||
for _, v := range s.Stored[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredTypes[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
|
||||
}
|
||||
|
||||
for _, v := range s.StoredPos[i] {
|
||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
||||
for _, arr := range v {
|
||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
|
||||
segment.SizeOfSlice
|
||||
}
|
||||
sizeInBytes += segment.SizeOfSlice
|
||||
}
|
||||
|
||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfMap * 3)
|
||||
}
|
||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
||||
sizeInBytes += (segment.SizeOfSlice * 3)
|
||||
|
||||
// DocValueFields
|
||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
|
||||
segment.SizeOfMap
|
||||
|
||||
// SizeInBytes
|
||||
sizeInBytes += uint64(8)
|
||||
|
||||
s.sizeInBytes = sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) SizeInBytes() uint64 {
|
||||
return s.sizeInBytes
|
||||
}
|
||||
|
||||
func (s *Segment) AddRef() {
|
||||
}
|
||||
|
||||
func (s *Segment) DecRef() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Fields returns the field names used in this segment
|
||||
func (s *Segment) Fields() []string {
|
||||
return s.FieldsInv
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
// ensure document number exists
|
||||
if int(num) > len(s.Stored)-1 {
|
||||
return nil
|
||||
}
|
||||
docFields := s.Stored[int(num)]
|
||||
st := s.StoredTypes[int(num)]
|
||||
sp := s.StoredPos[int(num)]
|
||||
for field, values := range docFields {
|
||||
for i, value := range values {
|
||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
||||
if !keepGoing {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Segment) getField(name string) (int, error) {
|
||||
fieldID, ok := s.FieldsMap[name]
|
||||
if !ok {
|
||||
return 0, fmt.Errorf("no field named %s", name)
|
||||
}
|
||||
return int(fieldID - 1), nil
|
||||
}
|
||||
|
||||
// Dictionary returns the term dictionary for the specified field
|
||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
fieldID, err := s.getField(field)
|
||||
if err != nil {
|
||||
// no such field, return empty dictionary
|
||||
return &segment.EmptyDictionary{}, nil
|
||||
}
|
||||
return &Dictionary{
|
||||
segment: s,
|
||||
field: field,
|
||||
fieldID: uint16(fieldID),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment
|
||||
// (this has no notion of deleted docs)
|
||||
func (s *Segment) Count() uint64 {
|
||||
return uint64(len(s.Stored))
|
||||
}
|
||||
|
||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
||||
// provided _id strings
|
||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
||||
rv := roaring.New()
|
||||
|
||||
// guard against empty segment
|
||||
if len(s.FieldsMap) > 0 {
|
||||
idDictionary := s.Dicts[idFieldID]
|
||||
|
||||
for _, id := range ids {
|
||||
postingID := idDictionary[id]
|
||||
if postingID > 0 {
|
||||
rv.Or(s.Postings[postingID-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// Close releases all resources associated with this segment
|
||||
func (s *Segment) Close() error {
|
||||
return nil
|
||||
}
|
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
|
@ -0,0 +1,75 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package segment
|
||||
|
||||
import (
|
||||
"regexp/syntax"
|
||||
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
|
||||
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
|
||||
|
||||
parsed, err := syntax.Parse(pattern, syntax.Perl)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
|
||||
if err != nil {
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
prefix := LiteralPrefix(parsed)
|
||||
if prefix != "" {
|
||||
prefixBeg := []byte(prefix)
|
||||
prefixEnd := IncrementBytes(prefixBeg)
|
||||
return re, prefixBeg, prefixEnd, nil
|
||||
}
|
||||
|
||||
return re, nil, nil, nil
|
||||
}
|
||||
|
||||
// Returns the literal prefix given the parse tree for a regexp
|
||||
func LiteralPrefix(s *syntax.Regexp) string {
|
||||
// traverse the left-most branch in the parse tree as long as the
|
||||
// node represents a concatenation
|
||||
for s != nil && s.Op == syntax.OpConcat {
|
||||
if len(s.Sub) < 1 {
|
||||
return ""
|
||||
}
|
||||
|
||||
s = s.Sub[0]
|
||||
}
|
||||
|
||||
if s.Op == syntax.OpLiteral {
|
||||
return string(s.Rune)
|
||||
}
|
||||
|
||||
return "" // no literal prefix
|
||||
}
|
||||
|
||||
func IncrementBytes(in []byte) []byte {
|
||||
rv := make([]byte, len(in))
|
||||
copy(rv, in)
|
||||
for i := len(rv) - 1; i >= 0; i-- {
|
||||
rv[i] = rv[i] + 1
|
||||
if rv[i] != 0 {
|
||||
return rv // didn't overflow, so stop
|
||||
}
|
||||
}
|
||||
return nil // overflowed
|
||||
}
|
|
@ -15,15 +15,14 @@
|
|||
package segment
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
)
|
||||
|
||||
// Overhead from go data structures when deployed on a 64-bit system.
|
||||
const SizeOfMap uint64 = 8
|
||||
const SizeOfPointer uint64 = 8
|
||||
const SizeOfSlice uint64 = 24
|
||||
const SizeOfString uint64 = 16
|
||||
var ErrClosed = fmt.Errorf("index closed")
|
||||
|
||||
// DocumentFieldValueVisitor defines a callback to be visited for each
|
||||
// stored field value. The return value determines if the visitor
|
||||
|
@ -34,6 +33,9 @@ type Segment interface {
|
|||
Dictionary(field string) (TermDictionary, error)
|
||||
|
||||
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
|
||||
|
||||
DocID(num uint64) ([]byte, error)
|
||||
|
||||
Count() uint64
|
||||
|
||||
DocNumbers([]string) (*roaring.Bitmap, error)
|
||||
|
@ -42,18 +44,21 @@ type Segment interface {
|
|||
|
||||
Close() error
|
||||
|
||||
SizeInBytes() uint64
|
||||
Size() int
|
||||
|
||||
AddRef()
|
||||
DecRef() error
|
||||
}
|
||||
|
||||
type TermDictionary interface {
|
||||
PostingsList(term string, except *roaring.Bitmap) (PostingsList, error)
|
||||
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
|
||||
|
||||
Iterator() DictionaryIterator
|
||||
PrefixIterator(prefix string) DictionaryIterator
|
||||
RangeIterator(start, end string) DictionaryIterator
|
||||
AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
|
||||
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
|
||||
}
|
||||
|
||||
type DictionaryIterator interface {
|
||||
|
@ -61,7 +66,9 @@ type DictionaryIterator interface {
|
|||
}
|
||||
|
||||
type PostingsList interface {
|
||||
Iterator() PostingsIterator
|
||||
Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator
|
||||
|
||||
Size() int
|
||||
|
||||
Count() uint64
|
||||
|
||||
|
@ -77,6 +84,14 @@ type PostingsIterator interface {
|
|||
// implementations may return a shared instance to reduce memory
|
||||
// allocations.
|
||||
Next() (Posting, error)
|
||||
|
||||
// Advance will return the posting with the specified doc number
|
||||
// or if there is no such posting, the next posting.
|
||||
// Callers MUST NOT attempt to pass a docNum that is less than or
|
||||
// equal to the currently visited posting doc Num.
|
||||
Advance(docNum uint64) (Posting, error)
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type Posting interface {
|
||||
|
@ -86,6 +101,8 @@ type Posting interface {
|
|||
Norm() float64
|
||||
|
||||
Locations() []Location
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type Location interface {
|
||||
|
@ -94,6 +111,7 @@ type Location interface {
|
|||
End() uint64
|
||||
Pos() uint64
|
||||
ArrayPositions() []uint64
|
||||
Size() int
|
||||
}
|
||||
|
||||
// DocumentFieldTermVisitable is implemented by various scorch segment
|
||||
|
@ -101,10 +119,17 @@ type Location interface {
|
|||
// postings or other indexed values.
|
||||
type DocumentFieldTermVisitable interface {
|
||||
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error
|
||||
visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error)
|
||||
|
||||
// VisitableDocValueFields implementation should return
|
||||
// the list of fields which are document value persisted and
|
||||
// therefore visitable by the above VisitDocumentFieldTerms method.
|
||||
VisitableDocValueFields() ([]string, error)
|
||||
}
|
||||
|
||||
type DocVisitState interface {
|
||||
}
|
||||
|
||||
type StatsReporter interface {
|
||||
ReportBytesWritten(bytesWritten uint64)
|
||||
}
|
||||
|
|
|
@ -16,19 +16,13 @@ package zap
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"os"
|
||||
"sort"
|
||||
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
const version uint32 = 3
|
||||
const Version uint32 = 11
|
||||
|
||||
const Type string = "zap"
|
||||
|
||||
const fieldNotUninverted = math.MaxUint64
|
||||
|
||||
|
@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// PersistSegment takes the in-memory segment and persists it to
|
||||
// the specified path in the zap file format.
|
||||
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
|
||||
flag := os.O_RDWR | os.O_CREATE
|
||||
|
||||
f, err := os.OpenFile(path, flag, 0600)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cleanup := func() {
|
||||
_ = f.Close()
|
||||
_ = os.Remove(path)
|
||||
}
|
||||
|
||||
// buffer the output
|
||||
br := bufio.NewWriter(f)
|
||||
|
||||
// wrap it for counting (tracking offsets)
|
||||
cr := NewCountHashWriter(br)
|
||||
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
|
||||
persistBase(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
||||
chunkFactor, cr.Sum32(), cr)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = br.Flush()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = f.Sync()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
err = f.Close()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
||||
dictLocs []uint64, err error) {
|
||||
docValueOffset = uint64(fieldNotUninverted)
|
||||
|
||||
if len(memSegment.Stored) > 0 {
|
||||
storedIndexOffset, err = persistStored(memSegment, cr)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
postingsListLocs, err := persistPostingsLocs(memSegment, cr)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
} else {
|
||||
dictLocs = make([]uint64, len(memSegment.FieldsInv))
|
||||
}
|
||||
|
||||
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
|
||||
if err != nil {
|
||||
return 0, 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
||||
dictLocs, nil
|
||||
}
|
||||
|
||||
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
|
||||
var curr int
|
||||
var metaBuf bytes.Buffer
|
||||
var data, compressed []byte
|
||||
|
||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
||||
|
||||
docNumOffsets := make(map[int]uint64, len(memSegment.Stored))
|
||||
|
||||
for docNum, storedValues := range memSegment.Stored {
|
||||
if docNum != 0 {
|
||||
// reset buffer if necessary
|
||||
curr = 0
|
||||
metaBuf.Reset()
|
||||
data = data[:0]
|
||||
compressed = compressed[:0]
|
||||
}
|
||||
|
||||
st := memSegment.StoredTypes[docNum]
|
||||
sp := memSegment.StoredPos[docNum]
|
||||
|
||||
// encode fields in order
|
||||
for fieldID := range memSegment.FieldsInv {
|
||||
if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
|
||||
stf := st[uint16(fieldID)]
|
||||
spf := sp[uint16(fieldID)]
|
||||
|
||||
var err2 error
|
||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
||||
if err2 != nil {
|
||||
return 0, err2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metaEncoder.Close()
|
||||
metaBytes := metaBuf.Bytes()
|
||||
|
||||
// compress the data
|
||||
compressed = snappy.Encode(compressed, data)
|
||||
|
||||
// record where we're about to start writing
|
||||
docNumOffsets[docNum] = uint64(w.Count())
|
||||
|
||||
// write out the meta len and compressed data len
|
||||
_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// now write the meta
|
||||
_, err = w.Write(metaBytes)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
// now write the compressed data
|
||||
_, err = w.Write(compressed)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
// return value is the start of the stored index
|
||||
rv := uint64(w.Count())
|
||||
// now write out the stored doc index
|
||||
for docNum := range memSegment.Stored {
|
||||
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistStoredFieldValues(fieldID int,
|
||||
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
|
||||
curr int, metaEncoder *govarint.Base128Encoder, data []byte) (
|
||||
curr int, metaEncode varintEncoder, data []byte) (
|
||||
int, []byte, error) {
|
||||
for i := 0; i < len(storedFieldValues); i++ {
|
||||
// encode field
|
||||
_, err := metaEncoder.PutU64(uint64(fieldID))
|
||||
_, err := metaEncode(uint64(fieldID))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode type
|
||||
_, err = metaEncoder.PutU64(uint64(stf[i]))
|
||||
_, err = metaEncode(uint64(stf[i]))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode start offset
|
||||
_, err = metaEncoder.PutU64(uint64(curr))
|
||||
_, err = metaEncode(uint64(curr))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// end len
|
||||
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
|
||||
_, err = metaEncode(uint64(len(storedFieldValues[i])))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode number of array pos
|
||||
_, err = metaEncoder.PutU64(uint64(len(spf[i])))
|
||||
_, err = metaEncode(uint64(len(spf[i])))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// encode all array positions
|
||||
for _, pos := range spf[i] {
|
||||
_, err = metaEncoder.PutU64(pos)
|
||||
_, err = metaEncode(pos)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int,
|
|||
return curr, data, nil
|
||||
}
|
||||
|
||||
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
|
||||
var freqOffsets, locOfffsets []uint64
|
||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
for postingID := range memSegment.Postings {
|
||||
if postingID != 0 {
|
||||
tfEncoder.Reset()
|
||||
}
|
||||
freqs := memSegment.Freqs[postingID]
|
||||
norms := memSegment.Norms[postingID]
|
||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
||||
var offset int
|
||||
for postingsListItr.HasNext() {
|
||||
|
||||
docNum := uint64(postingsListItr.Next())
|
||||
|
||||
// put freq
|
||||
err := tfEncoder.Add(docNum, freqs[offset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put norm
|
||||
norm := norms[offset]
|
||||
normBits := math.Float32bits(norm)
|
||||
err = tfEncoder.Add(docNum, uint64(normBits))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
offset++
|
||||
}
|
||||
|
||||
// record where this postings freq info starts
|
||||
freqOffsets = append(freqOffsets, uint64(w.Count()))
|
||||
|
||||
tfEncoder.Close()
|
||||
_, err := tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// now do it again for the locations
|
||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
for postingID := range memSegment.Postings {
|
||||
if postingID != 0 {
|
||||
locEncoder.Reset()
|
||||
}
|
||||
freqs := memSegment.Freqs[postingID]
|
||||
locfields := memSegment.Locfields[postingID]
|
||||
locpos := memSegment.Locpos[postingID]
|
||||
locstarts := memSegment.Locstarts[postingID]
|
||||
locends := memSegment.Locends[postingID]
|
||||
locarraypos := memSegment.Locarraypos[postingID]
|
||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
||||
var offset int
|
||||
var locOffset int
|
||||
for postingsListItr.HasNext() {
|
||||
docNum := uint64(postingsListItr.Next())
|
||||
for i := 0; i < int(freqs[offset]); i++ {
|
||||
if len(locfields) > 0 {
|
||||
// put field
|
||||
err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put pos
|
||||
err = locEncoder.Add(docNum, locpos[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put start
|
||||
err = locEncoder.Add(docNum, locstarts[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put end
|
||||
err = locEncoder.Add(docNum, locends[locOffset])
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put the number of array positions to follow
|
||||
num := len(locarraypos[locOffset])
|
||||
err = locEncoder.Add(docNum, uint64(num))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
// put each array position
|
||||
for _, pos := range locarraypos[locOffset] {
|
||||
err = locEncoder.Add(docNum, pos)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
locOffset++
|
||||
}
|
||||
offset++
|
||||
}
|
||||
|
||||
// record where this postings loc info starts
|
||||
locOfffsets = append(locOfffsets, uint64(w.Count()))
|
||||
locEncoder.Close()
|
||||
_, err := locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
return freqOffsets, locOfffsets, nil
|
||||
}
|
||||
|
||||
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
|
||||
rv = make([]uint64, 0, len(memSegment.PostingsLocs))
|
||||
var reuseBuf bytes.Buffer
|
||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
||||
for postingID := range memSegment.PostingsLocs {
|
||||
// record where we start this posting loc
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
// write out the length and bitmap
|
||||
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
|
||||
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
|
||||
rv = make([]uint64, 0, len(memSegment.Postings))
|
||||
var reuseBuf bytes.Buffer
|
||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
||||
for postingID := range memSegment.Postings {
|
||||
// record where we start this posting list
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
|
||||
// write out the term info, loc info, and loc posting list offset
|
||||
_, err = writeUvarints(w, freqOffsets[postingID],
|
||||
locOffsets[postingID], postingsListLocs[postingID])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write out the length and bitmap
|
||||
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
|
||||
rv := make([]uint64, 0, len(memSegment.DictKeys))
|
||||
|
||||
varintBuf := make([]byte, binary.MaxVarintLen64)
|
||||
|
||||
var buffer bytes.Buffer
|
||||
for fieldID, fieldTerms := range memSegment.DictKeys {
|
||||
if fieldID != 0 {
|
||||
buffer.Reset()
|
||||
}
|
||||
|
||||
// start a new vellum for this field
|
||||
builder, err := vellum.New(&buffer, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dict := memSegment.Dicts[fieldID]
|
||||
// now walk the dictionary in order of fieldTerms (already sorted)
|
||||
for _, fieldTerm := range fieldTerms {
|
||||
postingID := dict[fieldTerm] - 1
|
||||
postingsAddr := postingsLocs[postingID]
|
||||
err = builder.Insert([]byte(fieldTerm), postingsAddr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
err = builder.Close()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// record where this dictionary starts
|
||||
rv = append(rv, uint64(w.Count()))
|
||||
|
||||
vellumData := buffer.Bytes()
|
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
|
||||
_, err = w.Write(varintBuf[:n])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// write this vellum to disk
|
||||
_, err = w.Write(vellumData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
type docIDRange []uint64
|
||||
|
||||
func (a docIDRange) Len() int { return len(a) }
|
||||
func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
||||
func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] }
|
||||
|
||||
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
||||
chunkFactor uint32) (map[uint16]uint64, error) {
|
||||
fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
||||
|
||||
for fieldID := range memSegment.DocValueFields {
|
||||
field := memSegment.FieldsInv[fieldID]
|
||||
docTermMap := make(map[uint64][]byte, 0)
|
||||
dict, err := memSegment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dictItr := dict.Iterator()
|
||||
next, err := dictItr.Next()
|
||||
for err == nil && next != nil {
|
||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
||||
if err1 != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
postingsItr := postings.Iterator()
|
||||
nextPosting, err2 := postingsItr.Next()
|
||||
for err2 == nil && nextPosting != nil {
|
||||
docNum := nextPosting.Number()
|
||||
docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...)
|
||||
docTermMap[docNum] = append(docTermMap[docNum], termSeparator)
|
||||
nextPosting, err2 = postingsItr.Next()
|
||||
}
|
||||
if err2 != nil {
|
||||
return nil, err2
|
||||
}
|
||||
|
||||
next, err = dictItr.Next()
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// sort wrt to docIDs
|
||||
var docNumbers docIDRange
|
||||
for k := range docTermMap {
|
||||
docNumbers = append(docNumbers, k)
|
||||
}
|
||||
sort.Sort(docNumbers)
|
||||
|
||||
for _, docNum := range docNumbers {
|
||||
err = fdvEncoder.Add(docNum, docTermMap[docNum])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
fieldChunkOffsets[fieldID] = uint64(w.Count())
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// persist the doc value details for this field
|
||||
_, err = fdvEncoder.Write(w)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// reseting encoder for the next field
|
||||
fdvEncoder.Reset()
|
||||
}
|
||||
|
||||
return fieldChunkOffsets, nil
|
||||
}
|
||||
|
||||
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
||||
chunkFactor uint32) (uint64, error) {
|
||||
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
fieldDocValuesOffset := uint64(w.Count())
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
offset := uint64(0)
|
||||
ok := true
|
||||
for fieldID := range memSegment.FieldsInv {
|
||||
// if the field isn't configured for docValue, then mark
|
||||
// the offset accordingly
|
||||
if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok {
|
||||
offset = fieldNotUninverted
|
||||
}
|
||||
n := binary.PutUvarint(buf, uint64(offset))
|
||||
_, err := w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return fieldDocValuesOffset, nil
|
||||
}
|
||||
|
||||
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
|
||||
var br bytes.Buffer
|
||||
|
||||
cr := NewCountHashWriter(&br)
|
||||
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
|
||||
persistBase(memSegment, cr, chunkFactor)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
||||
memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
|
||||
storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs)
|
||||
}
|
||||
|
||||
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
||||
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
|
||||
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
|
||||
|
@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
|||
fieldsIndexOffset: fieldsIndexOffset,
|
||||
docValueOffset: docValueOffset,
|
||||
dictLocs: dictLocs,
|
||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
||||
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||
}
|
||||
sb.updateSize()
|
||||
|
||||
err := sb.loadDvIterators()
|
||||
err := sb.loadDvReaders()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -18,10 +18,18 @@ import (
|
|||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
"reflect"
|
||||
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var reflectStaticSizeMetaData int
|
||||
|
||||
func init() {
|
||||
var md MetaData
|
||||
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size())
|
||||
}
|
||||
|
||||
var termSeparator byte = 0xff
|
||||
var termSeparatorSplitSlice = []byte{termSeparator}
|
||||
|
||||
|
@ -30,29 +38,36 @@ type chunkedContentCoder struct {
|
|||
chunkSize uint64
|
||||
currChunk uint64
|
||||
chunkLens []uint64
|
||||
|
||||
w io.Writer
|
||||
progressiveWrite bool
|
||||
|
||||
chunkMetaBuf bytes.Buffer
|
||||
chunkBuf bytes.Buffer
|
||||
|
||||
chunkMeta []MetaData
|
||||
|
||||
compressed []byte // temp buf for snappy compression
|
||||
}
|
||||
|
||||
// MetaData represents the data information inside a
|
||||
// chunk.
|
||||
type MetaData struct {
|
||||
DocNum uint64 // docNum of the data inside the chunk
|
||||
DocDvLoc uint64 // starting offset for a given docid
|
||||
DocDvLen uint64 // length of data inside the chunk for the given docid
|
||||
DocDvOffset uint64 // offset of data inside the chunk for the given docid
|
||||
}
|
||||
|
||||
// newChunkedContentCoder returns a new chunk content coder which
|
||||
// packs data into chunks based on the provided chunkSize
|
||||
func newChunkedContentCoder(chunkSize uint64,
|
||||
maxDocNum uint64) *chunkedContentCoder {
|
||||
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
|
||||
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
|
||||
total := maxDocNum/chunkSize + 1
|
||||
rv := &chunkedContentCoder{
|
||||
chunkSize: chunkSize,
|
||||
chunkLens: make([]uint64, total),
|
||||
chunkMeta: make([]MetaData, 0, total),
|
||||
w: w,
|
||||
progressiveWrite: progressiveWrite,
|
||||
}
|
||||
|
||||
return rv
|
||||
|
@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error {
|
|||
|
||||
// write out the metaData slice
|
||||
for _, meta := range c.chunkMeta {
|
||||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
|
||||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error {
|
|||
metaData := c.chunkMetaBuf.Bytes()
|
||||
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
|
||||
// write the compressed data to the final data
|
||||
compressedData := snappy.Encode(nil, c.chunkBuf.Bytes())
|
||||
c.final = append(c.final, compressedData...)
|
||||
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
|
||||
c.final = append(c.final, c.compressed...)
|
||||
|
||||
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
|
||||
|
||||
if c.progressiveWrite {
|
||||
_, err := c.w.Write(c.final)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.final = c.final[:0]
|
||||
}
|
||||
|
||||
c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData))
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
|||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
// mark the starting offset for this doc
|
||||
// get the starting offset for this doc
|
||||
dvOffset := c.chunkBuf.Len()
|
||||
dvSize, err := c.chunkBuf.Write(vals)
|
||||
if err != nil {
|
||||
|
@ -131,37 +155,76 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
|||
|
||||
c.chunkMeta = append(c.chunkMeta, MetaData{
|
||||
DocNum: docNum,
|
||||
DocDvLoc: uint64(dvOffset),
|
||||
DocDvLen: uint64(dvSize),
|
||||
DocDvOffset: uint64(dvOffset + dvSize),
|
||||
})
|
||||
return nil
|
||||
}
|
||||
|
||||
// Write commits all the encoded chunked contents to the provided writer.
|
||||
func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
|
||||
//
|
||||
// | ..... data ..... | chunk offsets (varints)
|
||||
// | position of chunk offsets (uint64) | number of offsets (uint64) |
|
||||
//
|
||||
func (c *chunkedContentCoder) Write() (int, error) {
|
||||
var tw int
|
||||
buf := make([]byte, binary.MaxVarintLen64)
|
||||
|
||||
if c.final != nil {
|
||||
// write out the data section first
|
||||
nw, err := c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
}
|
||||
|
||||
chunkOffsetsStart := uint64(tw)
|
||||
|
||||
if cap(c.final) < binary.MaxVarintLen64 {
|
||||
c.final = make([]byte, binary.MaxVarintLen64)
|
||||
} else {
|
||||
c.final = c.final[0:binary.MaxVarintLen64]
|
||||
}
|
||||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||
// write out the chunk offsets
|
||||
for _, chunkOffset := range chunkOffsets {
|
||||
n := binary.PutUvarint(c.final, chunkOffset)
|
||||
nw, err := c.w.Write(c.final[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
}
|
||||
|
||||
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
|
||||
|
||||
c.final = c.final[0:8]
|
||||
// write out the length of chunk offsets
|
||||
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
|
||||
nw, err := c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
// write out the number of chunks
|
||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
||||
nw, err := w.Write(buf[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
// write out the chunk lens
|
||||
for _, chunkLen := range c.chunkLens {
|
||||
n := binary.PutUvarint(buf, uint64(chunkLen))
|
||||
nw, err = w.Write(buf[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
}
|
||||
// write out the data
|
||||
nw, err = w.Write(c.final)
|
||||
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
|
||||
nw, err = c.w.Write(c.final)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
c.final = c.final[:0]
|
||||
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
// ReadDocValueBoundary elicits the start, end offsets from a
|
||||
// metaData header slice
|
||||
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
|
||||
var start uint64
|
||||
if chunk > 0 {
|
||||
start = metaHeaders[chunk-1].DocDvOffset
|
||||
}
|
||||
return start, metaHeaders[chunk].DocDvOffset
|
||||
}
|
||||
|
|
|
@ -17,6 +17,8 @@ package zap
|
|||
import (
|
||||
"hash/crc32"
|
||||
"io"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
)
|
||||
|
||||
// CountHashWriter is a wrapper around a Writer which counts the number of
|
||||
|
@ -25,6 +27,7 @@ type CountHashWriter struct {
|
|||
w io.Writer
|
||||
crc uint32
|
||||
n int
|
||||
s segment.StatsReporter
|
||||
}
|
||||
|
||||
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
|
||||
|
@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter {
|
|||
return &CountHashWriter{w: w}
|
||||
}
|
||||
|
||||
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter {
|
||||
return &CountHashWriter{w: w, s: s}
|
||||
}
|
||||
|
||||
// Write writes the provided bytes to the wrapped writer and counts the bytes
|
||||
func (c *CountHashWriter) Write(b []byte) (int, error) {
|
||||
n, err := c.w.Write(b)
|
||||
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
|
||||
c.n += n
|
||||
if c.s != nil {
|
||||
c.s.ReportBytesWritten(uint64(n))
|
||||
}
|
||||
return n, err
|
||||
}
|
||||
|
||||
|
|
|
@ -15,13 +15,13 @@
|
|||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/couchbase/vellum/regexp"
|
||||
)
|
||||
|
||||
// Dictionary is the zap representation of the term dictionary
|
||||
|
@ -30,23 +30,36 @@ type Dictionary struct {
|
|||
field string
|
||||
fieldID uint16
|
||||
fst *vellum.FST
|
||||
fstReader *vellum.Reader
|
||||
}
|
||||
|
||||
// PostingsList returns the postings list for the specified term
|
||||
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
return d.postingsList([]byte(term), except, nil)
|
||||
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
|
||||
prealloc segment.PostingsList) (segment.PostingsList, error) {
|
||||
var preallocPL *PostingsList
|
||||
pl, ok := prealloc.(*PostingsList)
|
||||
if ok && pl != nil {
|
||||
preallocPL = pl
|
||||
}
|
||||
return d.postingsList(term, except, preallocPL)
|
||||
}
|
||||
|
||||
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
|
||||
if d.fst == nil {
|
||||
if d.fstReader == nil {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
return emptyPostingsList, nil
|
||||
}
|
||||
return d.postingsListInit(rv, except), nil
|
||||
}
|
||||
|
||||
postingsOffset, exists, err := d.fst.Get(term)
|
||||
postingsOffset, exists, err := d.fstReader.Get(term)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("vellum err: %v", err)
|
||||
}
|
||||
if !exists {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
return emptyPostingsList, nil
|
||||
}
|
||||
return d.postingsListInit(rv, except), nil
|
||||
}
|
||||
|
||||
|
@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
|
|||
}
|
||||
|
||||
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
|
||||
if rv == nil {
|
||||
if rv == nil || rv == emptyPostingsList {
|
||||
rv = &PostingsList{}
|
||||
} else {
|
||||
postings := rv.postings
|
||||
if postings != nil {
|
||||
postings.Clear()
|
||||
}
|
||||
|
||||
*rv = PostingsList{} // clear the struct
|
||||
|
||||
rv.postings = postings
|
||||
}
|
||||
rv.sb = d.sb
|
||||
rv.except = except
|
||||
|
@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
|||
itr, err := d.fst.Iterator(nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
|||
d: d,
|
||||
}
|
||||
|
||||
kBeg := []byte(prefix)
|
||||
kEnd := segment.IncrementBytes(kBeg)
|
||||
|
||||
if d.fst != nil {
|
||||
r, err := regexp.New(prefix + ".*")
|
||||
if err == nil {
|
||||
itr, err := d.fst.Search(r, nil, nil)
|
||||
itr, err := d.fst.Iterator(kBeg, kEnd)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
}
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
|||
itr, err := d.fst.Iterator([]byte(start), endBytes)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// AutomatonIterator returns an iterator which only visits terms
|
||||
// having the the vellum automaton and start/end key range
|
||||
func (d *Dictionary) AutomatonIterator(a vellum.Automaton,
|
||||
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
}
|
||||
|
||||
if d.fst != nil {
|
||||
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
|
||||
includeCount bool) segment.DictionaryIterator {
|
||||
|
||||
rv := &DictionaryIterator{
|
||||
d: d,
|
||||
omitCount: !includeCount,
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
builder, err := vellum.New(&buf, nil)
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
for _, term := range onlyTerms {
|
||||
err = builder.Insert(term, 0)
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
}
|
||||
err = builder.Close()
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
|
||||
onlyFST, err := vellum.Load(buf.Bytes())
|
||||
if err != nil {
|
||||
rv.err = err
|
||||
return rv
|
||||
}
|
||||
|
||||
itr, err := d.fst.Search(onlyFST, nil, nil)
|
||||
if err == nil {
|
||||
rv.itr = itr
|
||||
} else if err != vellum.ErrIteratorDone {
|
||||
rv.err = err
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// DictionaryIterator is an iterator for term dictionary
|
||||
type DictionaryIterator struct {
|
||||
d *Dictionary
|
||||
itr vellum.Iterator
|
||||
err error
|
||||
tmp PostingsList
|
||||
entry index.DictEntry
|
||||
omitCount bool
|
||||
}
|
||||
|
||||
// Next returns the next entry in the dictionary
|
||||
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||
if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||
return nil, nil
|
||||
} else if i.err != nil {
|
||||
if i.err != nil && i.err != vellum.ErrIteratorDone {
|
||||
return nil, i.err
|
||||
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||
return nil, nil
|
||||
}
|
||||
term, postingsOffset := i.itr.Current()
|
||||
i.entry.Term = string(term)
|
||||
if !i.omitCount {
|
||||
i.err = i.tmp.read(postingsOffset, i.d)
|
||||
if i.err != nil {
|
||||
return nil, i.err
|
||||
}
|
||||
rv := &index.DictEntry{
|
||||
Term: string(term),
|
||||
Count: i.tmp.Count(),
|
||||
i.entry.Count = i.tmp.Count()
|
||||
}
|
||||
i.err = i.itr.Next()
|
||||
return rv, nil
|
||||
return &i.entry, nil
|
||||
}
|
||||
|
|
|
@ -19,93 +19,129 @@ import (
|
|||
"encoding/binary"
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
type docValueIterator struct {
|
||||
var reflectStaticSizedocValueReader int
|
||||
|
||||
func init() {
|
||||
var dvi docValueReader
|
||||
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
|
||||
}
|
||||
|
||||
type docNumTermsVisitor func(docNum uint64, terms []byte) error
|
||||
|
||||
type docVisitState struct {
|
||||
dvrs map[uint16]*docValueReader
|
||||
segment *Segment
|
||||
}
|
||||
|
||||
type docValueReader struct {
|
||||
field string
|
||||
curChunkNum uint64
|
||||
numChunks uint64
|
||||
chunkLens []uint64
|
||||
chunkOffsets []uint64
|
||||
dvDataLoc uint64
|
||||
curChunkHeader []MetaData
|
||||
curChunkData []byte // compressed data cache
|
||||
uncompressed []byte // temp buf for snappy decompression
|
||||
}
|
||||
|
||||
func (di *docValueIterator) sizeInBytes() uint64 {
|
||||
// curChunkNum, numChunks, dvDataLoc --> uint64
|
||||
sizeInBytes := 24
|
||||
|
||||
// field
|
||||
sizeInBytes += (len(di.field) + int(segment.SizeOfString))
|
||||
|
||||
// chunkLens, curChunkHeader
|
||||
sizeInBytes += len(di.chunkLens)*8 +
|
||||
len(di.curChunkHeader)*24 +
|
||||
int(segment.SizeOfSlice*2) /* overhead from slices */
|
||||
|
||||
// curChunkData is mmap'ed, not included
|
||||
|
||||
return uint64(sizeInBytes)
|
||||
func (di *docValueReader) size() int {
|
||||
return reflectStaticSizedocValueReader + size.SizeOfPtr +
|
||||
len(di.field) +
|
||||
len(di.chunkOffsets)*size.SizeOfUint64 +
|
||||
len(di.curChunkHeader)*reflectStaticSizeMetaData +
|
||||
len(di.curChunkData)
|
||||
}
|
||||
|
||||
func (di *docValueIterator) fieldName() string {
|
||||
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
|
||||
if rv == nil {
|
||||
rv = &docValueReader{}
|
||||
}
|
||||
|
||||
rv.field = di.field
|
||||
rv.curChunkNum = math.MaxUint64
|
||||
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
|
||||
rv.dvDataLoc = di.dvDataLoc
|
||||
rv.curChunkHeader = rv.curChunkHeader[:0]
|
||||
rv.curChunkData = nil
|
||||
rv.uncompressed = rv.uncompressed[:0]
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (di *docValueReader) fieldName() string {
|
||||
return di.field
|
||||
}
|
||||
|
||||
func (di *docValueIterator) curChunkNumber() uint64 {
|
||||
func (di *docValueReader) curChunkNumber() uint64 {
|
||||
return di.curChunkNum
|
||||
}
|
||||
|
||||
func (s *SegmentBase) loadFieldDocValueIterator(field string,
|
||||
fieldDvLoc uint64) (*docValueIterator, error) {
|
||||
func (s *SegmentBase) loadFieldDocValueReader(field string,
|
||||
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
|
||||
// get the docValue offset for the given fields
|
||||
if fieldDvLoc == fieldNotUninverted {
|
||||
return nil, fmt.Errorf("loadFieldDocValueIterator: "+
|
||||
if fieldDvLocStart == fieldNotUninverted {
|
||||
return nil, fmt.Errorf("loadFieldDocValueReader: "+
|
||||
"no docValues found for field: %s", field)
|
||||
}
|
||||
|
||||
// read the number of chunks, chunk lengths
|
||||
var offset, clen uint64
|
||||
numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
return nil, fmt.Errorf("failed to read the field "+
|
||||
"doc values for field %s", field)
|
||||
}
|
||||
offset += uint64(read)
|
||||
// read the number of chunks, and chunk offsets position
|
||||
var numChunks, chunkOffsetsPosition uint64
|
||||
|
||||
fdvIter := &docValueIterator{
|
||||
if fieldDvLocEnd-fieldDvLocStart > 16 {
|
||||
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
|
||||
// read the length of chunk offsets
|
||||
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
|
||||
// acquire position of chunk offsets
|
||||
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
|
||||
}
|
||||
|
||||
fdvIter := &docValueReader{
|
||||
curChunkNum: math.MaxUint64,
|
||||
field: field,
|
||||
chunkLens: make([]uint64, int(numChunks)),
|
||||
chunkOffsets: make([]uint64, int(numChunks)),
|
||||
}
|
||||
|
||||
// read the chunk offsets
|
||||
var offset uint64
|
||||
for i := 0; i < int(numChunks); i++ {
|
||||
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
|
||||
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
return nil, fmt.Errorf("corrupted chunk length during segment load")
|
||||
return nil, fmt.Errorf("corrupted chunk offset during segment load")
|
||||
}
|
||||
fdvIter.chunkLens[i] = clen
|
||||
fdvIter.chunkOffsets[i] = loc
|
||||
offset += uint64(read)
|
||||
}
|
||||
|
||||
fdvIter.dvDataLoc = fieldDvLoc + offset
|
||||
// set the data offset
|
||||
fdvIter.dvDataLoc = fieldDvLocStart
|
||||
|
||||
return fdvIter, nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) loadDvChunk(chunkNumber,
|
||||
localDocNum uint64, s *SegmentBase) error {
|
||||
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
|
||||
// advance to the chunk where the docValues
|
||||
// reside for the given docNum
|
||||
destChunkDataLoc := di.dvDataLoc
|
||||
for i := 0; i < int(chunkNumber); i++ {
|
||||
destChunkDataLoc += di.chunkLens[i]
|
||||
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
|
||||
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
|
||||
if start >= end {
|
||||
di.curChunkHeader = di.curChunkHeader[:0]
|
||||
di.curChunkData = nil
|
||||
di.curChunkNum = chunkNumber
|
||||
di.uncompressed = di.uncompressed[:0]
|
||||
return nil
|
||||
}
|
||||
|
||||
curChunkSize := di.chunkLens[chunkNumber]
|
||||
destChunkDataLoc += start
|
||||
curChunkEnd += end
|
||||
|
||||
// read the number of docs reside in the chunk
|
||||
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
|
||||
if read <= 0 {
|
||||
|
@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
|
|||
chunkMetaLoc := destChunkDataLoc + uint64(read)
|
||||
|
||||
offset := uint64(0)
|
||||
if cap(di.curChunkHeader) < int(numDocs) {
|
||||
di.curChunkHeader = make([]MetaData, int(numDocs))
|
||||
} else {
|
||||
di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
|
||||
}
|
||||
for i := 0; i < int(numDocs); i++ {
|
||||
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||
offset += uint64(read)
|
||||
}
|
||||
|
||||
compressedDataLoc := chunkMetaLoc + offset
|
||||
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
|
||||
dataLength := curChunkEnd - compressedDataLoc
|
||||
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
|
||||
di.curChunkNum = chunkNumber
|
||||
di.uncompressed = di.uncompressed[:0]
|
||||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) visitDocValues(docNum uint64,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
// binary search the term locations for the docNum
|
||||
start, length := di.getDocValueLocs(docNum)
|
||||
if start == math.MaxUint64 || length == math.MaxUint64 {
|
||||
return nil
|
||||
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
|
||||
for i := 0; i < len(di.chunkOffsets); i++ {
|
||||
err := di.loadDvChunk(uint64(i), s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// uncompress the already loaded data
|
||||
uncompressed, err := snappy.Decode(nil, di.curChunkData)
|
||||
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
di.uncompressed = uncompressed
|
||||
|
||||
start := uint64(0)
|
||||
for _, entry := range di.curChunkHeader {
|
||||
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start = entry.DocDvOffset
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueReader) visitDocValues(docNum uint64,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
// binary search the term locations for the docNum
|
||||
start, end := di.getDocValueLocs(docNum)
|
||||
if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
|
||||
return nil
|
||||
}
|
||||
|
||||
var uncompressed []byte
|
||||
var err error
|
||||
// use the uncompressed copy if available
|
||||
if len(di.uncompressed) > 0 {
|
||||
uncompressed = di.uncompressed
|
||||
} else {
|
||||
// uncompress the already loaded data
|
||||
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
di.uncompressed = uncompressed
|
||||
}
|
||||
|
||||
// pick the terms for the given docNum
|
||||
uncompressed = uncompressed[start : start+length]
|
||||
uncompressed = uncompressed[start:end]
|
||||
for {
|
||||
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
|
@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
|
|||
return nil
|
||||
}
|
||||
|
||||
func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
||||
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
||||
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
|
||||
return di.curChunkHeader[i].DocNum >= docNum
|
||||
})
|
||||
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
|
||||
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
|
||||
return ReadDocValueBoundary(i, di.curChunkHeader)
|
||||
}
|
||||
return math.MaxUint64, math.MaxUint64
|
||||
}
|
||||
|
||||
// VisitDocumentFieldTerms is an implementation of the
|
||||
// DocumentFieldTermVisitable interface
|
||||
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
fieldIDPlus1 := uint16(0)
|
||||
ok := true
|
||||
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
|
||||
segment.DocVisitState, error) {
|
||||
dvs, ok := dvsIn.(*docVisitState)
|
||||
if !ok || dvs == nil {
|
||||
dvs = &docVisitState{}
|
||||
} else {
|
||||
if dvs.segment != s {
|
||||
dvs.segment = s
|
||||
dvs.dvrs = nil
|
||||
}
|
||||
}
|
||||
|
||||
var fieldIDPlus1 uint16
|
||||
if dvs.dvrs == nil {
|
||||
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
|
||||
for _, field := range fields {
|
||||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||
continue
|
||||
}
|
||||
fieldID := fieldIDPlus1 - 1
|
||||
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
|
||||
dvIter != nil {
|
||||
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find the chunkNumber where the docValues are stored
|
||||
docInChunk := localDocNum / uint64(s.chunkFactor)
|
||||
|
||||
if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists &&
|
||||
dvIter != nil {
|
||||
// check if the chunk is already loaded
|
||||
if docInChunk != dvIter.curChunkNumber() {
|
||||
err := dvIter.loadDvChunk(docInChunk, localDocNum, s)
|
||||
if err != nil {
|
||||
var dvr *docValueReader
|
||||
for _, field := range fields {
|
||||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||
continue
|
||||
}
|
||||
fieldID := fieldIDPlus1 - 1
|
||||
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
|
||||
// check if the chunk is already loaded
|
||||
if docInChunk != dvr.curChunkNumber() {
|
||||
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
|
||||
if err != nil {
|
||||
return dvs, err
|
||||
}
|
||||
}
|
||||
|
||||
_ = dvIter.visitDocValues(localDocNum, visitor)
|
||||
_ = dvr.visitDocValues(localDocNum, visitor)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return dvs, nil
|
||||
}
|
||||
|
||||
// VisitableDocValueFields returns the list of fields with
|
||||
// persisted doc value terms ready to be visitable using the
|
||||
// VisitDocumentFieldTerms method.
|
||||
func (s *Segment) VisitableDocValueFields() ([]string, error) {
|
||||
var rv []string
|
||||
for fieldID, field := range s.fieldsInv {
|
||||
if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok &&
|
||||
dvIter != nil {
|
||||
rv = append(rv, field)
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
return s.fieldDvNames, nil
|
||||
}
|
||||
|
|
|
@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
|
|||
for i, itr := range rv.itrs {
|
||||
rv.currKs[i], rv.currVs[i] = itr.Current()
|
||||
}
|
||||
rv.updateMatches()
|
||||
if rv.lowK == nil {
|
||||
rv.updateMatches(false)
|
||||
if rv.lowK == nil && len(rv.lowIdxs) == 0 {
|
||||
return rv, vellum.ErrIteratorDone
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
// updateMatches maintains the low key matches based on the currKs
|
||||
func (m *enumerator) updateMatches() {
|
||||
func (m *enumerator) updateMatches(skipEmptyKey bool) {
|
||||
m.lowK = nil
|
||||
m.lowIdxs = m.lowIdxs[:0]
|
||||
m.lowCurr = 0
|
||||
|
||||
for i, key := range m.currKs {
|
||||
if key == nil {
|
||||
if (key == nil && m.currVs[i] == 0) || // in case of empty iterator
|
||||
(len(key) == 0 && skipEmptyKey) { // skip empty keys
|
||||
continue
|
||||
}
|
||||
|
||||
cmp := bytes.Compare(key, m.lowK)
|
||||
if cmp < 0 || m.lowK == nil {
|
||||
if cmp < 0 || len(m.lowIdxs) == 0 {
|
||||
// reached a new low
|
||||
m.lowK = key
|
||||
m.lowIdxs = m.lowIdxs[:0]
|
||||
|
@ -102,9 +103,10 @@ func (m *enumerator) Next() error {
|
|||
}
|
||||
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
|
||||
}
|
||||
m.updateMatches()
|
||||
// can skip any empty keys encountered at this point
|
||||
m.updateMatches(true)
|
||||
}
|
||||
if m.lowK == nil {
|
||||
if m.lowK == nil && len(m.lowIdxs) == 0 {
|
||||
return vellum.ErrIteratorDone
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -18,16 +18,12 @@ import (
|
|||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
|
||||
"github.com/Smerity/govarint"
|
||||
)
|
||||
|
||||
type chunkedIntCoder struct {
|
||||
final []byte
|
||||
maxDocNum uint64
|
||||
chunkSize uint64
|
||||
chunkBuf bytes.Buffer
|
||||
encoder *govarint.Base128Encoder
|
||||
chunkLens []uint64
|
||||
currChunk uint64
|
||||
|
||||
|
@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
|
|||
total := maxDocNum/chunkSize + 1
|
||||
rv := &chunkedIntCoder{
|
||||
chunkSize: chunkSize,
|
||||
maxDocNum: maxDocNum,
|
||||
chunkLens: make([]uint64, total),
|
||||
final: make([]byte, 0, 64),
|
||||
}
|
||||
rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)
|
||||
|
||||
return rv
|
||||
}
|
||||
|
@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
|||
chunk := docNum / c.chunkSize
|
||||
if chunk != c.currChunk {
|
||||
// starting a new chunk
|
||||
if c.encoder != nil {
|
||||
// close out last
|
||||
c.Close()
|
||||
c.chunkBuf.Reset()
|
||||
}
|
||||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
if len(c.buf) < binary.MaxVarintLen64 {
|
||||
c.buf = make([]byte, binary.MaxVarintLen64)
|
||||
}
|
||||
|
||||
for _, val := range vals {
|
||||
_, err := c.encoder.PutU64(val)
|
||||
wb := binary.PutUvarint(c.buf, val)
|
||||
_, err := c.chunkBuf.Write(c.buf[:wb])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
|
||||
chunk := docNum / c.chunkSize
|
||||
if chunk != c.currChunk {
|
||||
// starting a new chunk
|
||||
c.Close()
|
||||
c.chunkBuf.Reset()
|
||||
c.currChunk = chunk
|
||||
}
|
||||
|
||||
_, err := c.chunkBuf.Write(buf)
|
||||
return err
|
||||
}
|
||||
|
||||
// Close indicates you are done calling Add() this allows the final chunk
|
||||
// to be encoded.
|
||||
func (c *chunkedIntCoder) Close() {
|
||||
c.encoder.Close()
|
||||
encodingBytes := c.chunkBuf.Bytes()
|
||||
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
|
||||
c.final = append(c.final, encodingBytes...)
|
||||
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
|
||||
}
|
||||
|
||||
// Write commits all the encoded chunked integers to the provided writer.
|
||||
|
@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
|||
}
|
||||
buf := c.buf
|
||||
|
||||
// write out the number of chunks & each chunkLen
|
||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
||||
for _, chunkLen := range c.chunkLens {
|
||||
n += binary.PutUvarint(buf[n:], uint64(chunkLen))
|
||||
// convert the chunk lengths into chunk offsets
|
||||
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||
|
||||
// write out the number of chunks & each chunk offsets
|
||||
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
|
||||
for _, chunkOffset := range chunkOffsets {
|
||||
n += binary.PutUvarint(buf[n:], chunkOffset)
|
||||
}
|
||||
|
||||
tw, err := w.Write(buf[:n])
|
||||
|
@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
|||
}
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
func (c *chunkedIntCoder) FinalSize() int {
|
||||
return len(c.final)
|
||||
}
|
||||
|
||||
// modifyLengthsToEndOffsets converts the chunk length array
|
||||
// to a chunk offset array. The readChunkBoundary
|
||||
// will figure out the start and end of every chunk from
|
||||
// these offsets. Starting offset of i'th index is stored
|
||||
// in i-1'th position except for 0'th index and ending offset
|
||||
// is stored at i'th index position.
|
||||
// For 0'th element, starting position is always zero.
|
||||
// eg:
|
||||
// Lens -> 5 5 5 5 => 5 10 15 20
|
||||
// Lens -> 0 5 0 5 => 0 5 5 10
|
||||
// Lens -> 0 0 0 5 => 0 0 0 5
|
||||
// Lens -> 5 0 0 0 => 5 5 5 5
|
||||
// Lens -> 0 5 0 0 => 0 5 5 5
|
||||
// Lens -> 0 0 5 0 => 0 0 5 5
|
||||
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
|
||||
var runningOffset uint64
|
||||
var index, i int
|
||||
for i = 1; i <= len(lengths); i++ {
|
||||
runningOffset += lengths[i-1]
|
||||
lengths[index] = runningOffset
|
||||
index++
|
||||
}
|
||||
return lengths
|
||||
}
|
||||
|
||||
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
|
||||
var start uint64
|
||||
if chunk > 0 {
|
||||
start = offsets[chunk-1]
|
||||
}
|
||||
return start, offsets[chunk]
|
||||
}
|
||||
|
|
|
@ -24,11 +24,13 @@ import (
|
|||
"sort"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
seg "github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var DefaultFileMergerBufferSize = 1024 * 1024
|
||||
|
||||
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
||||
|
||||
// Merge takes a slice of zap segments and bit masks describing which
|
||||
|
@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
|||
// remaining data. This new segment is built at the specified path,
|
||||
// with the provided chunkFactor.
|
||||
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
||||
chunkFactor uint32) ([][]uint64, error) {
|
||||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||
[][]uint64, uint64, error) {
|
||||
segmentBases := make([]*SegmentBase, len(segments))
|
||||
for segmenti, segment := range segments {
|
||||
segmentBases[segmenti] = &segment.SegmentBase
|
||||
}
|
||||
|
||||
return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s)
|
||||
}
|
||||
|
||||
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
|
||||
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||
[][]uint64, uint64, error) {
|
||||
flag := os.O_RDWR | os.O_CREATE
|
||||
|
||||
f, err := os.OpenFile(path, flag, 0600)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
cleanup := func() {
|
||||
|
@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
|||
_ = os.Remove(path)
|
||||
}
|
||||
|
||||
segmentBases := make([]*SegmentBase, len(segments))
|
||||
for segmenti, segment := range segments {
|
||||
segmentBases[segmenti] = &segment.SegmentBase
|
||||
}
|
||||
|
||||
// buffer the output
|
||||
br := bufio.NewWriter(f)
|
||||
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
|
||||
|
||||
// wrap it for counting (tracking offsets)
|
||||
cr := NewCountHashWriter(br)
|
||||
cr := NewCountHashWriterWithStatsReporter(br, s)
|
||||
|
||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
|
||||
MergeToWriter(segmentBases, drops, chunkFactor, cr)
|
||||
MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||
docValueOffset, chunkFactor, cr.Sum32(), cr)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = br.Flush()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = f.Sync()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
err = f.Close()
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return newDocNums, nil
|
||||
return newDocNums, uint64(cr.Count()), nil
|
||||
}
|
||||
|
||||
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||
chunkFactor uint32, cr *CountHashWriter) (
|
||||
chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) (
|
||||
newDocNums [][]uint64,
|
||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
||||
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
|
||||
|
@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
fieldsMap = mapFields(fieldsInv)
|
||||
|
||||
numDocs = computeNewDocCount(segments, drops)
|
||||
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed
|
||||
}
|
||||
|
||||
if numDocs > 0 {
|
||||
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
||||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr)
|
||||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
}
|
||||
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
|
||||
newDocNums, numDocs, chunkFactor, cr)
|
||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
|
||||
fieldsInv, fieldsMap, fieldsSame,
|
||||
newDocNums, numDocs, chunkFactor, cr, closeCh)
|
||||
if err != nil {
|
||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||
}
|
||||
|
@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
|
|||
}
|
||||
|
||||
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64,
|
||||
newSegDocCount uint64, chunkFactor uint32,
|
||||
w *CountHashWriter) ([]uint64, uint64, error) {
|
||||
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
|
||||
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
|
||||
w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) {
|
||||
|
||||
var bufReuse bytes.Buffer
|
||||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
||||
var bufLoc []uint64
|
||||
|
||||
|
@ -168,36 +182,38 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
var postItr *PostingsIterator
|
||||
|
||||
rv := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocs := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocsStart := make([]uint64, len(fieldsInv))
|
||||
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
|
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
|
||||
// docTermMap is keyed by docNum, where the array impl provides
|
||||
// better memory usage behavior than a sparse-friendlier hashmap
|
||||
// for when docs have much structural similarity (i.e., every doc
|
||||
// has a given field)
|
||||
var docTermMap [][]byte
|
||||
|
||||
var vellumBuf bytes.Buffer
|
||||
|
||||
// for each field
|
||||
for fieldID, fieldName := range fieldsInv {
|
||||
if fieldID != 0 {
|
||||
vellumBuf.Reset()
|
||||
}
|
||||
newVellum, err := vellum.New(&vellumBuf, nil)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
newRoaring := roaring.NewBitmap()
|
||||
|
||||
// for each field
|
||||
for fieldID, fieldName := range fieldsInv {
|
||||
|
||||
// collect FST iterators from all active segments for this field
|
||||
var newDocNums [][]uint64
|
||||
var drops []*roaring.Bitmap
|
||||
var dicts []*Dictionary
|
||||
var itrs []vellum.Iterator
|
||||
|
||||
var segmentsInFocus []*SegmentBase
|
||||
|
||||
for segmentI, segment := range segments {
|
||||
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
dict, err2 := segment.dictionary(fieldName)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
|
@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
}
|
||||
if itr != nil {
|
||||
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
|
||||
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
|
||||
drops = append(drops, dropsIn[segmentI])
|
||||
} else {
|
||||
drops = append(drops, nil)
|
||||
}
|
||||
dicts = append(dicts, dict)
|
||||
itrs = append(itrs, itr)
|
||||
segmentsInFocus = append(segmentsInFocus, segment)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if uint64(cap(docTermMap)) < newSegDocCount {
|
||||
docTermMap = make([][]byte, newSegDocCount)
|
||||
} else {
|
||||
docTermMap = docTermMap[0:newSegDocCount]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
var prevTerm []byte
|
||||
|
||||
newRoaring := roaring.NewBitmap()
|
||||
newRoaringLocs := roaring.NewBitmap()
|
||||
newRoaring.Clear()
|
||||
|
||||
finishTerm := func(term []byte) error {
|
||||
if term == nil {
|
||||
return nil
|
||||
var lastDocNum, lastFreq, lastNorm uint64
|
||||
|
||||
// determines whether to use "1-hit" encoding optimization
|
||||
// when a term appears in only 1 doc, with no loc info,
|
||||
// has freq of 1, and the docNum fits into 31-bits
|
||||
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
|
||||
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
|
||||
docNum := uint64(newRoaring.Minimum())
|
||||
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
|
||||
return true, docNum, lastNorm
|
||||
}
|
||||
}
|
||||
return false, 0, 0
|
||||
}
|
||||
|
||||
finishTerm := func(term []byte) error {
|
||||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
if newRoaring.GetCardinality() > 0 {
|
||||
// this field/term actually has hits in the new segment, lets write it down
|
||||
freqOffset := uint64(w.Count())
|
||||
_, err := tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
locOffset := uint64(w.Count())
|
||||
_, err = locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
postingLocOffset := uint64(w.Count())
|
||||
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
postingOffset := uint64(w.Count())
|
||||
|
||||
// write out the start of the term info
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the start of the loc info
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// write out the start of the posting locs
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
|
||||
postingsOffset, err := writePostings(newRoaring,
|
||||
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = newVellum.Insert(term, postingOffset)
|
||||
if postingsOffset > 0 {
|
||||
err = newVellum.Insert(term, postingsOffset)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
newRoaring = roaring.NewBitmap()
|
||||
newRoaringLocs = roaring.NewBitmap()
|
||||
newRoaring.Clear()
|
||||
|
||||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
|
||||
lastDocNum = 0
|
||||
lastFreq = 0
|
||||
lastNorm = 0
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -301,74 +291,47 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
term, itrI, postingsOffset := enumerator.Current()
|
||||
|
||||
if !bytes.Equal(prevTerm, term) {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
// if the term changed, write out the info collected
|
||||
// for the previous term
|
||||
err2 := finishTerm(prevTerm)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
err = finishTerm(prevTerm)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
var err2 error
|
||||
postings, err2 = dicts[itrI].postingsListFromOffset(
|
||||
postings, err = dicts[itrI].postingsListFromOffset(
|
||||
postingsOffset, drops[itrI], postings)
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
}
|
||||
|
||||
newDocNumsI := newDocNums[itrI]
|
||||
|
||||
postItr = postings.iterator(postItr)
|
||||
next, err2 := postItr.Next()
|
||||
for next != nil && err2 == nil {
|
||||
hitNewDocNum := newDocNumsI[next.Number()]
|
||||
if hitNewDocNum == docDropped {
|
||||
return nil, 0, fmt.Errorf("see hit with dropped doc num")
|
||||
}
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
// encode norm bits
|
||||
norm := next.Norm()
|
||||
normBits := math.Float32bits(float32(norm))
|
||||
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
locs := next.Locations()
|
||||
if len(locs) > 0 {
|
||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
||||
for _, loc := range locs {
|
||||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
|
||||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
|
||||
|
||||
postItr = postings.iterator(true, true, true, postItr)
|
||||
|
||||
if fieldsSame {
|
||||
// can optimize by copying freq/norm/loc bytes directly
|
||||
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
|
||||
term, postItr, newDocNums[itrI], newRoaring,
|
||||
tfEncoder, locEncoder)
|
||||
} else {
|
||||
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
|
||||
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
|
||||
tfEncoder, locEncoder, bufLoc)
|
||||
}
|
||||
args := bufLoc[0:5]
|
||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||
args[1] = loc.Pos()
|
||||
args[2] = loc.Start()
|
||||
args[3] = loc.End()
|
||||
args[4] = uint64(len(loc.ArrayPositions()))
|
||||
args = append(args, loc.ArrayPositions()...)
|
||||
err = locEncoder.Add(hitNewDocNum, args...)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
docTermMap[hitNewDocNum] =
|
||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
||||
|
||||
next, err2 = postItr.Next()
|
||||
}
|
||||
if err2 != nil {
|
||||
return nil, 0, err2
|
||||
}
|
||||
|
||||
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
|
||||
prevTerm = append(prevTerm, term...)
|
||||
|
||||
err = enumerator.Next()
|
||||
}
|
||||
if err != nil && err != vellum.ErrIteratorDone {
|
||||
if err != vellum.ErrIteratorDone {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
|
@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
|
||||
rv[fieldID] = dictOffset
|
||||
|
||||
// get the field doc value offset (start)
|
||||
fieldDvLocsStart[fieldID] = uint64(w.Count())
|
||||
|
||||
// update the field doc values
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true)
|
||||
|
||||
fdvReadersAvailable := false
|
||||
var dvIterClone *docValueReader
|
||||
for segmentI, segment := range segmentsInFocus {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return nil, 0, seg.ErrClosed
|
||||
}
|
||||
|
||||
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
|
||||
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
|
||||
dvIter != nil {
|
||||
fdvReadersAvailable = true
|
||||
dvIterClone = dvIter.cloneInto(dvIterClone)
|
||||
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
|
||||
if newDocNums[segmentI][docNum] == docDropped {
|
||||
return nil
|
||||
}
|
||||
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if fdvReadersAvailable {
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// get the field doc value offset
|
||||
fieldDvLocs[fieldID] = uint64(w.Count())
|
||||
|
||||
// persist the doc value details for this field
|
||||
_, err = fdvEncoder.Write(w)
|
||||
_, err = fdvEncoder.Write()
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
// get the field doc value offset (end)
|
||||
fieldDvLocsEnd[fieldID] = uint64(w.Count())
|
||||
} else {
|
||||
fieldDvLocsStart[fieldID] = fieldNotUninverted
|
||||
fieldDvLocsEnd[fieldID] = fieldNotUninverted
|
||||
}
|
||||
|
||||
// reset vellum buffer and vellum builder
|
||||
vellumBuf.Reset()
|
||||
err = newVellum.Reset(&vellumBuf)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
|||
fieldDvLocsOffset := uint64(w.Count())
|
||||
|
||||
buf := bufMaxVarintLen64
|
||||
for _, offset := range fieldDvLocs {
|
||||
n := binary.PutUvarint(buf, uint64(offset))
|
||||
for i := 0; i < len(fieldDvLocsStart); i++ {
|
||||
n := binary.PutUvarint(buf, fieldDvLocsStart[i])
|
||||
_, err := w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
|
||||
_, err = w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, fieldDvLocsOffset, nil
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
|
||||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
|
||||
next, err := postItr.Next()
|
||||
for next != nil && err == nil {
|
||||
hitNewDocNum := newDocNums[next.Number()]
|
||||
if hitNewDocNum == docDropped {
|
||||
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
|
||||
}
|
||||
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
|
||||
nextFreq := next.Frequency()
|
||||
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
|
||||
|
||||
locs := next.Locations()
|
||||
|
||||
err = tfEncoder.Add(hitNewDocNum,
|
||||
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
if len(locs) > 0 {
|
||||
numBytesLocs := 0
|
||||
for _, loc := range locs {
|
||||
ap := loc.ArrayPositions()
|
||||
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
|
||||
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap)
|
||||
}
|
||||
|
||||
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
for _, loc := range locs {
|
||||
ap := loc.ArrayPositions()
|
||||
if cap(bufLoc) < 5+len(ap) {
|
||||
bufLoc = make([]uint64, 0, 5+len(ap))
|
||||
}
|
||||
args := bufLoc[0:5]
|
||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||
args[1] = loc.Pos()
|
||||
args[2] = loc.Start()
|
||||
args[3] = loc.End()
|
||||
args[4] = uint64(len(ap))
|
||||
args = append(args, ap...)
|
||||
err = locEncoder.Add(hitNewDocNum, args...)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
next, err = postItr.Next()
|
||||
}
|
||||
|
||||
return lastDocNum, lastFreq, lastNorm, bufLoc, err
|
||||
}
|
||||
|
||||
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
|
||||
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) (
|
||||
lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) {
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err :=
|
||||
postItr.nextBytes()
|
||||
for err == nil && len(nextFreqNormBytes) > 0 {
|
||||
hitNewDocNum := newDocNums[nextDocNum]
|
||||
if hitNewDocNum == docDropped {
|
||||
return 0, 0, 0, fmt.Errorf("see hit with dropped doc num")
|
||||
}
|
||||
|
||||
newRoaring.Add(uint32(hitNewDocNum))
|
||||
err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes)
|
||||
if err != nil {
|
||||
return 0, 0, 0, err
|
||||
}
|
||||
|
||||
if len(nextLocBytes) > 0 {
|
||||
err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes)
|
||||
if err != nil {
|
||||
return 0, 0, 0, err
|
||||
}
|
||||
}
|
||||
|
||||
lastDocNum = hitNewDocNum
|
||||
lastFreq = nextFreq
|
||||
lastNorm = nextNorm
|
||||
|
||||
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err =
|
||||
postItr.nextBytes()
|
||||
}
|
||||
|
||||
return lastDocNum, lastFreq, lastNorm, err
|
||||
}
|
||||
|
||||
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
|
||||
use1HitEncoding func(uint64) (bool, uint64, uint64),
|
||||
w *CountHashWriter, bufMaxVarintLen64 []byte) (
|
||||
offset uint64, err error) {
|
||||
termCardinality := postings.GetCardinality()
|
||||
if termCardinality <= 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
if use1HitEncoding != nil {
|
||||
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
|
||||
if encodeAs1Hit {
|
||||
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil
|
||||
}
|
||||
}
|
||||
|
||||
tfOffset := uint64(w.Count())
|
||||
_, err = tfEncoder.Write(w)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
locOffset := uint64(w.Count())
|
||||
_, err = locEncoder.Write(w)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
postingsOffset := uint64(w.Count())
|
||||
|
||||
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return postingsOffset, nil
|
||||
}
|
||||
|
||||
type varintEncoder func(uint64) (int, error)
|
||||
|
||||
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
|
||||
w *CountHashWriter) (uint64, [][]uint64, error) {
|
||||
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
|
||||
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
||||
|
||||
var newDocNum uint64
|
||||
|
||||
var curr int
|
||||
var metaBuf bytes.Buffer
|
||||
var data, compressed []byte
|
||||
|
||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
||||
var metaBuf bytes.Buffer
|
||||
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||
metaEncode := func(val uint64) (int, error) {
|
||||
wb := binary.PutUvarint(varBuf, val)
|
||||
return metaBuf.Write(varBuf[:wb])
|
||||
}
|
||||
|
||||
vals := make([][][]byte, len(fieldsInv))
|
||||
typs := make([][]byte, len(fieldsInv))
|
||||
poss := make([][][]uint64, len(fieldsInv))
|
||||
|
||||
var posBuf []uint64
|
||||
|
||||
docNumOffsets := make([]uint64, newSegDocCount)
|
||||
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
defer visitDocumentCtxPool.Put(vdc)
|
||||
|
||||
// for each segment
|
||||
for segI, segment := range segments {
|
||||
// check for the closure in meantime
|
||||
if isClosed(closeCh) {
|
||||
return 0, nil, seg.ErrClosed
|
||||
}
|
||||
|
||||
segNewDocNums := make([]uint64, segment.numDocs)
|
||||
|
||||
dropsI := drops[segI]
|
||||
|
@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
curr = 0
|
||||
metaBuf.Reset()
|
||||
data = data[:0]
|
||||
compressed = compressed[:0]
|
||||
|
||||
posTemp := posBuf
|
||||
|
||||
// collect all the data
|
||||
for i := 0; i < len(fieldsInv); i++ {
|
||||
|
@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
typs[i] = typs[i][:0]
|
||||
poss[i] = poss[i][:0]
|
||||
}
|
||||
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
fieldID := int(fieldsMap[field]) - 1
|
||||
vals[fieldID] = append(vals[fieldID], value)
|
||||
typs[fieldID] = append(typs[fieldID], typ)
|
||||
poss[fieldID] = append(poss[fieldID], pos)
|
||||
|
||||
// copy array positions to preserve them beyond the scope of this callback
|
||||
var curPos []uint64
|
||||
if len(pos) > 0 {
|
||||
if cap(posTemp) < len(pos) {
|
||||
posBuf = make([]uint64, len(pos)*len(fieldsInv))
|
||||
posTemp = posBuf
|
||||
}
|
||||
curPos = posTemp[0:len(pos)]
|
||||
copy(curPos, pos)
|
||||
posTemp = posTemp[len(pos):]
|
||||
}
|
||||
poss[fieldID] = append(poss[fieldID], curPos)
|
||||
|
||||
return true
|
||||
})
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// now walk the fields in order
|
||||
for fieldID := range fieldsInv {
|
||||
storedFieldValues := vals[int(fieldID)]
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := vals[uint16(0)][0]
|
||||
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
stf := typs[int(fieldID)]
|
||||
spf := poss[int(fieldID)]
|
||||
// now walk the non-"_id" fields in order
|
||||
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
|
||||
storedFieldValues := vals[fieldID]
|
||||
|
||||
stf := typs[fieldID]
|
||||
spf := poss[fieldID]
|
||||
|
||||
var err2 error
|
||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
||||
storedFieldValues, stf, spf, curr, metaEncode, data)
|
||||
if err2 != nil {
|
||||
return 0, nil, err2
|
||||
}
|
||||
}
|
||||
|
||||
metaEncoder.Close()
|
||||
metaBytes := metaBuf.Bytes()
|
||||
|
||||
compressed = snappy.Encode(compressed, data)
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||
|
||||
// record where we're about to start writing
|
||||
docNumOffsets[newDocNum] = uint64(w.Count())
|
||||
|
||||
// write out the meta len and compressed data len
|
||||
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
||||
_, err = writeUvarints(w,
|
||||
uint64(len(metaBytes)),
|
||||
uint64(len(idFieldVal)+len(compressed)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
|||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// now write the _id field val (counted as part of the 'compressed' data)
|
||||
_, err = w.Write(idFieldVal)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
// now write the compressed data
|
||||
_, err = w.Write(compressed)
|
||||
if err != nil {
|
||||
|
@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) {
|
|||
|
||||
return fieldsSame, rv
|
||||
}
|
||||
|
||||
func isClosed(closeCh chan struct{}) bool {
|
||||
select {
|
||||
case <-closeCh:
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
|
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
|
@ -0,0 +1,826 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"math"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/couchbase/vellum"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var NewSegmentBufferNumResultsBump int = 100
|
||||
var NewSegmentBufferNumResultsFactor float64 = 1.0
|
||||
var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0
|
||||
|
||||
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||
// SegmentBase from analysis results
|
||||
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||
chunkFactor uint32) (*SegmentBase, uint64, error) {
|
||||
s := interimPool.Get().(*interim)
|
||||
|
||||
var br bytes.Buffer
|
||||
if s.lastNumDocs > 0 {
|
||||
// use previous results to initialize the buf with an estimate
|
||||
// size, but note that the interim instance comes from a
|
||||
// global interimPool, so multiple scorch instances indexing
|
||||
// different docs can lead to low quality estimates
|
||||
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
|
||||
NewSegmentBufferNumResultsFactor)
|
||||
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
|
||||
NewSegmentBufferAvgBytesPerDocFactor)
|
||||
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
|
||||
}
|
||||
|
||||
s.results = results
|
||||
s.chunkFactor = chunkFactor
|
||||
s.w = NewCountHashWriter(&br)
|
||||
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
||||
err := s.convert()
|
||||
if err != nil {
|
||||
return nil, uint64(0), err
|
||||
}
|
||||
|
||||
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
|
||||
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
||||
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
||||
|
||||
if err == nil && s.reset() == nil {
|
||||
s.lastNumDocs = len(results)
|
||||
s.lastOutSize = len(br.Bytes())
|
||||
interimPool.Put(s)
|
||||
}
|
||||
|
||||
return sb, uint64(len(br.Bytes())), err
|
||||
}
|
||||
|
||||
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
|
||||
|
||||
// interim holds temporary working data used while converting from
|
||||
// analysis results to a zap-encoded segment
|
||||
type interim struct {
|
||||
results []*index.AnalysisResult
|
||||
|
||||
chunkFactor uint32
|
||||
|
||||
w *CountHashWriter
|
||||
|
||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||
// name -> field id + 1
|
||||
FieldsMap map[string]uint16
|
||||
|
||||
// FieldsInv is the inverse of FieldsMap
|
||||
// field id -> name
|
||||
FieldsInv []string
|
||||
|
||||
// Term dictionaries for each field
|
||||
// field id -> term -> postings list id + 1
|
||||
Dicts []map[string]uint64
|
||||
|
||||
// Terms for each field, where terms are sorted ascending
|
||||
// field id -> []term
|
||||
DictKeys [][]string
|
||||
|
||||
// Fields whose IncludeDocValues is true
|
||||
// field id -> bool
|
||||
IncludeDocValues []bool
|
||||
|
||||
// postings id -> bitmap of docNums
|
||||
Postings []*roaring.Bitmap
|
||||
|
||||
// postings id -> freq/norm's, one for each docNum in postings
|
||||
FreqNorms [][]interimFreqNorm
|
||||
freqNormsBacking []interimFreqNorm
|
||||
|
||||
// postings id -> locs, one for each freq
|
||||
Locs [][]interimLoc
|
||||
locsBacking []interimLoc
|
||||
|
||||
numTermsPerPostingsList []int // key is postings list id
|
||||
numLocsPerPostingsList []int // key is postings list id
|
||||
|
||||
builder *vellum.Builder
|
||||
builderBuf bytes.Buffer
|
||||
|
||||
metaBuf bytes.Buffer
|
||||
|
||||
tmp0 []byte
|
||||
tmp1 []byte
|
||||
|
||||
lastNumDocs int
|
||||
lastOutSize int
|
||||
}
|
||||
|
||||
func (s *interim) reset() (err error) {
|
||||
s.results = nil
|
||||
s.chunkFactor = 0
|
||||
s.w = nil
|
||||
s.FieldsMap = nil
|
||||
s.FieldsInv = nil
|
||||
for i := range s.Dicts {
|
||||
s.Dicts[i] = nil
|
||||
}
|
||||
s.Dicts = s.Dicts[:0]
|
||||
for i := range s.DictKeys {
|
||||
s.DictKeys[i] = s.DictKeys[i][:0]
|
||||
}
|
||||
s.DictKeys = s.DictKeys[:0]
|
||||
for i := range s.IncludeDocValues {
|
||||
s.IncludeDocValues[i] = false
|
||||
}
|
||||
s.IncludeDocValues = s.IncludeDocValues[:0]
|
||||
for _, idn := range s.Postings {
|
||||
idn.Clear()
|
||||
}
|
||||
s.Postings = s.Postings[:0]
|
||||
s.FreqNorms = s.FreqNorms[:0]
|
||||
for i := range s.freqNormsBacking {
|
||||
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||
}
|
||||
s.freqNormsBacking = s.freqNormsBacking[:0]
|
||||
s.Locs = s.Locs[:0]
|
||||
for i := range s.locsBacking {
|
||||
s.locsBacking[i] = interimLoc{}
|
||||
}
|
||||
s.locsBacking = s.locsBacking[:0]
|
||||
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
|
||||
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
|
||||
s.builderBuf.Reset()
|
||||
if s.builder != nil {
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
}
|
||||
s.metaBuf.Reset()
|
||||
s.tmp0 = s.tmp0[:0]
|
||||
s.tmp1 = s.tmp1[:0]
|
||||
s.lastNumDocs = 0
|
||||
s.lastOutSize = 0
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *interim) grabBuf(size int) []byte {
|
||||
buf := s.tmp0
|
||||
if cap(buf) < size {
|
||||
buf = make([]byte, size)
|
||||
s.tmp0 = buf
|
||||
}
|
||||
return buf[0:size]
|
||||
}
|
||||
|
||||
type interimStoredField struct {
|
||||
vals [][]byte
|
||||
typs []byte
|
||||
arrayposs [][]uint64 // array positions
|
||||
}
|
||||
|
||||
type interimFreqNorm struct {
|
||||
freq uint64
|
||||
norm float32
|
||||
numLocs int
|
||||
}
|
||||
|
||||
type interimLoc struct {
|
||||
fieldID uint16
|
||||
pos uint64
|
||||
start uint64
|
||||
end uint64
|
||||
arrayposs []uint64
|
||||
}
|
||||
|
||||
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||
s.FieldsMap = map[string]uint16{}
|
||||
|
||||
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||
|
||||
for _, result := range s.results {
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
for _, field := range result.Document.Fields {
|
||||
s.getOrDefineField(field.Name())
|
||||
}
|
||||
}
|
||||
|
||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||
|
||||
for fieldID, fieldName := range s.FieldsInv {
|
||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||
}
|
||||
|
||||
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
|
||||
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
|
||||
} else {
|
||||
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
s.prepareDicts()
|
||||
|
||||
for _, dict := range s.DictKeys {
|
||||
sort.Strings(dict)
|
||||
}
|
||||
|
||||
s.processDocuments()
|
||||
|
||||
storedIndexOffset, err := s.writeStoredFields()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
var fdvIndexOffset uint64
|
||||
var dictOffsets []uint64
|
||||
|
||||
if len(s.results) > 0 {
|
||||
fdvIndexOffset, dictOffsets, err = s.writeDicts()
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
} else {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
}
|
||||
|
||||
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
|
||||
if err != nil {
|
||||
return 0, 0, 0, nil, err
|
||||
}
|
||||
|
||||
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func (s *interim) getOrDefineField(fieldName string) int {
|
||||
fieldIDPlus1, exists := s.FieldsMap[fieldName]
|
||||
if !exists {
|
||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||
s.FieldsMap[fieldName] = fieldIDPlus1
|
||||
s.FieldsInv = append(s.FieldsInv, fieldName)
|
||||
|
||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||
|
||||
n := len(s.DictKeys)
|
||||
if n < cap(s.DictKeys) {
|
||||
s.DictKeys = s.DictKeys[:n+1]
|
||||
s.DictKeys[n] = s.DictKeys[n][:0]
|
||||
} else {
|
||||
s.DictKeys = append(s.DictKeys, []string(nil))
|
||||
}
|
||||
}
|
||||
|
||||
return int(fieldIDPlus1 - 1)
|
||||
}
|
||||
|
||||
// fill Dicts and DictKeys from analysis results
|
||||
func (s *interim) prepareDicts() {
|
||||
var pidNext int
|
||||
|
||||
var totTFs int
|
||||
var totLocs int
|
||||
|
||||
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||
dict := s.Dicts[fieldID]
|
||||
dictKeys := s.DictKeys[fieldID]
|
||||
|
||||
for term, tf := range tfs {
|
||||
pidPlus1, exists := dict[term]
|
||||
if !exists {
|
||||
pidNext++
|
||||
pidPlus1 = uint64(pidNext)
|
||||
|
||||
dict[term] = pidPlus1
|
||||
dictKeys = append(dictKeys, term)
|
||||
|
||||
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
|
||||
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
|
||||
}
|
||||
|
||||
pid := pidPlus1 - 1
|
||||
|
||||
s.numTermsPerPostingsList[pid] += 1
|
||||
s.numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||
|
||||
totLocs += len(tf.Locations)
|
||||
}
|
||||
|
||||
totTFs += len(tfs)
|
||||
|
||||
s.DictKeys[fieldID] = dictKeys
|
||||
}
|
||||
|
||||
for _, result := range s.results {
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
_, tf := field.Analyze()
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, tf)
|
||||
}
|
||||
}
|
||||
|
||||
numPostingsLists := pidNext
|
||||
|
||||
if cap(s.Postings) >= numPostingsLists {
|
||||
s.Postings = s.Postings[:numPostingsLists]
|
||||
} else {
|
||||
postings := make([]*roaring.Bitmap, numPostingsLists)
|
||||
copy(postings, s.Postings[:cap(s.Postings)])
|
||||
for i := 0; i < numPostingsLists; i++ {
|
||||
if postings[i] == nil {
|
||||
postings[i] = roaring.New()
|
||||
}
|
||||
}
|
||||
s.Postings = postings
|
||||
}
|
||||
|
||||
if cap(s.FreqNorms) >= numPostingsLists {
|
||||
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||
} else {
|
||||
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.freqNormsBacking) >= totTFs {
|
||||
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
|
||||
} else {
|
||||
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
|
||||
}
|
||||
|
||||
freqNormsBacking := s.freqNormsBacking
|
||||
for pid, numTerms := range s.numTermsPerPostingsList {
|
||||
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
||||
freqNormsBacking = freqNormsBacking[numTerms:]
|
||||
}
|
||||
|
||||
if cap(s.Locs) >= numPostingsLists {
|
||||
s.Locs = s.Locs[:numPostingsLists]
|
||||
} else {
|
||||
s.Locs = make([][]interimLoc, numPostingsLists)
|
||||
}
|
||||
|
||||
if cap(s.locsBacking) >= totLocs {
|
||||
s.locsBacking = s.locsBacking[:totLocs]
|
||||
} else {
|
||||
s.locsBacking = make([]interimLoc, totLocs)
|
||||
}
|
||||
|
||||
locsBacking := s.locsBacking
|
||||
for pid, numLocs := range s.numLocsPerPostingsList {
|
||||
s.Locs[pid] = locsBacking[0:0]
|
||||
locsBacking = locsBacking[numLocs:]
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocuments() {
|
||||
numFields := len(s.FieldsInv)
|
||||
reuseFieldLens := make([]int, numFields)
|
||||
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||
reuseFieldLens[i] = 0
|
||||
reuseFieldTFs[i] = nil
|
||||
}
|
||||
|
||||
s.processDocument(uint64(docNum), result,
|
||||
reuseFieldLens, reuseFieldTFs)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) processDocument(docNum uint64,
|
||||
result *index.AnalysisResult,
|
||||
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
|
||||
visitField := func(fieldID uint16, fieldName string,
|
||||
ln int, tf analysis.TokenFrequencies) {
|
||||
fieldLens[fieldID] += ln
|
||||
|
||||
existingFreqs := fieldTFs[fieldID]
|
||||
if existingFreqs != nil {
|
||||
existingFreqs.MergeAll(fieldName, tf)
|
||||
} else {
|
||||
fieldTFs[fieldID] = tf
|
||||
}
|
||||
}
|
||||
|
||||
// walk each composite field
|
||||
for _, field := range result.Document.CompositeFields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln, tf := field.Analyze()
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// walk each field
|
||||
for i, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
ln := result.Length[i]
|
||||
tf := result.Analyzed[i]
|
||||
visitField(fieldID, field.Name(), ln, tf)
|
||||
}
|
||||
|
||||
// now that it's been rolled up into fieldTFs, walk that
|
||||
for fieldID, tfs := range fieldTFs {
|
||||
dict := s.Dicts[fieldID]
|
||||
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
|
||||
|
||||
for term, tf := range tfs {
|
||||
pid := dict[term] - 1
|
||||
bs := s.Postings[pid]
|
||||
bs.Add(uint32(docNum))
|
||||
|
||||
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
||||
interimFreqNorm{
|
||||
freq: uint64(tf.Frequency()),
|
||||
norm: norm,
|
||||
numLocs: len(tf.Locations),
|
||||
})
|
||||
|
||||
if len(tf.Locations) > 0 {
|
||||
locs := s.Locs[pid]
|
||||
|
||||
for _, loc := range tf.Locations {
|
||||
var locf = uint16(fieldID)
|
||||
if loc.Field != "" {
|
||||
locf = uint16(s.getOrDefineField(loc.Field))
|
||||
}
|
||||
var arrayposs []uint64
|
||||
if len(loc.ArrayPositions) > 0 {
|
||||
arrayposs = loc.ArrayPositions
|
||||
}
|
||||
locs = append(locs, interimLoc{
|
||||
fieldID: locf,
|
||||
pos: uint64(loc.Position),
|
||||
start: uint64(loc.Start),
|
||||
end: uint64(loc.End),
|
||||
arrayposs: arrayposs,
|
||||
})
|
||||
}
|
||||
|
||||
s.Locs[pid] = locs
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *interim) writeStoredFields() (
|
||||
storedIndexOffset uint64, err error) {
|
||||
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||
metaEncode := func(val uint64) (int, error) {
|
||||
wb := binary.PutUvarint(varBuf, val)
|
||||
return s.metaBuf.Write(varBuf[:wb])
|
||||
}
|
||||
|
||||
data, compressed := s.tmp0[:0], s.tmp1[:0]
|
||||
defer func() { s.tmp0, s.tmp1 = data, compressed }()
|
||||
|
||||
// keyed by docNum
|
||||
docStoredOffsets := make([]uint64, len(s.results))
|
||||
|
||||
// keyed by fieldID, for the current doc in the loop
|
||||
docStoredFields := map[uint16]interimStoredField{}
|
||||
|
||||
for docNum, result := range s.results {
|
||||
for fieldID := range docStoredFields { // reset for next doc
|
||||
delete(docStoredFields, fieldID)
|
||||
}
|
||||
|
||||
for _, field := range result.Document.Fields {
|
||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||
|
||||
opts := field.Options()
|
||||
|
||||
if opts.IsStored() {
|
||||
isf := docStoredFields[fieldID]
|
||||
isf.vals = append(isf.vals, field.Value())
|
||||
isf.typs = append(isf.typs, encodeFieldType(field))
|
||||
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
|
||||
docStoredFields[fieldID] = isf
|
||||
}
|
||||
|
||||
if opts.IncludeDocValues() {
|
||||
s.IncludeDocValues[fieldID] = true
|
||||
}
|
||||
}
|
||||
|
||||
var curr int
|
||||
|
||||
s.metaBuf.Reset()
|
||||
data = data[:0]
|
||||
|
||||
// _id field special case optimizes ExternalID() lookups
|
||||
idFieldVal := docStoredFields[uint16(0)].vals[0]
|
||||
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
// handle non-"_id" fields
|
||||
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
|
||||
isf, exists := docStoredFields[uint16(fieldID)]
|
||||
if exists {
|
||||
curr, data, err = persistStoredFieldValues(
|
||||
fieldID, isf.vals, isf.typs, isf.arrayposs,
|
||||
curr, metaEncode, data)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
metaBytes := s.metaBuf.Bytes()
|
||||
|
||||
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||
|
||||
docStoredOffsets[docNum] = uint64(s.w.Count())
|
||||
|
||||
_, err := writeUvarints(s.w,
|
||||
uint64(len(metaBytes)),
|
||||
uint64(len(idFieldVal)+len(compressed)))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(metaBytes)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(idFieldVal)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
_, err = s.w.Write(compressed)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
storedIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for _, docStoredOffset := range docStoredOffsets {
|
||||
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
}
|
||||
|
||||
return storedIndexOffset, nil
|
||||
}
|
||||
|
||||
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
|
||||
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||
|
||||
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
|
||||
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
|
||||
|
||||
buf := s.grabBuf(binary.MaxVarintLen64)
|
||||
|
||||
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
|
||||
|
||||
var docTermMap [][]byte
|
||||
|
||||
if s.builder == nil {
|
||||
s.builder, err = vellum.New(&s.builderBuf, nil)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
for fieldID, terms := range s.DictKeys {
|
||||
if cap(docTermMap) < len(s.results) {
|
||||
docTermMap = make([][]byte, len(s.results))
|
||||
} else {
|
||||
docTermMap = docTermMap[0:len(s.results)]
|
||||
for docNum := range docTermMap { // reset the docTermMap
|
||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||
}
|
||||
}
|
||||
|
||||
dict := s.Dicts[fieldID]
|
||||
|
||||
for _, term := range terms { // terms are already sorted
|
||||
pid := dict[term] - 1
|
||||
|
||||
postingsBS := s.Postings[pid]
|
||||
|
||||
freqNorms := s.FreqNorms[pid]
|
||||
freqNormOffset := 0
|
||||
|
||||
locs := s.Locs[pid]
|
||||
locOffset := 0
|
||||
|
||||
postingsItr := postingsBS.Iterator()
|
||||
for postingsItr.HasNext() {
|
||||
docNum := uint64(postingsItr.Next())
|
||||
|
||||
freqNorm := freqNorms[freqNormOffset]
|
||||
|
||||
err = tfEncoder.Add(docNum,
|
||||
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
|
||||
uint64(math.Float32bits(freqNorm.norm)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if freqNorm.numLocs > 0 {
|
||||
numBytesLocs := 0
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
numBytesLocs += totalUvarintBytes(
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)), loc.arrayposs)
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, uint64(numBytesLocs))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||
err = locEncoder.Add(docNum,
|
||||
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||
uint64(len(loc.arrayposs)))
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
err = locEncoder.Add(docNum, loc.arrayposs...)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
locOffset += freqNorm.numLocs
|
||||
}
|
||||
|
||||
freqNormOffset++
|
||||
|
||||
docTermMap[docNum] = append(
|
||||
append(docTermMap[docNum], term...),
|
||||
termSeparator)
|
||||
}
|
||||
|
||||
tfEncoder.Close()
|
||||
locEncoder.Close()
|
||||
|
||||
postingsOffset, err :=
|
||||
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
if postingsOffset > uint64(0) {
|
||||
err = s.builder.Insert([]byte(term), postingsOffset)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
tfEncoder.Reset()
|
||||
locEncoder.Reset()
|
||||
}
|
||||
|
||||
err = s.builder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// record where this dictionary starts
|
||||
dictOffsets[fieldID] = uint64(s.w.Count())
|
||||
|
||||
vellumData := s.builderBuf.Bytes()
|
||||
|
||||
// write out the length of the vellum data
|
||||
n := binary.PutUvarint(buf, uint64(len(vellumData)))
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write this vellum to disk
|
||||
_, err = s.w.Write(vellumData)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// reset vellum for reuse
|
||||
s.builderBuf.Reset()
|
||||
|
||||
err = s.builder.Reset(&s.builderBuf)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
// write the field doc values
|
||||
if s.IncludeDocValues[fieldID] {
|
||||
for docNum, docTerms := range docTermMap {
|
||||
if len(docTerms) > 0 {
|
||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
err = fdvEncoder.Close()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
|
||||
|
||||
_, err = fdvEncoder.Write()
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
|
||||
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
|
||||
|
||||
fdvEncoder.Reset()
|
||||
} else {
|
||||
fdvOffsetsStart[fieldID] = fieldNotUninverted
|
||||
fdvOffsetsEnd[fieldID] = fieldNotUninverted
|
||||
}
|
||||
}
|
||||
|
||||
fdvIndexOffset = uint64(s.w.Count())
|
||||
|
||||
for i := 0; i < len(fdvOffsetsStart); i++ {
|
||||
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
|
||||
_, err := s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
|
||||
_, err = s.w.Write(buf[:n])
|
||||
if err != nil {
|
||||
return 0, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return fdvIndexOffset, dictOffsets, nil
|
||||
}
|
||||
|
||||
func encodeFieldType(f document.Field) byte {
|
||||
fieldType := byte('x')
|
||||
switch f.(type) {
|
||||
case *document.TextField:
|
||||
fieldType = 't'
|
||||
case *document.NumericField:
|
||||
fieldType = 'n'
|
||||
case *document.DateTimeField:
|
||||
fieldType = 'd'
|
||||
case *document.BooleanField:
|
||||
fieldType = 'b'
|
||||
case *document.GeoPointField:
|
||||
fieldType = 'g'
|
||||
case *document.CompositeField:
|
||||
fieldType = 'c'
|
||||
}
|
||||
return fieldType
|
||||
}
|
||||
|
||||
// returns the total # of bytes needed to encode the given uint64's
|
||||
// into binary.PutUVarint() encoding
|
||||
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
|
||||
n = numUvarintBytes(a)
|
||||
n += numUvarintBytes(b)
|
||||
n += numUvarintBytes(c)
|
||||
n += numUvarintBytes(d)
|
||||
n += numUvarintBytes(e)
|
||||
for _, v := range more {
|
||||
n += numUvarintBytes(v)
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// returns # of bytes needed to encode x in binary.PutUvarint() encoding
|
||||
func numUvarintBytes(x uint64) (n int) {
|
||||
for x >= 0x80 {
|
||||
x >>= 7
|
||||
n++
|
||||
}
|
||||
return n + 1
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -20,16 +20,24 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"reflect"
|
||||
"sync"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/Smerity/govarint"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/couchbase/vellum"
|
||||
mmap "github.com/edsrzf/mmap-go"
|
||||
"github.com/golang/snappy"
|
||||
)
|
||||
|
||||
var reflectStaticSizeSegmentBase int
|
||||
|
||||
func init() {
|
||||
var sb SegmentBase
|
||||
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size())
|
||||
}
|
||||
|
||||
// Open returns a zap impl of a segment
|
||||
func Open(path string) (segment.Segment, error) {
|
||||
f, err := os.Open(path)
|
||||
|
@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) {
|
|||
SegmentBase: SegmentBase{
|
||||
mem: mm[0 : len(mm)-FooterSize],
|
||||
fieldsMap: make(map[string]uint16),
|
||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
||||
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||
},
|
||||
f: f,
|
||||
mm: mm,
|
||||
path: path,
|
||||
refs: 1,
|
||||
}
|
||||
rv.SegmentBase.updateSize()
|
||||
|
||||
err = rv.loadConfig()
|
||||
if err != nil {
|
||||
|
@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
err = rv.loadDvIterators()
|
||||
err = rv.loadDvReaders()
|
||||
if err != nil {
|
||||
_ = rv.Close()
|
||||
return nil, err
|
||||
|
@ -89,7 +98,39 @@ type SegmentBase struct {
|
|||
fieldsIndexOffset uint64
|
||||
docValueOffset uint64
|
||||
dictLocs []uint64
|
||||
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field
|
||||
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
|
||||
fieldDvNames []string // field names cached in fieldDvReaders
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) Size() int {
|
||||
return int(sb.size)
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) updateSize() {
|
||||
sizeInBytes := reflectStaticSizeSegmentBase +
|
||||
cap(sb.mem)
|
||||
|
||||
// fieldsMap
|
||||
for k, _ := range sb.fieldsMap {
|
||||
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
|
||||
}
|
||||
|
||||
// fieldsInv, dictLocs
|
||||
for _, entry := range sb.fieldsInv {
|
||||
sizeInBytes += len(entry) + size.SizeOfString
|
||||
}
|
||||
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
|
||||
|
||||
// fieldDvReaders
|
||||
for _, v := range sb.fieldDvReaders {
|
||||
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
|
||||
if v != nil {
|
||||
sizeInBytes += v.size()
|
||||
}
|
||||
}
|
||||
|
||||
sb.size = uint64(sizeInBytes)
|
||||
}
|
||||
|
||||
func (sb *SegmentBase) AddRef() {}
|
||||
|
@ -111,56 +152,19 @@ type Segment struct {
|
|||
refs int64
|
||||
}
|
||||
|
||||
func (s *Segment) SizeInBytes() uint64 {
|
||||
func (s *Segment) Size() int {
|
||||
// 8 /* size of file pointer */
|
||||
// 4 /* size of version -> uint32 */
|
||||
// 4 /* size of crc -> uint32 */
|
||||
sizeOfUints := 16
|
||||
|
||||
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
|
||||
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
|
||||
|
||||
// mutex, refs -> int64
|
||||
sizeInBytes += 16
|
||||
|
||||
// do not include the mmap'ed part
|
||||
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem))
|
||||
}
|
||||
|
||||
func (s *SegmentBase) SizeInBytes() uint64 {
|
||||
// 4 /* size of memCRC -> uint32 */
|
||||
// 4 /* size of chunkFactor -> uint32 */
|
||||
// 8 /* size of numDocs -> uint64 */
|
||||
// 8 /* size of storedIndexOffset -> uint64 */
|
||||
// 8 /* size of fieldsIndexOffset -> uint64 */
|
||||
// 8 /* size of docValueOffset -> uint64 */
|
||||
sizeInBytes := 40
|
||||
|
||||
sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
|
||||
|
||||
// fieldsMap
|
||||
for k, _ := range s.fieldsMap {
|
||||
sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */
|
||||
}
|
||||
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
|
||||
|
||||
// fieldsInv, dictLocs
|
||||
for _, entry := range s.fieldsInv {
|
||||
sizeInBytes += (len(entry) + int(segment.SizeOfString))
|
||||
}
|
||||
sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
|
||||
sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
|
||||
|
||||
// fieldDvIterMap
|
||||
sizeInBytes += len(s.fieldDvIterMap) *
|
||||
int(segment.SizeOfPointer+2 /* size of uint16 */)
|
||||
for _, entry := range s.fieldDvIterMap {
|
||||
if entry != nil {
|
||||
sizeInBytes += int(entry.sizeInBytes())
|
||||
}
|
||||
}
|
||||
sizeInBytes += int(segment.SizeOfMap)
|
||||
|
||||
return uint64(sizeInBytes)
|
||||
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
|
||||
}
|
||||
|
||||
func (s *Segment) AddRef() {
|
||||
|
@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error {
|
|||
|
||||
verOffset := crcOffset - 4
|
||||
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
|
||||
if s.version != version {
|
||||
if s.version != Version {
|
||||
return fmt.Errorf("unsupported version %d", s.version)
|
||||
}
|
||||
|
||||
|
@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error {
|
|||
}
|
||||
|
||||
func (s *SegmentBase) loadFields() error {
|
||||
// NOTE for now we assume the fields index immediately preceeds
|
||||
// NOTE for now we assume the fields index immediately precedes
|
||||
// the footer, and if this changes, need to adjust accordingly (or
|
||||
// store explicit length), where s.mem was sliced from s.mm in Open().
|
||||
fieldsIndexEnd := uint64(len(s.mem))
|
||||
|
@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
|||
if err != nil {
|
||||
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
|
||||
}
|
||||
rv.fstReader, err = rv.fst.Reader()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
// visitDocumentCtx holds data structures that are reusable across
|
||||
// multiple VisitDocument() calls to avoid memory allocations
|
||||
type visitDocumentCtx struct {
|
||||
buf []byte
|
||||
reader bytes.Reader
|
||||
arrayPos []uint64
|
||||
}
|
||||
|
||||
var visitDocumentCtxPool = sync.Pool{
|
||||
New: func() interface{} {
|
||||
reuse := &visitDocumentCtx{}
|
||||
return reuse
|
||||
},
|
||||
}
|
||||
|
||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||
// for the specified doc number
|
||||
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
defer visitDocumentCtxPool.Put(vdc)
|
||||
return s.visitDocument(vdc, num, visitor)
|
||||
}
|
||||
|
||||
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64,
|
||||
visitor segment.DocumentFieldValueVisitor) error {
|
||||
// first make sure this is a valid number in this segment
|
||||
if num < s.numDocs {
|
||||
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||
uncompressed, err := snappy.Decode(nil, compressed)
|
||||
|
||||
vdc.reader.Reset(meta)
|
||||
|
||||
// handle _id field special case
|
||||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
idFieldVal := compressed[:idFieldValLen]
|
||||
|
||||
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
|
||||
if !keepGoing {
|
||||
visitDocumentCtxPool.Put(vdc)
|
||||
return nil
|
||||
}
|
||||
|
||||
// handle non-"_id" fields
|
||||
compressed = compressed[idFieldValLen:]
|
||||
|
||||
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// now decode meta and process
|
||||
reader := bytes.NewReader(meta)
|
||||
decoder := govarint.NewU64Base128Decoder(reader)
|
||||
|
||||
keepGoing := true
|
||||
for keepGoing {
|
||||
field, err := decoder.GetU64()
|
||||
field, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
typ, err := decoder.GetU64()
|
||||
typ, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
offset, err := decoder.GetU64()
|
||||
offset, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
l, err := decoder.GetU64()
|
||||
l, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
numap, err := decoder.GetU64()
|
||||
numap, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var arrayPos []uint64
|
||||
if numap > 0 {
|
||||
arrayPos = make([]uint64, numap)
|
||||
if cap(vdc.arrayPos) < int(numap) {
|
||||
vdc.arrayPos = make([]uint64, numap)
|
||||
}
|
||||
arrayPos = vdc.arrayPos[:numap]
|
||||
for i := 0; i < int(numap); i++ {
|
||||
ap, err := decoder.GetU64()
|
||||
ap, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal
|
|||
value := uncompressed[offset : offset+l]
|
||||
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
|
||||
}
|
||||
|
||||
vdc.buf = uncompressed
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// DocID returns the value of the _id field for the given docNum
|
||||
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
|
||||
if num >= s.numDocs {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||
|
||||
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||
|
||||
vdc.reader.Reset(meta)
|
||||
|
||||
// handle _id field special case
|
||||
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
idFieldVal := compressed[:idFieldValLen]
|
||||
|
||||
visitDocumentCtxPool.Put(vdc)
|
||||
|
||||
return idFieldVal, nil
|
||||
}
|
||||
|
||||
// Count returns the number of documents in this segment.
|
||||
func (s *SegmentBase) Count() uint64 {
|
||||
return s.numDocs
|
||||
|
@ -343,16 +417,27 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
|||
return nil, err
|
||||
}
|
||||
|
||||
var postings *PostingsList
|
||||
for _, id := range ids {
|
||||
postings, err = idDict.postingsList([]byte(id), nil, postings)
|
||||
postingsList := emptyPostingsList
|
||||
|
||||
sMax, err := idDict.fst.GetMaxKey()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if postings.postings != nil {
|
||||
rv.Or(postings.postings)
|
||||
sMaxStr := string(sMax)
|
||||
filteredIds := make([]string, 0, len(ids))
|
||||
for _, id := range ids {
|
||||
if id <= sMaxStr {
|
||||
filteredIds = append(filteredIds, id)
|
||||
}
|
||||
}
|
||||
|
||||
for _, id := range filteredIds {
|
||||
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
postingsList.OrInto(rv)
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
|
@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) {
|
|||
return s.dictLocs[fieldIDPlus1-1], nil
|
||||
}
|
||||
|
||||
func (s *SegmentBase) loadDvIterators() error {
|
||||
func (s *SegmentBase) loadDvReaders() error {
|
||||
if s.docValueOffset == fieldNotUninverted {
|
||||
return nil
|
||||
}
|
||||
|
||||
var read uint64
|
||||
for fieldID, field := range s.fieldsInv {
|
||||
fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
var fieldLocStart, fieldLocEnd uint64
|
||||
var n int
|
||||
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
if n <= 0 {
|
||||
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
|
||||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
|
||||
}
|
||||
s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
|
||||
read += uint64(n)
|
||||
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||
if n <= 0 {
|
||||
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
|
||||
}
|
||||
read += uint64(n)
|
||||
|
||||
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
|
||||
if fieldDvReader != nil {
|
||||
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
|
||||
s.fieldDvNames = append(s.fieldDvNames, field)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
package zap
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"io"
|
||||
|
||||
|
@ -25,28 +24,29 @@ import (
|
|||
// writes out the length of the roaring bitmap in bytes as varint
|
||||
// then writes out the roaring bitmap itself
|
||||
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
|
||||
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) {
|
||||
reuseBuf.Reset()
|
||||
|
||||
// write out postings list to memory so we know the len
|
||||
postingsListLen, err := r.WriteTo(reuseBuf)
|
||||
reuseBufVarint []byte) (int, error) {
|
||||
buf, err := r.ToBytes()
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
var tw int
|
||||
// write out the length of this postings list
|
||||
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen))
|
||||
|
||||
// write out the length
|
||||
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
|
||||
nw, err := w.Write(reuseBufVarint[:n])
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
// write out the postings list itself
|
||||
nw, err = w.Write(reuseBuf.Bytes())
|
||||
|
||||
// write out the roaring bytes
|
||||
nw, err = w.Write(buf)
|
||||
tw += nw
|
||||
if err != nil {
|
||||
return tw, err
|
||||
}
|
||||
|
||||
return tw, nil
|
||||
}
|
||||
|
||||
|
@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset
|
|||
return err
|
||||
}
|
||||
// write out 32-bit version
|
||||
err = binary.Write(w, binary.BigEndian, version)
|
||||
err = binary.Write(w, binary.BigEndian, Version)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -15,10 +15,10 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/heap"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
@ -27,8 +27,13 @@ import (
|
|||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/couchbase/vellum"
|
||||
lev2 "github.com/couchbase/vellum/levenshtein2"
|
||||
)
|
||||
|
||||
// re usable, threadsafe levenshtein builders
|
||||
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder
|
||||
|
||||
type asynchSegmentResult struct {
|
||||
dictItr segment.DictionaryIterator
|
||||
|
||||
|
@ -40,15 +45,36 @@ type asynchSegmentResult struct {
|
|||
err error
|
||||
}
|
||||
|
||||
var reflectStaticSizeIndexSnapshot int
|
||||
|
||||
func init() {
|
||||
var is interface{} = IndexSnapshot{}
|
||||
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
|
||||
var err error
|
||||
lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
|
||||
}
|
||||
lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true)
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
|
||||
}
|
||||
}
|
||||
|
||||
type IndexSnapshot struct {
|
||||
parent *Scorch
|
||||
segment []*SegmentSnapshot
|
||||
offsets []uint64
|
||||
internal map[string][]byte
|
||||
epoch uint64
|
||||
size uint64
|
||||
creator string
|
||||
|
||||
m sync.Mutex // Protects the fields that follow.
|
||||
refs int64
|
||||
|
||||
m2 sync.Mutex // Protects the fields that follow.
|
||||
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
|
||||
|
@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) {
|
|||
return err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Close() error {
|
||||
return i.DecRef()
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) Size() int {
|
||||
return int(i.size)
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) updateSize() {
|
||||
i.size += uint64(reflectStaticSizeIndexSnapshot)
|
||||
for _, s := range i.segment {
|
||||
i.size += uint64(s.Size())
|
||||
}
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
|
||||
|
||||
results := make(chan *asynchSegmentResult)
|
||||
for index, segment := range i.segment {
|
||||
go func(index int, segment *SegmentSnapshot) {
|
||||
dict, err := segment.Dictionary(field)
|
||||
dict, err := segment.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
results <- &asynchSegmentResult{err: err}
|
||||
} else {
|
||||
|
@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
|
|||
if next != nil {
|
||||
rv.cursors = append(rv.cursors, &segmentDictCursor{
|
||||
itr: asr.dictItr,
|
||||
curr: next,
|
||||
curr: *next,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
|
|||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictRegexp(field string,
|
||||
termRegex string) (index.FieldDict, error) {
|
||||
// TODO: potential optimization where the literal prefix represents the,
|
||||
// entire regexp, allowing us to use PrefixIterator(prefixTerm)?
|
||||
|
||||
a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) getLevAutomaton(term string,
|
||||
fuzziness uint8) (vellum.Automaton, error) {
|
||||
if fuzziness == 1 {
|
||||
return lb1.BuildDfa(term, fuzziness)
|
||||
} else if fuzziness == 2 {
|
||||
return lb2.BuildDfa(term, fuzziness)
|
||||
}
|
||||
return nil, fmt.Errorf("fuzziness exceeds the max limit")
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictFuzzy(field string,
|
||||
term string, fuzziness int, prefix string) (index.FieldDict, error) {
|
||||
a, err := i.getLevAutomaton(term, uint8(fuzziness))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var prefixBeg, prefixEnd []byte
|
||||
if prefix != "" {
|
||||
prefixBeg = []byte(prefix)
|
||||
prefixEnd = segment.IncrementBytes(prefixBeg)
|
||||
}
|
||||
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) FieldDictOnly(field string,
|
||||
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
|
||||
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||
return i.OnlyIterator(onlyTerms, includeCount)
|
||||
})
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
||||
results := make(chan *asynchSegmentResult)
|
||||
for index, segment := range i.segment {
|
||||
|
@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
|
|||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
|
||||
rv = document.NewDocument(id)
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool {
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool {
|
||||
if name == "_id" {
|
||||
return true
|
||||
}
|
||||
|
||||
// copy value, array positions to preserve them beyond the scope of this callback
|
||||
value := append([]byte(nil), val...)
|
||||
arrayPos := append([]uint64(nil), pos...)
|
||||
|
||||
switch typ {
|
||||
case 't':
|
||||
rv.AddField(document.NewTextField(name, pos, value))
|
||||
rv.AddField(document.NewTextField(name, arrayPos, value))
|
||||
case 'n':
|
||||
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value))
|
||||
case 'd':
|
||||
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value))
|
||||
case 'b':
|
||||
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value))
|
||||
case 'g':
|
||||
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value))
|
||||
rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value))
|
||||
}
|
||||
|
||||
return true
|
||||
|
@ -307,26 +403,17 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
|
|||
}
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
|
||||
var found bool
|
||||
var rv string
|
||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||
if field == "_id" {
|
||||
found = true
|
||||
rv = string(value)
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
v, err := i.segment[segmentIndex].DocID(localDocNum)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if found {
|
||||
return rv, nil
|
||||
}
|
||||
if v == nil {
|
||||
return "", fmt.Errorf("document number %d not found", docNum)
|
||||
}
|
||||
|
||||
return string(v), nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
|
||||
// FIXME could be done more efficiently directly, but reusing for simplicity
|
||||
tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false)
|
||||
|
@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
|
|||
|
||||
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
|
||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||
rv := i.allocTermFieldReaderDicts(field)
|
||||
|
||||
rv := &IndexSnapshotTermFieldReader{
|
||||
term: term,
|
||||
field: field,
|
||||
snapshot: i,
|
||||
postings: make([]segment.PostingsList, len(i.segment)),
|
||||
iterators: make([]segment.PostingsIterator, len(i.segment)),
|
||||
includeFreq: includeFreq,
|
||||
includeNorm: includeNorm,
|
||||
includeTermVectors: includeTermVectors,
|
||||
rv.term = term
|
||||
rv.field = field
|
||||
rv.snapshot = i
|
||||
if rv.postings == nil {
|
||||
rv.postings = make([]segment.PostingsList, len(i.segment))
|
||||
}
|
||||
if rv.iterators == nil {
|
||||
rv.iterators = make([]segment.PostingsIterator, len(i.segment))
|
||||
}
|
||||
rv.segmentOffset = 0
|
||||
rv.includeFreq = includeFreq
|
||||
rv.includeNorm = includeNorm
|
||||
rv.includeTermVectors = includeTermVectors
|
||||
rv.currPosting = nil
|
||||
rv.currID = rv.currID[:0]
|
||||
|
||||
if rv.dicts == nil {
|
||||
rv.dicts = make([]segment.TermDictionary, len(i.segment))
|
||||
for i, segment := range i.segment {
|
||||
dict, err := segment.Dictionary(field)
|
||||
dict, err := segment.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
pl, err := dict.PostingsList(string(term), nil)
|
||||
rv.dicts[i] = dict
|
||||
}
|
||||
}
|
||||
|
||||
for i, segment := range i.segment {
|
||||
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv.postings[i] = pl
|
||||
rv.iterators[i] = pl.Iterator()
|
||||
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])
|
||||
}
|
||||
atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1))
|
||||
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) {
|
||||
i.m2.Lock()
|
||||
if i.fieldTFRs != nil {
|
||||
tfrs := i.fieldTFRs[field]
|
||||
last := len(tfrs) - 1
|
||||
if last >= 0 {
|
||||
tfr = tfrs[last]
|
||||
tfrs[last] = nil
|
||||
i.fieldTFRs[field] = tfrs[:last]
|
||||
i.m2.Unlock()
|
||||
return
|
||||
}
|
||||
}
|
||||
i.m2.Unlock()
|
||||
return &IndexSnapshotTermFieldReader{}
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
|
||||
i.parent.rootLock.RLock()
|
||||
obsolete := i.parent.root != i
|
||||
i.parent.rootLock.RUnlock()
|
||||
if obsolete {
|
||||
// if we're not the current root (mutations happened), don't bother recycling
|
||||
return
|
||||
}
|
||||
|
||||
i.m2.Lock()
|
||||
if i.fieldTFRs == nil {
|
||||
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{}
|
||||
}
|
||||
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr)
|
||||
i.m2.Unlock()
|
||||
}
|
||||
|
||||
func docNumberToBytes(buf []byte, in uint64) []byte {
|
||||
if len(buf) != 8 {
|
||||
if cap(buf) >= 8 {
|
||||
|
@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte {
|
|||
}
|
||||
|
||||
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
|
||||
var res uint64
|
||||
err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
if len(in) != 8 {
|
||||
return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
|
||||
}
|
||||
return res, nil
|
||||
return binary.BigEndian.Uint64(in), nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor) error {
|
||||
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil)
|
||||
return err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor,
|
||||
dvs segment.DocVisitState) (segment.DocVisitState, error) {
|
||||
docNum, err := docInternalToNumber(id)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(i.segment) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
_, dvs, err = i.documentVisitFieldTermsOnSegment(
|
||||
segmentIndex, localDocNum, fields, nil, visitor, dvs)
|
||||
|
||||
return dvs, err
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) documentVisitFieldTermsOnSegment(
|
||||
segmentIndex int, localDocNum uint64, fields []string, cFields []string,
|
||||
visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) (
|
||||
cFieldsOut []string, dvsOut segment.DocVisitState, err error) {
|
||||
ss := i.segment[segmentIndex]
|
||||
|
||||
var vFields []string // fields that are visitable via the segment
|
||||
|
||||
ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable)
|
||||
if ssvOk && ssv != nil {
|
||||
vFields, err = ssv.VisitableDocValueFields()
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var errCh chan error
|
||||
|
||||
// cFields represents the fields that we'll need from the
|
||||
// cachedDocs, and might be optionally be provided by the caller,
|
||||
// if the caller happens to know we're on the same segmentIndex
|
||||
// from a previous invocation
|
||||
if cFields == nil {
|
||||
cFields = subtractStrings(fields, vFields)
|
||||
|
||||
if !ss.cachedDocs.hasFields(cFields) {
|
||||
errCh = make(chan error, 1)
|
||||
|
||||
go func() {
|
||||
err := ss.cachedDocs.prepareFields(cFields, ss)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
close(errCh)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
if ssvOk && ssv != nil && len(vFields) > 0 {
|
||||
dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if errCh != nil {
|
||||
err = <-errCh
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if len(cFields) > 0 {
|
||||
ss.cachedDocs.visitDoc(localDocNum, cFields, visitor)
|
||||
}
|
||||
|
||||
return cFields, dvs, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshot) DocValueReader(fields []string) (
|
||||
index.DocValueReader, error) {
|
||||
return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil
|
||||
}
|
||||
|
||||
type DocValueReader struct {
|
||||
i *IndexSnapshot
|
||||
fields []string
|
||||
dvs segment.DocVisitState
|
||||
|
||||
currSegmentIndex int
|
||||
currCachedFields []string
|
||||
}
|
||||
|
||||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||
visitor index.DocumentFieldTermVisitor) (err error) {
|
||||
docNum, err := docInternalToNumber(id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(i.segment) {
|
||||
|
||||
segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||
if segmentIndex >= len(dvr.i.segment) {
|
||||
return nil
|
||||
}
|
||||
|
||||
ss := i.segment[segmentIndex]
|
||||
if dvr.currSegmentIndex != segmentIndex {
|
||||
dvr.currSegmentIndex = segmentIndex
|
||||
dvr.currCachedFields = nil
|
||||
}
|
||||
|
||||
dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment(
|
||||
dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs)
|
||||
|
||||
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok {
|
||||
// get the list of doc value persisted fields
|
||||
pFields, err := zaps.VisitableDocValueFields()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// assort the fields for which terms look up have to
|
||||
// be performed runtime
|
||||
dvPendingFields := extractDvPendingFields(fields, pFields)
|
||||
if len(dvPendingFields) == 0 {
|
||||
// all fields are doc value persisted
|
||||
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
||||
}
|
||||
|
||||
// concurrently trigger the runtime doc value preparations for
|
||||
// pending fields as well as the visit of the persisted doc values
|
||||
errCh := make(chan error, 1)
|
||||
|
||||
func (i *IndexSnapshot) DumpAll() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
||||
if err != nil {
|
||||
errCh <- err
|
||||
}
|
||||
close(rv)
|
||||
}()
|
||||
|
||||
// visit the persisted dv while the cache preparation is in progress
|
||||
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
||||
if err != nil {
|
||||
return err
|
||||
return rv
|
||||
}
|
||||
|
||||
// err out if fieldCache preparation failed
|
||||
err = <-errCh
|
||||
if err != nil {
|
||||
return err
|
||||
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor)
|
||||
return nil
|
||||
func (i *IndexSnapshot) DumpFields() chan interface{} {
|
||||
rv := make(chan interface{})
|
||||
go func() {
|
||||
close(rv)
|
||||
}()
|
||||
return rv
|
||||
}
|
||||
|
||||
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor)
|
||||
// subtractStrings returns set a minus elements of set b.
|
||||
func subtractStrings(a, b []string) []string {
|
||||
if len(b) == 0 {
|
||||
return a
|
||||
}
|
||||
|
||||
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error {
|
||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor)
|
||||
return nil
|
||||
}
|
||||
|
||||
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string,
|
||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) {
|
||||
|
||||
for _, field := range fields {
|
||||
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
|
||||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
||||
for {
|
||||
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
break
|
||||
}
|
||||
visitor(field, tlist[0:i])
|
||||
tlist = tlist[i+1:]
|
||||
rv := make([]string, 0, len(a))
|
||||
OUTER:
|
||||
for _, as := range a {
|
||||
for _, bs := range b {
|
||||
if as == bs {
|
||||
continue OUTER
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func extractDvPendingFields(requestedFields, persistedFields []string) []string {
|
||||
removeMap := map[string]struct{}{}
|
||||
for _, str := range persistedFields {
|
||||
removeMap[str] = struct{}{}
|
||||
}
|
||||
|
||||
rv := make([]string, 0, len(requestedFields))
|
||||
for _, s := range requestedFields {
|
||||
if _, ok := removeMap[s]; !ok {
|
||||
rv = append(rv, s)
|
||||
}
|
||||
rv = append(rv, as)
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
|
|
@ -23,12 +23,13 @@ import (
|
|||
|
||||
type segmentDictCursor struct {
|
||||
itr segment.DictionaryIterator
|
||||
curr *index.DictEntry
|
||||
curr index.DictEntry
|
||||
}
|
||||
|
||||
type IndexSnapshotFieldDict struct {
|
||||
snapshot *IndexSnapshot
|
||||
cursors []*segmentDictCursor
|
||||
entry index.DictEntry
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
|
||||
|
@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} {
|
|||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
||||
if len(i.cursors) <= 0 {
|
||||
if len(i.cursors) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
rv := i.cursors[0].curr
|
||||
i.entry = i.cursors[0].curr
|
||||
next, err := i.cursors[0].itr.Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
|||
heap.Pop(i)
|
||||
} else {
|
||||
// modified heap, fix it
|
||||
i.cursors[0].curr = next
|
||||
i.cursors[0].curr = *next
|
||||
heap.Fix(i, 0)
|
||||
}
|
||||
// look for any other entries with the exact same term
|
||||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term {
|
||||
rv.Count += i.cursors[0].curr.Count
|
||||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
|
||||
i.entry.Count += i.cursors[0].curr.Count
|
||||
next, err := i.cursors[0].itr.Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
|||
heap.Pop(i)
|
||||
} else {
|
||||
// modified heap, fix it
|
||||
i.cursors[0].curr = next
|
||||
i.cursors[0].curr = *next
|
||||
heap.Fix(i, 0)
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
return &i.entry, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotFieldDict) Close() error {
|
||||
|
|
|
@ -16,17 +16,30 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"reflect"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexSnapshotDocIDReader int
|
||||
|
||||
func init() {
|
||||
var isdr IndexSnapshotDocIDReader
|
||||
reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size())
|
||||
}
|
||||
|
||||
type IndexSnapshotDocIDReader struct {
|
||||
snapshot *IndexSnapshot
|
||||
iterators []roaring.IntIterable
|
||||
segmentOffset int
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotDocIDReader) Size() int {
|
||||
return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
|
||||
for i.segmentOffset < len(i.iterators) {
|
||||
if !i.iterators[i.segmentOffset].HasNext() {
|
||||
|
|
|
@ -16,16 +16,27 @@ package scorch
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexSnapshotTermFieldReader int
|
||||
|
||||
func init() {
|
||||
var istfr IndexSnapshotTermFieldReader
|
||||
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size())
|
||||
}
|
||||
|
||||
type IndexSnapshotTermFieldReader struct {
|
||||
term []byte
|
||||
field string
|
||||
snapshot *IndexSnapshot
|
||||
dicts []segment.TermDictionary
|
||||
postings []segment.PostingsList
|
||||
iterators []segment.PostingsIterator
|
||||
segmentOffset int
|
||||
|
@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct {
|
|||
currID index.IndexInternalID
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr +
|
||||
len(i.term) +
|
||||
len(i.field) +
|
||||
len(i.currID)
|
||||
|
||||
for _, entry := range i.postings {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range i.iterators {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
if i.currPosting != nil {
|
||||
sizeInBytes += i.currPosting.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
|
||||
rv := preAlloced
|
||||
if rv == nil {
|
||||
rv = &index.TermFieldDoc{}
|
||||
}
|
||||
// find the next hit
|
||||
for i.segmentOffset < len(i.postings) {
|
||||
for i.segmentOffset < len(i.iterators) {
|
||||
next, err := i.iterators[i.segmentOffset].Next()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin
|
|||
}
|
||||
if i.includeTermVectors {
|
||||
locs := next.Locations()
|
||||
if cap(rv.Vectors) < len(locs) {
|
||||
rv.Vectors = make([]*index.TermFieldVector, len(locs))
|
||||
backing := make([]index.TermFieldVector, len(locs))
|
||||
for i := range backing {
|
||||
rv.Vectors[i] = &backing[i]
|
||||
}
|
||||
}
|
||||
rv.Vectors = rv.Vectors[:len(locs)]
|
||||
for i, loc := range locs {
|
||||
rv.Vectors[i] = &index.TermFieldVector{
|
||||
*rv.Vectors[i] = index.TermFieldVector{
|
||||
Start: loc.Start(),
|
||||
End: loc.End(),
|
||||
Pos: loc.Pos(),
|
||||
|
@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
|
|||
}
|
||||
*i = *(i2.(*IndexSnapshotTermFieldReader))
|
||||
}
|
||||
// FIXME do something better
|
||||
next, err := i.Next(preAlloced)
|
||||
num, err := docInternalToNumber(ID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
|
||||
}
|
||||
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
|
||||
if segIndex >= len(i.snapshot.segment) {
|
||||
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
|
||||
segIndex, len(i.snapshot.segment))
|
||||
}
|
||||
// skip directly to the target segment
|
||||
i.segmentOffset = segIndex
|
||||
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if next == nil {
|
||||
return nil, nil
|
||||
// we jumped directly to the segment that should have contained it
|
||||
// but it wasn't there, so reuse Next() which should correctly
|
||||
// get the next hit after it (we moved i.segmentOffset)
|
||||
return i.Next(preAlloced)
|
||||
}
|
||||
for bytes.Compare(next.ID, ID) < 0 {
|
||||
next, err = i.Next(preAlloced)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
||||
if preAlloced == nil {
|
||||
preAlloced = &index.TermFieldDoc{}
|
||||
}
|
||||
if next == nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
return next, nil
|
||||
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
|
||||
i.snapshot.offsets[segIndex])
|
||||
i.postingToTermFieldDoc(next, preAlloced)
|
||||
i.currID = preAlloced.ID
|
||||
i.currPosting = next
|
||||
return preAlloced, nil
|
||||
}
|
||||
|
||||
func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
||||
|
@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
|||
|
||||
func (i *IndexSnapshotTermFieldReader) Close() error {
|
||||
if i.snapshot != nil {
|
||||
atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1))
|
||||
atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1))
|
||||
i.snapshot.recycleTermFieldReader(i)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ import (
|
|||
"log"
|
||||
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type RollbackPoint struct {
|
||||
|
|
|
@ -15,42 +15,25 @@
|
|||
package scorch
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/RoaringBitmap/roaring"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var TermSeparator byte = 0xff
|
||||
|
||||
var TermSeparatorSplitSlice = []byte{TermSeparator}
|
||||
|
||||
type SegmentDictionarySnapshot struct {
|
||||
s *SegmentSnapshot
|
||||
d segment.TermDictionary
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
||||
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
|
||||
return s.d.PostingsList(term, s.s.deleted)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
|
||||
return s.d.Iterator()
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||
return s.d.PrefixIterator(prefix)
|
||||
}
|
||||
|
||||
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
|
||||
return s.d.RangeIterator(start, end)
|
||||
}
|
||||
|
||||
type SegmentSnapshot struct {
|
||||
id uint64
|
||||
segment segment.Segment
|
||||
deleted *roaring.Bitmap
|
||||
creator string
|
||||
|
||||
cachedDocs *cachedDocs
|
||||
}
|
||||
|
@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel
|
|||
return s.segment.VisitDocument(num, visitor)
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Count() uint64 {
|
||||
func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
|
||||
return s.segment.DocID(num)
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Count() uint64 {
|
||||
rv := s.segment.Count()
|
||||
if s.deleted != nil {
|
||||
rv -= s.deleted.GetCardinality()
|
||||
|
@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 {
|
|||
return rv
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
|
||||
d, err := s.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &SegmentDictionarySnapshot{
|
||||
s: s,
|
||||
d: d,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
||||
rv, err := s.segment.DocNumbers(docIDs)
|
||||
if err != nil {
|
||||
|
@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
|||
return rv, nil
|
||||
}
|
||||
|
||||
// DocNumbersLive returns bitsit containing doc numbers for all live docs
|
||||
// DocNumbersLive returns a bitmap containing doc numbers for all live docs
|
||||
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
|
||||
rv := roaring.NewBitmap()
|
||||
rv.AddRange(0, s.segment.Count())
|
||||
|
@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string {
|
|||
return s.segment.Fields()
|
||||
}
|
||||
|
||||
func (s *SegmentSnapshot) Size() (rv int) {
|
||||
rv = s.segment.Size()
|
||||
if s.deleted != nil {
|
||||
rv += int(s.deleted.GetSizeInBytes())
|
||||
}
|
||||
rv += s.cachedDocs.Size()
|
||||
return
|
||||
}
|
||||
|
||||
type cachedFieldDocs struct {
|
||||
m sync.Mutex
|
||||
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
|
||||
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
|
||||
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
||||
defer close(cfd.readyCh)
|
||||
func (cfd *cachedFieldDocs) Size() int {
|
||||
var rv int
|
||||
cfd.m.Lock()
|
||||
for _, entry := range cfd.docs {
|
||||
rv += 8 /* size of uint64 */ + len(entry)
|
||||
}
|
||||
cfd.m.Unlock()
|
||||
return rv
|
||||
}
|
||||
|
||||
func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
|
||||
cfd.m.Lock()
|
||||
defer func() {
|
||||
close(cfd.readyCh)
|
||||
cfd.m.Unlock()
|
||||
}()
|
||||
|
||||
cfd.size += uint64(size.SizeOfUint64) /* size field */
|
||||
dict, err := ss.segment.Dictionary(field)
|
||||
if err != nil {
|
||||
cfd.err = err
|
||||
return
|
||||
}
|
||||
|
||||
var postings segment.PostingsList
|
||||
var postingsItr segment.PostingsIterator
|
||||
|
||||
dictItr := dict.Iterator()
|
||||
next, err := dictItr.Next()
|
||||
for err == nil && next != nil {
|
||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
||||
var err1 error
|
||||
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
|
||||
if err1 != nil {
|
||||
cfd.err = err1
|
||||
return
|
||||
}
|
||||
|
||||
postingsItr := postings.Iterator()
|
||||
cfd.size += uint64(size.SizeOfUint64) /* map key */
|
||||
postingsItr = postings.Iterator(false, false, false, postingsItr)
|
||||
nextPosting, err2 := postingsItr.Next()
|
||||
for err2 == nil && nextPosting != nil {
|
||||
docNum := nextPosting.Number()
|
||||
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
|
||||
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
|
||||
cfd.size += uint64(len(next.Term) + 1) // map value
|
||||
nextPosting, err2 = postingsItr.Next()
|
||||
}
|
||||
|
||||
|
@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
|||
type cachedDocs struct {
|
||||
m sync.Mutex // As the cache is asynchronously prepared, need a lock
|
||||
cache map[string]*cachedFieldDocs // Keyed by field
|
||||
size uint64
|
||||
}
|
||||
|
||||
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
|
||||
c.m.Lock()
|
||||
|
||||
if c.cache == nil {
|
||||
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
|
||||
}
|
||||
|
@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
|||
docs: make(map[uint64][]byte),
|
||||
}
|
||||
|
||||
go c.cache[field].prepareFields(field, ss)
|
||||
go c.cache[field].prepareField(field, ss)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
|||
c.m.Lock()
|
||||
}
|
||||
|
||||
c.updateSizeLOCKED()
|
||||
|
||||
c.m.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *cachedDocs) sizeInBytes() uint64 {
|
||||
sizeInBytes := 0
|
||||
// hasFields returns true if the cache has all the given fields
|
||||
func (c *cachedDocs) hasFields(fields []string) bool {
|
||||
c.m.Lock()
|
||||
for k, v := range c.cache { // cachedFieldDocs
|
||||
sizeInBytes += len(k)
|
||||
if v != nil {
|
||||
for _, entry := range v.docs { // docs
|
||||
sizeInBytes += 8 /* size of uint64 */ + len(entry)
|
||||
}
|
||||
for _, field := range fields {
|
||||
if _, exists := c.cache[field]; !exists {
|
||||
c.m.Unlock()
|
||||
return false // found a field not in cache
|
||||
}
|
||||
}
|
||||
c.m.Unlock()
|
||||
return uint64(sizeInBytes)
|
||||
return true
|
||||
}
|
||||
|
||||
func (c *cachedDocs) Size() int {
|
||||
return int(atomic.LoadUint64(&c.size))
|
||||
}
|
||||
|
||||
func (c *cachedDocs) updateSizeLOCKED() {
|
||||
sizeInBytes := 0
|
||||
for k, v := range c.cache { // cachedFieldDocs
|
||||
sizeInBytes += len(k)
|
||||
if v != nil {
|
||||
sizeInBytes += v.Size()
|
||||
}
|
||||
}
|
||||
atomic.StoreUint64(&c.size, uint64(sizeInBytes))
|
||||
}
|
||||
|
||||
func (c *cachedDocs) visitDoc(localDocNum uint64,
|
||||
fields []string, visitor index.DocumentFieldTermVisitor) {
|
||||
c.m.Lock()
|
||||
|
||||
for _, field := range fields {
|
||||
if cachedFieldDocs, exists := c.cache[field]; exists {
|
||||
c.m.Unlock()
|
||||
<-cachedFieldDocs.readyCh
|
||||
c.m.Lock()
|
||||
|
||||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
||||
for {
|
||||
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
||||
if i < 0 {
|
||||
break
|
||||
}
|
||||
visitor(field, tlist[0:i])
|
||||
tlist = tlist[i+1:]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
c.m.Unlock()
|
||||
}
|
||||
|
|
|
@ -16,63 +16,125 @@ package scorch
|
|||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"reflect"
|
||||
"sync/atomic"
|
||||
)
|
||||
|
||||
// Stats tracks statistics about the index
|
||||
// Stats tracks statistics about the index, fields that are
|
||||
// prefixed like CurXxxx are gauges (can go up and down),
|
||||
// and fields that are prefixed like TotXxxx are monotonically
|
||||
// increasing counters.
|
||||
type Stats struct {
|
||||
updates, deletes, batches, errors uint64
|
||||
analysisTime, indexTime uint64
|
||||
termSearchersStarted uint64
|
||||
termSearchersFinished uint64
|
||||
numPlainTextBytesIndexed uint64
|
||||
numItemsIntroduced uint64
|
||||
numItemsPersisted uint64
|
||||
i *Scorch
|
||||
TotUpdates uint64
|
||||
TotDeletes uint64
|
||||
|
||||
TotBatches uint64
|
||||
TotBatchesEmpty uint64
|
||||
TotBatchIntroTime uint64
|
||||
MaxBatchIntroTime uint64
|
||||
|
||||
CurRootEpoch uint64
|
||||
LastPersistedEpoch uint64
|
||||
LastMergedEpoch uint64
|
||||
|
||||
TotOnErrors uint64
|
||||
|
||||
TotAnalysisTime uint64
|
||||
TotIndexTime uint64
|
||||
|
||||
TotIndexedPlainTextBytes uint64
|
||||
|
||||
TotTermSearchersStarted uint64
|
||||
TotTermSearchersFinished uint64
|
||||
|
||||
TotIntroduceLoop uint64
|
||||
TotIntroduceSegmentBeg uint64
|
||||
TotIntroduceSegmentEnd uint64
|
||||
TotIntroducePersistBeg uint64
|
||||
TotIntroducePersistEnd uint64
|
||||
TotIntroduceMergeBeg uint64
|
||||
TotIntroduceMergeEnd uint64
|
||||
TotIntroduceRevertBeg uint64
|
||||
TotIntroduceRevertEnd uint64
|
||||
|
||||
TotIntroducedItems uint64
|
||||
TotIntroducedSegmentsBatch uint64
|
||||
TotIntroducedSegmentsMerge uint64
|
||||
|
||||
TotPersistLoopBeg uint64
|
||||
TotPersistLoopErr uint64
|
||||
TotPersistLoopProgress uint64
|
||||
TotPersistLoopWait uint64
|
||||
TotPersistLoopWaitNotified uint64
|
||||
TotPersistLoopEnd uint64
|
||||
|
||||
TotPersistedItems uint64
|
||||
TotItemsToPersist uint64
|
||||
TotPersistedSegments uint64
|
||||
|
||||
TotPersisterSlowMergerPause uint64
|
||||
TotPersisterSlowMergerResume uint64
|
||||
|
||||
TotPersisterNapPauseCompleted uint64
|
||||
TotPersisterMergerNapBreak uint64
|
||||
|
||||
TotFileMergeLoopBeg uint64
|
||||
TotFileMergeLoopErr uint64
|
||||
TotFileMergeLoopEnd uint64
|
||||
|
||||
TotFileMergePlan uint64
|
||||
TotFileMergePlanErr uint64
|
||||
TotFileMergePlanNone uint64
|
||||
TotFileMergePlanOk uint64
|
||||
|
||||
TotFileMergePlanTasks uint64
|
||||
TotFileMergePlanTasksDone uint64
|
||||
TotFileMergePlanTasksErr uint64
|
||||
TotFileMergePlanTasksSegments uint64
|
||||
TotFileMergePlanTasksSegmentsEmpty uint64
|
||||
|
||||
TotFileMergeSegmentsEmpty uint64
|
||||
TotFileMergeSegments uint64
|
||||
TotFileSegmentsAtRoot uint64
|
||||
TotFileMergeWrittenBytes uint64
|
||||
|
||||
TotFileMergeZapBeg uint64
|
||||
TotFileMergeZapEnd uint64
|
||||
TotFileMergeZapTime uint64
|
||||
MaxFileMergeZapTime uint64
|
||||
|
||||
TotFileMergeIntroductions uint64
|
||||
TotFileMergeIntroductionsDone uint64
|
||||
TotFileMergeIntroductionsSkipped uint64
|
||||
|
||||
TotMemMergeBeg uint64
|
||||
TotMemMergeErr uint64
|
||||
TotMemMergeDone uint64
|
||||
TotMemMergeZapBeg uint64
|
||||
TotMemMergeZapEnd uint64
|
||||
TotMemMergeZapTime uint64
|
||||
MaxMemMergeZapTime uint64
|
||||
TotMemMergeSegments uint64
|
||||
TotMemorySegmentsAtRoot uint64
|
||||
}
|
||||
|
||||
func (s *Stats) statsMap() (map[string]interface{}, error) {
|
||||
// atomically populates the returned map
|
||||
func (s *Stats) ToMap() map[string]interface{} {
|
||||
m := map[string]interface{}{}
|
||||
m["updates"] = atomic.LoadUint64(&s.updates)
|
||||
m["deletes"] = atomic.LoadUint64(&s.deletes)
|
||||
m["batches"] = atomic.LoadUint64(&s.batches)
|
||||
m["errors"] = atomic.LoadUint64(&s.errors)
|
||||
m["analysis_time"] = atomic.LoadUint64(&s.analysisTime)
|
||||
m["index_time"] = atomic.LoadUint64(&s.indexTime)
|
||||
m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted)
|
||||
m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished)
|
||||
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed)
|
||||
m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced)
|
||||
m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted)
|
||||
|
||||
if s.i.path != "" {
|
||||
finfos, err := ioutil.ReadDir(s.i.path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var numFilesOnDisk, numBytesUsedDisk uint64
|
||||
|
||||
for _, finfo := range finfos {
|
||||
if !finfo.IsDir() {
|
||||
numBytesUsedDisk += uint64(finfo.Size())
|
||||
numFilesOnDisk++
|
||||
sve := reflect.ValueOf(s).Elem()
|
||||
svet := sve.Type()
|
||||
for i := 0; i < svet.NumField(); i++ {
|
||||
svef := sve.Field(i)
|
||||
if svef.CanAddr() {
|
||||
svefp := svef.Addr().Interface()
|
||||
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64))
|
||||
}
|
||||
}
|
||||
|
||||
m["num_bytes_used_disk"] = numBytesUsedDisk
|
||||
m["num_files_on_disk"] = numFilesOnDisk
|
||||
return m
|
||||
}
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
// MarshalJSON implements json.Marshaler
|
||||
// MarshalJSON implements json.Marshaler, and in contrast to standard
|
||||
// json marshaling provides atomic safety
|
||||
func (s *Stats) MarshalJSON() ([]byte, error) {
|
||||
m, err := s.statsMap()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return json.Marshal(m)
|
||||
return json.Marshal(s.ToMap())
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ package boltdb
|
|||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type Iterator struct {
|
||||
|
|
|
@ -16,7 +16,7 @@ package boltdb
|
|||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
type Reader struct {
|
||||
|
|
|
@ -30,7 +30,7 @@ import (
|
|||
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/boltdb/bolt"
|
||||
bolt "github.com/etcd-io/bbolt"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore,
|
|||
bo.ReadOnly = ro
|
||||
}
|
||||
|
||||
if initialMmapSize, ok := config["initialMmapSize"].(int); ok {
|
||||
bo.InitialMmapSize = initialMmapSize
|
||||
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok {
|
||||
bo.InitialMmapSize = int(initialMmapSize)
|
||||
}
|
||||
|
||||
db, err := bolt.Open(path, 0600, bo)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
|
|
@ -15,11 +15,20 @@
|
|||
package upsidedown
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
)
|
||||
|
||||
var reflectStaticSizeIndexReader int
|
||||
|
||||
func init() {
|
||||
var ir IndexReader
|
||||
reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size())
|
||||
}
|
||||
|
||||
type IndexReader struct {
|
||||
index *UpsideDownCouch
|
||||
kvreader store.KVReader
|
||||
|
@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte {
|
|||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) {
|
||||
return &DocValueReader{i: i, fields: fields}, nil
|
||||
}
|
||||
|
||||
type DocValueReader struct {
|
||||
i *IndexReader
|
||||
fields []string
|
||||
}
|
||||
|
||||
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||
visitor index.DocumentFieldTermVisitor) error {
|
||||
return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor)
|
||||
}
|
||||
|
|
|
@ -16,13 +16,27 @@ package upsidedown
|
|||
|
||||
import (
|
||||
"bytes"
|
||||
"reflect"
|
||||
"sort"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/index/store"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeUpsideDownCouchTermFieldReader int
|
||||
var reflectStaticSizeUpsideDownCouchDocIDReader int
|
||||
|
||||
func init() {
|
||||
var tfr UpsideDownCouchTermFieldReader
|
||||
reflectStaticSizeUpsideDownCouchTermFieldReader =
|
||||
int(reflect.TypeOf(tfr).Size())
|
||||
var cdr UpsideDownCouchDocIDReader
|
||||
reflectStaticSizeUpsideDownCouchDocIDReader =
|
||||
int(reflect.TypeOf(cdr).Size())
|
||||
}
|
||||
|
||||
type UpsideDownCouchTermFieldReader struct {
|
||||
count uint64
|
||||
indexReader *IndexReader
|
||||
|
@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct {
|
|||
includeTermVectors bool
|
||||
}
|
||||
|
||||
func (r *UpsideDownCouchTermFieldReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr +
|
||||
len(r.term) +
|
||||
r.tfrPrealloc.Size() +
|
||||
len(r.keyBuf)
|
||||
|
||||
if r.tfrNext != nil {
|
||||
sizeInBytes += r.tfrNext.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
|
||||
bufNeeded := termFrequencyRowKeySize(term, nil)
|
||||
if bufNeeded < dictionaryRowKeySize(term) {
|
||||
|
@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct {
|
|||
onlyMode bool
|
||||
}
|
||||
|
||||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
||||
func (r *UpsideDownCouchDocIDReader) Size() int {
|
||||
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader +
|
||||
reflectStaticSizeIndexReader + size.SizeOfPtr
|
||||
|
||||
for _, entry := range r.only {
|
||||
sizeInBytes += size.SizeOfString + len(entry)
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
||||
startBytes := []byte{0x0}
|
||||
endBytes := []byte{0xff}
|
||||
|
||||
|
|
|
@ -20,10 +20,22 @@ import (
|
|||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/size"
|
||||
"github.com/golang/protobuf/proto"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermFrequencyRow int
|
||||
var reflectStaticSizeTermVector int
|
||||
|
||||
func init() {
|
||||
var tfr TermFrequencyRow
|
||||
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
|
||||
var tv TermVector
|
||||
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
|
||||
}
|
||||
|
||||
const ByteSeparator byte = 0xff
|
||||
|
||||
type UpsideDownCouchRowStream chan UpsideDownCouchRow
|
||||
|
@ -358,6 +370,11 @@ type TermVector struct {
|
|||
end uint64
|
||||
}
|
||||
|
||||
func (tv *TermVector) Size() int {
|
||||
return reflectStaticSizeTermVector + size.SizeOfPtr +
|
||||
len(tv.arrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
func (tv *TermVector) String() string {
|
||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
||||
}
|
||||
|
@ -371,6 +388,18 @@ type TermFrequencyRow struct {
|
|||
field uint16
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermFrequencyRow +
|
||||
len(tfr.term) +
|
||||
len(tfr.doc)
|
||||
|
||||
for _, entry := range tfr.vectors {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (tfr *TermFrequencyRow) Term() []byte {
|
||||
return tfr.term
|
||||
}
|
||||
|
@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error {
|
|||
|
||||
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
|
||||
tfr.doc = key[3+len(term)+1:]
|
||||
if len(tfr.doc) <= 0 {
|
||||
if len(tfr.doc) == 0 {
|
||||
return fmt.Errorf("invalid term frequency key, empty docid")
|
||||
}
|
||||
|
||||
|
|
|
@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
|
|||
}
|
||||
|
||||
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
||||
if len(in) <= 0 {
|
||||
if len(in) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -810,6 +810,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
|||
}
|
||||
}
|
||||
|
||||
if len(batch.IndexOps) > 0 {
|
||||
go func() {
|
||||
for _, doc := range batch.IndexOps {
|
||||
if doc != nil {
|
||||
|
@ -819,6 +820,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
|||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// retrieve back index rows concurrent with analysis
|
||||
docBackIndexRowErr := error(nil)
|
||||
|
@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
|||
} else {
|
||||
atomic.AddUint64(&udc.stats.errors, 1)
|
||||
}
|
||||
|
||||
persistedCallback := batch.PersistedCallback()
|
||||
if persistedCallback != nil {
|
||||
persistedCallback(err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
|
|
@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest {
|
|||
Explain: req.Explain,
|
||||
Sort: req.Sort.Copy(),
|
||||
IncludeLocations: req.IncludeLocations,
|
||||
Score: req.Score,
|
||||
}
|
||||
return &rv
|
||||
}
|
||||
|
|
|
@ -50,6 +50,12 @@ const storePath = "store"
|
|||
|
||||
var mappingInternalKey = []byte("_mapping")
|
||||
|
||||
const SearchQueryStartCallbackKey = "_search_query_start_callback_key"
|
||||
const SearchQueryEndCallbackKey = "_search_query_end_callback_key"
|
||||
|
||||
type SearchQueryStartCallbackFn func(size uint64) error
|
||||
type SearchQueryEndCallbackFn func(size uint64) error
|
||||
|
||||
func indexStorePath(path string) string {
|
||||
return path + string(os.PathSeparator) + storePath
|
||||
}
|
||||
|
@ -362,6 +368,68 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) {
|
|||
return i.SearchInContext(context.Background(), req)
|
||||
}
|
||||
|
||||
var documentMatchEmptySize int
|
||||
var searchContextEmptySize int
|
||||
var facetResultEmptySize int
|
||||
var documentEmptySize int
|
||||
|
||||
func init() {
|
||||
var dm search.DocumentMatch
|
||||
documentMatchEmptySize = dm.Size()
|
||||
|
||||
var sc search.SearchContext
|
||||
searchContextEmptySize = sc.Size()
|
||||
|
||||
var fr search.FacetResult
|
||||
facetResultEmptySize = fr.Size()
|
||||
|
||||
var d document.Document
|
||||
documentEmptySize = d.Size()
|
||||
}
|
||||
|
||||
// memNeededForSearch is a helper function that returns an estimate of RAM
|
||||
// needed to execute a search request.
|
||||
func memNeededForSearch(req *SearchRequest,
|
||||
searcher search.Searcher,
|
||||
topnCollector *collector.TopNCollector) uint64 {
|
||||
|
||||
backingSize := req.Size + req.From + 1
|
||||
if req.Size+req.From > collector.PreAllocSizeSkipCap {
|
||||
backingSize = collector.PreAllocSizeSkipCap + 1
|
||||
}
|
||||
numDocMatches := backingSize + searcher.DocumentMatchPoolSize()
|
||||
|
||||
estimate := 0
|
||||
|
||||
// overhead, size in bytes from collector
|
||||
estimate += topnCollector.Size()
|
||||
|
||||
// pre-allocing DocumentMatchPool
|
||||
estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize
|
||||
|
||||
// searcher overhead
|
||||
estimate += searcher.Size()
|
||||
|
||||
// overhead from results, lowestMatchOutsideResults
|
||||
estimate += (numDocMatches + 1) * documentMatchEmptySize
|
||||
|
||||
// additional overhead from SearchResult
|
||||
estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus
|
||||
|
||||
// overhead from facet results
|
||||
if req.Facets != nil {
|
||||
estimate += len(req.Facets) * facetResultEmptySize
|
||||
}
|
||||
|
||||
// highlighting, store
|
||||
if len(req.Fields) > 0 || req.Highlight != nil {
|
||||
// Size + From => number of hits
|
||||
estimate += (req.Size + req.From) * documentEmptySize
|
||||
}
|
||||
|
||||
return uint64(estimate)
|
||||
}
|
||||
|
||||
// SearchInContext executes a search request operation within the provided
|
||||
// Context. Returns a SearchResult object or an error.
|
||||
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
|
||||
|
@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
|||
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{
|
||||
Explain: req.Explain,
|
||||
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil,
|
||||
Score: req.Score,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
|||
collector.SetFacetsBuilder(facetsBuilder)
|
||||
}
|
||||
|
||||
memNeeded := memNeededForSearch(req, searcher, collector)
|
||||
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil {
|
||||
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok {
|
||||
err = cbF(memNeeded)
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil {
|
||||
if cbF, ok := cb.(SearchQueryEndCallbackFn); ok {
|
||||
defer func() {
|
||||
_ = cbF(memNeeded)
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
||||
err = collector.Collect(ctx, searcher, indexReader)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
|||
doc, err := indexReader.Document(hit.ID)
|
||||
if err == nil && doc != nil {
|
||||
if len(req.Fields) > 0 {
|
||||
for _, f := range req.Fields {
|
||||
fieldsToLoad := deDuplicate(req.Fields)
|
||||
for _, f := range fieldsToLoad {
|
||||
for _, docF := range doc.Fields {
|
||||
if f == "*" || docF.Name() == f {
|
||||
var value interface{}
|
||||
|
@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
|||
return &SearchResult{
|
||||
Status: &SearchStatus{
|
||||
Total: 1,
|
||||
Failed: 0,
|
||||
Successful: 1,
|
||||
Errors: make(map[string]error),
|
||||
},
|
||||
Request: req,
|
||||
Hits: hits,
|
||||
|
@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error {
|
|||
}
|
||||
return f.indexReader.Close()
|
||||
}
|
||||
|
||||
// helper function to remove duplicate entries from slice of strings
|
||||
func deDuplicate(fields []string) []string {
|
||||
entries := make(map[string]struct{})
|
||||
ret := []string{}
|
||||
for _, entry := range fields {
|
||||
if _, exists := entries[entry]; !exists {
|
||||
entries[entry] = struct{}{}
|
||||
ret = append(ret, entry)
|
||||
}
|
||||
}
|
||||
return ret
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/blevesearch/bleve/index/upsidedown"
|
||||
)
|
||||
|
@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) {
|
|||
}
|
||||
|
||||
func indexMetaPath(path string) string {
|
||||
return path + string(os.PathSeparator) + metaFilename
|
||||
return filepath.Join(path, metaFilename)
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ type DocumentMapping struct {
|
|||
Dynamic bool `json:"dynamic"`
|
||||
Properties map[string]*DocumentMapping `json:"properties,omitempty"`
|
||||
Fields []*FieldMapping `json:"fields,omitempty"`
|
||||
DefaultAnalyzer string `json:"default_analyzer"`
|
||||
DefaultAnalyzer string `json:"default_analyzer,omitempty"`
|
||||
|
||||
// StructTagKey overrides "json" when looking for field names in struct tags
|
||||
StructTagKey string `json:"struct_tag_key,omitempty"`
|
||||
|
@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
|
|||
}
|
||||
|
||||
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
|
||||
// allow default "json" tag to be overriden
|
||||
// allow default "json" tag to be overridden
|
||||
structTagKey := dm.StructTagKey
|
||||
if structTagKey == "" {
|
||||
structTagKey = "json"
|
||||
}
|
||||
|
||||
val := reflect.ValueOf(data)
|
||||
if !val.IsValid() {
|
||||
return
|
||||
}
|
||||
|
||||
typ := val.Type()
|
||||
switch typ.Kind() {
|
||||
case reflect.Map:
|
||||
|
@ -420,8 +424,12 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
|
|||
if subDocMapping != nil {
|
||||
// index by explicit mapping
|
||||
for _, fieldMapping := range subDocMapping.Fields {
|
||||
if fieldMapping.Type == "geopoint" {
|
||||
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
|
||||
} else {
|
||||
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
|
||||
}
|
||||
}
|
||||
} else if closestDocMapping.Dynamic {
|
||||
// automatic indexing behavior
|
||||
|
||||
|
|
|
@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string {
|
|||
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
|
||||
docType := im.determineType(data)
|
||||
docMapping := im.mappingForType(docType)
|
||||
walkContext := im.newWalkContext(doc, docMapping)
|
||||
if docMapping.Enabled {
|
||||
walkContext := im.newWalkContext(doc, docMapping)
|
||||
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
|
||||
|
||||
// see if the _all field was disabled
|
||||
|
|
|
@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} {
|
|||
|
||||
func lookupPropertyPathPart(data interface{}, part string) interface{} {
|
||||
val := reflect.ValueOf(data)
|
||||
if !val.IsValid() {
|
||||
return nil
|
||||
}
|
||||
typ := val.Type()
|
||||
switch typ.Kind() {
|
||||
case reflect.Map:
|
||||
|
|
|
@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16}
|
|||
|
||||
// Interleave the first 32 bits of each uint64
|
||||
// apdated from org.apache.lucene.util.BitUtil
|
||||
// whcih was adapted from:
|
||||
// which was adapted from:
|
||||
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
|
||||
func Interleave(v1, v2 uint64) uint64 {
|
||||
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4]
|
||||
|
|
|
@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) {
|
|||
}
|
||||
|
||||
func ValidPrefixCodedTerm(p string) (bool, int) {
|
||||
return ValidPrefixCodedTermBytes([]byte(p))
|
||||
}
|
||||
|
||||
func ValidPrefixCodedTermBytes(p []byte) (bool, int) {
|
||||
if len(p) > 0 {
|
||||
if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 {
|
||||
return false, 0
|
||||
|
|
|
@ -17,15 +17,29 @@ package bleve
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/analysis"
|
||||
"github.com/blevesearch/bleve/analysis/datetime/optional"
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/registry"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/collector"
|
||||
"github.com/blevesearch/bleve/search/query"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeSearchResult int
|
||||
var reflectStaticSizeSearchStatus int
|
||||
|
||||
func init() {
|
||||
var sr SearchResult
|
||||
reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size())
|
||||
var ss SearchStatus
|
||||
reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size())
|
||||
}
|
||||
|
||||
var cache = registry.NewCache()
|
||||
|
||||
const defaultDateTimeParser = optional.Name
|
||||
|
@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) {
|
|||
// Explain triggers inclusion of additional search
|
||||
// result score explanations.
|
||||
// Sort describes the desired order for the results to be returned.
|
||||
// Score controls the kind of scoring performed
|
||||
//
|
||||
// A special field named "*" can be used to return all fields.
|
||||
type SearchRequest struct {
|
||||
|
@ -259,6 +274,7 @@ type SearchRequest struct {
|
|||
Explain bool `json:"explain"`
|
||||
Sort search.SortOrder `json:"sort"`
|
||||
IncludeLocations bool `json:"includeLocations"`
|
||||
Score string `json:"score,omitempty"`
|
||||
}
|
||||
|
||||
func (r *SearchRequest) Validate() error {
|
||||
|
@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
|
|||
Explain bool `json:"explain"`
|
||||
Sort []json.RawMessage `json:"sort"`
|
||||
IncludeLocations bool `json:"includeLocations"`
|
||||
Score string `json:"score"`
|
||||
}
|
||||
|
||||
err := json.Unmarshal(input, &temp)
|
||||
|
@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
|
|||
r.Fields = temp.Fields
|
||||
r.Facets = temp.Facets
|
||||
r.IncludeLocations = temp.IncludeLocations
|
||||
r.Score = temp.Score
|
||||
r.Query, err = query.ParseQuery(temp.Q)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -432,6 +450,24 @@ type SearchResult struct {
|
|||
Facets search.FacetResults `json:"facets"`
|
||||
}
|
||||
|
||||
func (sr *SearchResult) Size() int {
|
||||
sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr +
|
||||
reflectStaticSizeSearchStatus
|
||||
|
||||
for _, entry := range sr.Hits {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
for k, v := range sr.Facets {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
v.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (sr *SearchResult) String() string {
|
||||
rv := ""
|
||||
if sr.Total > 0 {
|
||||
|
@ -488,3 +524,44 @@ func (sr *SearchResult) Merge(other *SearchResult) {
|
|||
|
||||
sr.Facets.Merge(other.Facets)
|
||||
}
|
||||
|
||||
// MemoryNeededForSearchResult is an exported helper function to determine the RAM
|
||||
// needed to accommodate the results for a given search request.
|
||||
func MemoryNeededForSearchResult(req *SearchRequest) uint64 {
|
||||
if req == nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
numDocMatches := req.Size + req.From
|
||||
if req.Size+req.From > collector.PreAllocSizeSkipCap {
|
||||
numDocMatches = collector.PreAllocSizeSkipCap
|
||||
}
|
||||
|
||||
estimate := 0
|
||||
|
||||
// overhead from the SearchResult structure
|
||||
var sr SearchResult
|
||||
estimate += sr.Size()
|
||||
|
||||
var dm search.DocumentMatch
|
||||
sizeOfDocumentMatch := dm.Size()
|
||||
|
||||
// overhead from results
|
||||
estimate += numDocMatches * sizeOfDocumentMatch
|
||||
|
||||
// overhead from facet results
|
||||
if req.Facets != nil {
|
||||
var fr search.FacetResult
|
||||
estimate += len(req.Facets) * fr.Size()
|
||||
}
|
||||
|
||||
// highlighting, store
|
||||
var d document.Document
|
||||
if len(req.Fields) > 0 || req.Highlight != nil {
|
||||
for i := 0; i < (req.Size + req.From); i++ {
|
||||
estimate += (req.Size + req.From) * d.Size()
|
||||
}
|
||||
}
|
||||
|
||||
return uint64(estimate)
|
||||
}
|
||||
|
|
|
@ -30,3 +30,23 @@ type Collector interface {
|
|||
SetFacetsBuilder(facetsBuilder *FacetsBuilder)
|
||||
FacetResults() FacetResults
|
||||
}
|
||||
|
||||
// DocumentMatchHandler is the type of document match callback
|
||||
// bleve will invoke during the search.
|
||||
// Eventually, bleve will indicate the completion of an ongoing search,
|
||||
// by passing a nil value for the document match callback.
|
||||
// The application should take a copy of the hit/documentMatch
|
||||
// if it wish to own it or need prolonged access to it.
|
||||
type DocumentMatchHandler func(hit *DocumentMatch) error
|
||||
|
||||
type MakeDocumentMatchHandlerKeyType string
|
||||
|
||||
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
|
||||
"MakeDocumentMatchHandlerKey")
|
||||
|
||||
// MakeDocumentMatchHandler is an optional DocumentMatchHandler
|
||||
// builder function which the applications can pass to bleve.
|
||||
// These builder methods gives a DocumentMatchHandler function
|
||||
// to bleve, which it will invoke on every document matches.
|
||||
type MakeDocumentMatchHandler func(ctx *SearchContext) (
|
||||
callback DocumentMatchHandler, loadID bool, err error)
|
||||
|
|
|
@ -25,9 +25,9 @@ type collectStoreHeap struct {
|
|||
compare collectorCompare
|
||||
}
|
||||
|
||||
func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap {
|
||||
func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap {
|
||||
rv := &collectStoreHeap{
|
||||
heap: make(search.DocumentMatchCollection, 0, cap),
|
||||
heap: make(search.DocumentMatchCollection, 0, capacity),
|
||||
compare: compare,
|
||||
}
|
||||
heap.Init(rv)
|
||||
|
|
|
@ -25,7 +25,7 @@ type collectStoreList struct {
|
|||
compare collectorCompare
|
||||
}
|
||||
|
||||
func newStoreList(cap int, compare collectorCompare) *collectStoreList {
|
||||
func newStoreList(capacity int, compare collectorCompare) *collectStoreList {
|
||||
rv := &collectStoreList{
|
||||
results: list.New(),
|
||||
compare: compare,
|
||||
|
@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList {
|
|||
return rv
|
||||
}
|
||||
|
||||
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch,
|
||||
size int) *search.DocumentMatch {
|
||||
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch {
|
||||
c.add(doc)
|
||||
if c.len() > size {
|
||||
return c.removeLast()
|
||||
|
|
|
@ -21,9 +21,9 @@ type collectStoreSlice struct {
|
|||
compare collectorCompare
|
||||
}
|
||||
|
||||
func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice {
|
||||
func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice {
|
||||
rv := &collectStoreSlice{
|
||||
slice: make(search.DocumentMatchCollection, 0, cap),
|
||||
slice: make(search.DocumentMatchCollection, 0, capacity),
|
||||
compare: compare,
|
||||
}
|
||||
return rv
|
||||
|
|
|
@ -16,12 +16,21 @@ package collector
|
|||
|
||||
import (
|
||||
"context"
|
||||
"reflect"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTopNCollector int
|
||||
|
||||
func init() {
|
||||
var coll TopNCollector
|
||||
reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size())
|
||||
}
|
||||
|
||||
type collectorStore interface {
|
||||
// Add the document, and if the new store size exceeds the provided size
|
||||
// the last element is removed and returned. If the size has not been
|
||||
|
@ -58,6 +67,8 @@ type TopNCollector struct {
|
|||
cachedDesc []bool
|
||||
|
||||
lowestMatchOutsideResults *search.DocumentMatch
|
||||
updateFieldVisitor index.DocumentFieldTermVisitor
|
||||
dvReader index.DocValueReader
|
||||
}
|
||||
|
||||
// CheckDoneEvery controls how frequently we check the context deadline
|
||||
|
@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
|
|||
return hc
|
||||
}
|
||||
|
||||
func (hc *TopNCollector) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr
|
||||
|
||||
if hc.facetsBuilder != nil {
|
||||
sizeInBytes += hc.facetsBuilder.Size()
|
||||
}
|
||||
|
||||
for _, entry := range hc.neededFields {
|
||||
sizeInBytes += len(entry) + size.SizeOfString
|
||||
}
|
||||
|
||||
sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc)
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
// Collect goes to the index to find the matching documents
|
||||
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
|
||||
startTime := time.Now()
|
||||
|
@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
|||
}
|
||||
searchContext := &search.SearchContext{
|
||||
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
|
||||
Collector: hc,
|
||||
}
|
||||
|
||||
hc.dvReader, err = reader.DocValueReader(hc.neededFields)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hc.updateFieldVisitor = func(field string, term []byte) {
|
||||
if hc.facetsBuilder != nil {
|
||||
hc.facetsBuilder.UpdateVisitor(field, term)
|
||||
}
|
||||
hc.sort.UpdateVisitor(field, term)
|
||||
}
|
||||
|
||||
dmHandlerMaker := MakeTopNDocumentMatchHandler
|
||||
if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
|
||||
dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
|
||||
}
|
||||
// use the application given builder for making the custom document match
|
||||
// handler and perform callbacks/invocations on the newly made handler.
|
||||
dmHandler, loadID, err := dmHandlerMaker(searchContext)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
hc.needDocIds = hc.needDocIds || loadID
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
|
@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
|||
}
|
||||
}
|
||||
|
||||
err = hc.collectSingle(searchContext, reader, next)
|
||||
err = hc.prepareDocumentMatch(searchContext, reader, next)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
err = dmHandler(next)
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
|
||||
next, err = searcher.Next(searchContext)
|
||||
}
|
||||
|
||||
// help finalize/flush the results in case
|
||||
// of custom document match handlers.
|
||||
err = dmHandler(nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// compute search duration
|
||||
hc.took = time.Since(startTime)
|
||||
if err != nil {
|
||||
|
@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
|||
|
||||
var sortByScoreOpt = []string{"_score"}
|
||||
|
||||
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error {
|
||||
var err error
|
||||
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
|
||||
reader index.IndexReader, d *search.DocumentMatch) (err error) {
|
||||
|
||||
// visit field terms for features that require it (sort, facets)
|
||||
if len(hc.neededFields) > 0 {
|
||||
|
@ -187,11 +253,24 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
|||
hc.sort.Value(d)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func MakeTopNDocumentMatchHandler(
|
||||
ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
|
||||
var hc *TopNCollector
|
||||
var ok bool
|
||||
if hc, ok = ctx.Collector.(*TopNCollector); ok {
|
||||
return func(d *search.DocumentMatch) error {
|
||||
if d == nil {
|
||||
return nil
|
||||
}
|
||||
// optimization, we track lowest sorting hit already removed from heap
|
||||
// with this one comparison, we can avoid all heap operations if
|
||||
// this hit would have been added and then immediately removed
|
||||
if hc.lowestMatchOutsideResults != nil {
|
||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults)
|
||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
|
||||
hc.lowestMatchOutsideResults)
|
||||
if cmp >= 0 {
|
||||
// this hit can't possibly be in the result set, so avoid heap ops
|
||||
ctx.DocumentMatchPool.Put(d)
|
||||
|
@ -204,7 +283,8 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
|||
if hc.lowestMatchOutsideResults == nil {
|
||||
hc.lowestMatchOutsideResults = removed
|
||||
} else {
|
||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults)
|
||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
|
||||
removed, hc.lowestMatchOutsideResults)
|
||||
if cmp < 0 {
|
||||
tmp := hc.lowestMatchOutsideResults
|
||||
hc.lowestMatchOutsideResults = removed
|
||||
|
@ -212,8 +292,10 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}, false, nil
|
||||
}
|
||||
return nil, false, nil
|
||||
}
|
||||
|
||||
// visitFieldTerms is responsible for visiting the field terms of the
|
||||
|
@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc
|
|||
hc.facetsBuilder.StartDoc()
|
||||
}
|
||||
|
||||
err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) {
|
||||
if hc.facetsBuilder != nil {
|
||||
hc.facetsBuilder.UpdateVisitor(field, term)
|
||||
}
|
||||
hc.sort.UpdateVisitor(field, term)
|
||||
})
|
||||
|
||||
err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
|
||||
if hc.facetsBuilder != nil {
|
||||
hc.facetsBuilder.EndDoc()
|
||||
}
|
||||
|
@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
|
|||
return err
|
||||
}
|
||||
}
|
||||
doc.Complete(nil)
|
||||
return nil
|
||||
})
|
||||
|
||||
|
@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults {
|
|||
if hc.facetsBuilder != nil {
|
||||
return hc.facetsBuilder.Results()
|
||||
}
|
||||
return search.FacetResults{}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -17,8 +17,18 @@ package search
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeExplanation int
|
||||
|
||||
func init() {
|
||||
var e Explanation
|
||||
reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size())
|
||||
}
|
||||
|
||||
type Explanation struct {
|
||||
Value float64 `json:"value"`
|
||||
Message string `json:"message"`
|
||||
|
@ -32,3 +42,14 @@ func (expl *Explanation) String() string {
|
|||
}
|
||||
return string(js)
|
||||
}
|
||||
|
||||
func (expl *Explanation) Size() int {
|
||||
sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr +
|
||||
len(expl.Message)
|
||||
|
||||
for _, entry := range expl.Children {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
|
|
@ -15,13 +15,25 @@
|
|||
package facet
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
"github.com/blevesearch/bleve/numeric"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDateTimeFacetBuilder int
|
||||
var reflectStaticSizedateTimeRange int
|
||||
|
||||
func init() {
|
||||
var dtfb DateTimeFacetBuilder
|
||||
reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size())
|
||||
var dtr dateTimeRange
|
||||
reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size())
|
||||
}
|
||||
|
||||
type dateTimeRange struct {
|
||||
start time.Time
|
||||
end time.Time
|
||||
|
@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
func (fb *DateTimeFacetBuilder) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr +
|
||||
len(fb.field)
|
||||
|
||||
for k, _ := range fb.termsCount {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfInt
|
||||
}
|
||||
|
||||
for k, _ := range fb.ranges {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfPtr + reflectStaticSizedateTimeRange
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
|
||||
r := dateTimeRange{
|
||||
start: start,
|
||||
|
|
|
@ -15,12 +15,24 @@
|
|||
package facet
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/numeric"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeNumericFacetBuilder int
|
||||
var reflectStaticSizenumericRange int
|
||||
|
||||
func init() {
|
||||
var nfb NumericFacetBuilder
|
||||
reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size())
|
||||
var nr numericRange
|
||||
reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size())
|
||||
}
|
||||
|
||||
type numericRange struct {
|
||||
min *float64
|
||||
max *float64
|
||||
|
@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
func (fb *NumericFacetBuilder) Size() int {
|
||||
sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr +
|
||||
len(fb.field)
|
||||
|
||||
for k, _ := range fb.termsCount {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfInt
|
||||
}
|
||||
|
||||
for k, _ := range fb.ranges {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfPtr + reflectStaticSizenumericRange
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) {
|
||||
r := numericRange{
|
||||
min: min,
|
||||
|
|
|
@ -15,11 +15,20 @@
|
|||
package facet
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermsFacetBuilder int
|
||||
|
||||
func init() {
|
||||
var tfb TermsFacetBuilder
|
||||
reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size())
|
||||
}
|
||||
|
||||
type TermsFacetBuilder struct {
|
||||
size int
|
||||
field string
|
||||
|
@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
func (fb *TermsFacetBuilder) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr +
|
||||
len(fb.field)
|
||||
|
||||
for k, _ := range fb.termsCount {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfInt
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (fb *TermsFacetBuilder) Field() string {
|
||||
return fb.field
|
||||
}
|
||||
|
|
|
@ -15,11 +15,32 @@
|
|||
package search
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeFacetsBuilder int
|
||||
var reflectStaticSizeFacetResult int
|
||||
var reflectStaticSizeTermFacet int
|
||||
var reflectStaticSizeNumericRangeFacet int
|
||||
var reflectStaticSizeDateRangeFacet int
|
||||
|
||||
func init() {
|
||||
var fb FacetsBuilder
|
||||
reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size())
|
||||
var fr FacetResult
|
||||
reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size())
|
||||
var tf TermFacet
|
||||
reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size())
|
||||
var nrf NumericRangeFacet
|
||||
reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size())
|
||||
var drf DateRangeFacet
|
||||
reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size())
|
||||
}
|
||||
|
||||
type FacetBuilder interface {
|
||||
StartDoc()
|
||||
UpdateVisitor(field string, term []byte)
|
||||
|
@ -27,23 +48,40 @@ type FacetBuilder interface {
|
|||
|
||||
Result() *FacetResult
|
||||
Field() string
|
||||
|
||||
Size() int
|
||||
}
|
||||
|
||||
type FacetsBuilder struct {
|
||||
indexReader index.IndexReader
|
||||
facets map[string]FacetBuilder
|
||||
facetNames []string
|
||||
facets []FacetBuilder
|
||||
fields []string
|
||||
}
|
||||
|
||||
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
|
||||
return &FacetsBuilder{
|
||||
indexReader: indexReader,
|
||||
facets: make(map[string]FacetBuilder, 0),
|
||||
}
|
||||
}
|
||||
|
||||
func (fb *FacetsBuilder) Size() int {
|
||||
sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr
|
||||
|
||||
for k, v := range fb.facets {
|
||||
sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k])
|
||||
}
|
||||
|
||||
for _, entry := range fb.fields {
|
||||
sizeInBytes += size.SizeOfString + len(entry)
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
|
||||
fb.facets[name] = facetBuilder
|
||||
fb.facetNames = append(fb.facetNames, name)
|
||||
fb.facets = append(fb.facets, facetBuilder)
|
||||
fb.fields = append(fb.fields, facetBuilder.Field())
|
||||
}
|
||||
|
||||
|
@ -213,6 +251,14 @@ type FacetResult struct {
|
|||
DateRanges DateRangeFacets `json:"date_ranges,omitempty"`
|
||||
}
|
||||
|
||||
func (fr *FacetResult) Size() int {
|
||||
return reflectStaticSizeFacetResult + size.SizeOfPtr +
|
||||
len(fr.Field) +
|
||||
len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) +
|
||||
len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) +
|
||||
len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr)
|
||||
}
|
||||
|
||||
func (fr *FacetResult) Merge(other *FacetResult) {
|
||||
fr.Total += other.Total
|
||||
fr.Missing += other.Missing
|
||||
|
@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) {
|
|||
|
||||
func (fb *FacetsBuilder) Results() FacetResults {
|
||||
fr := make(FacetResults)
|
||||
for facetName, facetBuilder := range fb.facets {
|
||||
for i, facetBuilder := range fb.facets {
|
||||
facetResult := facetBuilder.Result()
|
||||
fr[facetName] = facetResult
|
||||
fr[fb.facetNames[i]] = facetResult
|
||||
}
|
||||
return fr
|
||||
}
|
||||
|
|
|
@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int {
|
|||
// in which case the first return val will be the max
|
||||
// and the second will be true, indicating max was exceeded
|
||||
func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
|
||||
v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil)
|
||||
return v, wasMax
|
||||
}
|
||||
|
||||
func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) {
|
||||
la := len(a)
|
||||
lb := len(b)
|
||||
|
||||
ld := int(math.Abs(float64(la - lb)))
|
||||
if ld > max {
|
||||
return max, true
|
||||
return max, true, d
|
||||
}
|
||||
|
||||
d := make([]int, la+1)
|
||||
if cap(d) < la+1 {
|
||||
d = make([]int, la+1)
|
||||
}
|
||||
d = d[:la+1]
|
||||
|
||||
var lastdiag, olddiag, temp int
|
||||
|
||||
for i := 1; i <= la; i++ {
|
||||
|
@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
|
|||
}
|
||||
// after each row if rowmin isn't less than max stop
|
||||
if rowmin > max {
|
||||
return max, true
|
||||
return max, true, d
|
||||
}
|
||||
}
|
||||
return d[la], false
|
||||
return d[la], false, d
|
||||
}
|
||||
|
|
|
@ -14,6 +14,17 @@
|
|||
|
||||
package search
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDocumentMatchPool int
|
||||
|
||||
func init() {
|
||||
var dmp DocumentMatchPool
|
||||
reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size())
|
||||
}
|
||||
|
||||
// DocumentMatchPoolTooSmall is a callback function that can be executed
|
||||
// when the DocumentMatchPool does not have sufficient capacity
|
||||
// By default we just perform just-in-time allocation, but you could log
|
||||
|
|
|
@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
|||
}
|
||||
ss = append(ss, sr)
|
||||
}
|
||||
|
||||
if len(ss) < 1 {
|
||||
return searcher.NewMatchNoneSearcher(i)
|
||||
}
|
||||
|
||||
return searcher.NewConjunctionSearcher(i, ss, options)
|
||||
}
|
||||
|
||||
|
|
|
@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) {
|
|||
q.Min = m
|
||||
}
|
||||
|
||||
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
|
||||
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
||||
options search.SearcherOptions) (search.Searcher, error) {
|
||||
ss := make([]search.Searcher, 0, len(q.Disjuncts))
|
||||
for _, disjunct := range q.Disjuncts {
|
||||
sr, err := disjunct.Searcher(i, m, options)
|
||||
|
@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
|||
}
|
||||
ss = append(ss, sr)
|
||||
}
|
||||
|
||||
if len(ss) < 1 {
|
||||
return searcher.NewMatchNoneSearcher(i)
|
||||
} else if len(ss) == 1 && int(q.Min) == ss[0].Min() {
|
||||
// apply optimization only if both conditions below are satisfied:
|
||||
// - disjunction searcher has only 1 child searcher
|
||||
// - parent searcher's min setting is equal to child searcher's min
|
||||
|
||||
return ss[0], nil
|
||||
}
|
||||
|
||||
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options)
|
||||
}
|
||||
|
||||
|
|
|
@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
|
|||
}
|
||||
|
||||
expand = func(query Query) (Query, error) {
|
||||
switch query.(type) {
|
||||
switch q := query.(type) {
|
||||
case *QueryStringQuery:
|
||||
q := query.(*QueryStringQuery)
|
||||
parsed, err := parseQuerySyntax(q.Query)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err)
|
||||
}
|
||||
return expand(parsed)
|
||||
case *ConjunctionQuery:
|
||||
q := *query.(*ConjunctionQuery)
|
||||
children, err := expandSlice(q.Conjuncts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
q.Conjuncts = children
|
||||
return &q, nil
|
||||
return q, nil
|
||||
case *DisjunctionQuery:
|
||||
q := *query.(*DisjunctionQuery)
|
||||
children, err := expandSlice(q.Disjuncts)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
q.Disjuncts = children
|
||||
return &q, nil
|
||||
return q, nil
|
||||
case *BooleanQuery:
|
||||
q := *query.(*BooleanQuery)
|
||||
var err error
|
||||
q.Must, err = expand(q.Must)
|
||||
if err != nil {
|
||||
|
@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
|
|||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &q, nil
|
||||
return q, nil
|
||||
default:
|
||||
return query, nil
|
||||
}
|
||||
|
|
|
@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
|||
// see where to go
|
||||
if !l.seenDot && next == '.' {
|
||||
// stay in this state
|
||||
l.seenDot = true
|
||||
l.buf += string(next)
|
||||
return inNumOrStrState, true
|
||||
} else if unicode.IsDigit(next) {
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
package query
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
|
@ -28,7 +27,6 @@ type RegexpQuery struct {
|
|||
Regexp string `json:"regexp"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
BoostVal *Boost `json:"boost,omitempty"`
|
||||
compiled *regexp.Regexp
|
||||
}
|
||||
|
||||
// NewRegexpQuery creates a new Query which finds
|
||||
|
@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti
|
|||
if q.FieldVal == "" {
|
||||
field = m.DefaultSearchField()
|
||||
}
|
||||
err := q.compile()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
|
||||
}
|
||||
|
||||
func (q *RegexpQuery) Validate() error {
|
||||
return q.compile()
|
||||
}
|
||||
|
||||
func (q *RegexpQuery) compile() error {
|
||||
if q.compiled == nil {
|
||||
// require that pattern NOT be anchored to start and end of term
|
||||
// require that pattern NOT be anchored to start and end of term.
|
||||
// do not attempt to remove trailing $, its presence is not
|
||||
// known to interfere with LiteralPrefix() the way ^ does
|
||||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
||||
actualRegexp := q.Regexp
|
||||
if strings.HasPrefix(actualRegexp, "^") {
|
||||
actualRegexp = actualRegexp[1:] // remove leading ^
|
||||
}
|
||||
// do not attempt to remove trailing $, it's presence is not
|
||||
// known to interfere with LiteralPrefix() the way ^ does
|
||||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
||||
var err error
|
||||
q.compiled, err = regexp.Compile(actualRegexp)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
return searcher.NewRegexpStringSearcher(i, actualRegexp, field,
|
||||
q.BoostVal.Value(), options)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
func (q *RegexpQuery) Validate() error {
|
||||
return nil // real validation delayed until searcher constructor
|
||||
}
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
package query
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
|
@ -47,7 +46,6 @@ type WildcardQuery struct {
|
|||
Wildcard string `json:"wildcard"`
|
||||
FieldVal string `json:"field,omitempty"`
|
||||
BoostVal *Boost `json:"boost,omitempty"`
|
||||
compiled *regexp.Regexp
|
||||
}
|
||||
|
||||
// NewWildcardQuery creates a new Query which finds
|
||||
|
@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op
|
|||
if q.FieldVal == "" {
|
||||
field = m.DefaultSearchField()
|
||||
}
|
||||
if q.compiled == nil {
|
||||
var err error
|
||||
q.compiled, err = q.convertToRegexp()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
|
||||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
||||
|
||||
return searcher.NewRegexpStringSearcher(i, regexpString, field,
|
||||
q.BoostVal.Value(), options)
|
||||
}
|
||||
|
||||
func (q *WildcardQuery) Validate() error {
|
||||
var err error
|
||||
q.compiled, err = q.convertToRegexp()
|
||||
return err
|
||||
}
|
||||
|
||||
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
|
||||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
||||
return regexp.Compile(regexpString)
|
||||
return nil // real validation delayed until searcher constructor
|
||||
}
|
||||
|
|
|
@ -15,13 +15,27 @@
|
|||
package scorer
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeConjunctionQueryScorer int
|
||||
|
||||
func init() {
|
||||
var cqs ConjunctionQueryScorer
|
||||
reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size())
|
||||
}
|
||||
|
||||
type ConjunctionQueryScorer struct {
|
||||
options search.SearcherOptions
|
||||
}
|
||||
|
||||
func (s *ConjunctionQueryScorer) Size() int {
|
||||
return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr
|
||||
}
|
||||
|
||||
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer {
|
||||
return &ConjunctionQueryScorer{
|
||||
options: options,
|
||||
|
@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
|||
childrenExplanations = make([]*search.Explanation, len(constituents))
|
||||
}
|
||||
|
||||
locations := []search.FieldTermLocationMap{}
|
||||
for i, docMatch := range constituents {
|
||||
sum += docMatch.Score
|
||||
if s.options.Explain {
|
||||
childrenExplanations[i] = docMatch.Expl
|
||||
}
|
||||
if docMatch.Locations != nil {
|
||||
locations = append(locations, docMatch.Locations)
|
||||
}
|
||||
}
|
||||
newScore := sum
|
||||
var newExpl *search.Explanation
|
||||
|
@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
|||
rv := constituents[0]
|
||||
rv.Score = newScore
|
||||
rv.Expl = newExpl
|
||||
if len(locations) == 1 {
|
||||
rv.Locations = locations[0]
|
||||
} else if len(locations) > 1 {
|
||||
rv.Locations = search.MergeLocations(locations)
|
||||
}
|
||||
rv.FieldTermLocations = search.MergeFieldTermLocations(
|
||||
rv.FieldTermLocations, constituents[1:])
|
||||
|
||||
return rv
|
||||
}
|
||||
|
|
|
@ -16,11 +16,20 @@ package scorer
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeConstantScorer int
|
||||
|
||||
func init() {
|
||||
var cs ConstantScorer
|
||||
reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size())
|
||||
}
|
||||
|
||||
type ConstantScorer struct {
|
||||
constant float64
|
||||
boost float64
|
||||
|
@ -30,6 +39,16 @@ type ConstantScorer struct {
|
|||
queryWeightExplanation *search.Explanation
|
||||
}
|
||||
|
||||
func (s *ConstantScorer) Size() int {
|
||||
sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr
|
||||
|
||||
if s.queryWeightExplanation != nil {
|
||||
sizeInBytes += s.queryWeightExplanation.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
|
||||
rv := ConstantScorer{
|
||||
options: options,
|
||||
|
|
|
@ -16,14 +16,27 @@ package scorer
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDisjunctionQueryScorer int
|
||||
|
||||
func init() {
|
||||
var dqs DisjunctionQueryScorer
|
||||
reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size())
|
||||
}
|
||||
|
||||
type DisjunctionQueryScorer struct {
|
||||
options search.SearcherOptions
|
||||
}
|
||||
|
||||
func (s *DisjunctionQueryScorer) Size() int {
|
||||
return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr
|
||||
}
|
||||
|
||||
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
|
||||
return &DisjunctionQueryScorer{
|
||||
options: options,
|
||||
|
@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
|||
childrenExplanations = make([]*search.Explanation, len(constituents))
|
||||
}
|
||||
|
||||
var locations []search.FieldTermLocationMap
|
||||
for i, docMatch := range constituents {
|
||||
sum += docMatch.Score
|
||||
if s.options.Explain {
|
||||
childrenExplanations[i] = docMatch.Expl
|
||||
}
|
||||
if docMatch.Locations != nil {
|
||||
locations = append(locations, docMatch.Locations)
|
||||
}
|
||||
}
|
||||
|
||||
var rawExpl *search.Explanation
|
||||
|
@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
|||
rv := constituents[0]
|
||||
rv.Score = newScore
|
||||
rv.Expl = newExpl
|
||||
if len(locations) == 1 {
|
||||
rv.Locations = locations[0]
|
||||
} else if len(locations) > 1 {
|
||||
rv.Locations = search.MergeLocations(locations)
|
||||
}
|
||||
rv.FieldTermLocations = search.MergeFieldTermLocations(
|
||||
rv.FieldTermLocations, constituents[1:])
|
||||
|
||||
return rv
|
||||
}
|
||||
|
|
|
@ -17,13 +17,22 @@ package scorer
|
|||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeTermQueryScorer int
|
||||
|
||||
func init() {
|
||||
var tqs TermQueryScorer
|
||||
reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size())
|
||||
}
|
||||
|
||||
type TermQueryScorer struct {
|
||||
queryTerm []byte
|
||||
queryTerm string
|
||||
queryField string
|
||||
queryBoost float64
|
||||
docTerm uint64
|
||||
|
@ -36,9 +45,24 @@ type TermQueryScorer struct {
|
|||
queryWeightExplanation *search.Explanation
|
||||
}
|
||||
|
||||
func (s *TermQueryScorer) Size() int {
|
||||
sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr +
|
||||
len(s.queryTerm) + len(s.queryField)
|
||||
|
||||
if s.idfExplanation != nil {
|
||||
sizeInBytes += s.idfExplanation.Size()
|
||||
}
|
||||
|
||||
if s.queryWeightExplanation != nil {
|
||||
sizeInBytes += s.queryWeightExplanation.Size()
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
|
||||
rv := TermQueryScorer{
|
||||
queryTerm: queryTerm,
|
||||
queryTerm: string(queryTerm),
|
||||
queryField: queryField,
|
||||
queryBoost: queryBoost,
|
||||
docTerm: docTerm,
|
||||
|
@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
|
|||
}
|
||||
s.queryWeightExplanation = &search.Explanation{
|
||||
Value: s.queryWeight,
|
||||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost),
|
||||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
|
||||
Children: childrenExplanations,
|
||||
}
|
||||
}
|
||||
|
@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
|||
childrenExplanations := make([]*search.Explanation, 3)
|
||||
childrenExplanations[0] = &search.Explanation{
|
||||
Value: tf,
|
||||
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq),
|
||||
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
|
||||
}
|
||||
childrenExplanations[1] = &search.Explanation{
|
||||
Value: termMatch.Norm,
|
||||
|
@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
|||
childrenExplanations[2] = s.idfExplanation
|
||||
scoreExplanation = &search.Explanation{
|
||||
Value: score,
|
||||
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID),
|
||||
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
|
||||
Children: childrenExplanations,
|
||||
}
|
||||
}
|
||||
|
@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
|||
childExplanations[1] = scoreExplanation
|
||||
scoreExplanation = &search.Explanation{
|
||||
Value: score,
|
||||
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID),
|
||||
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
|
||||
Children: childExplanations,
|
||||
}
|
||||
}
|
||||
|
@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
|||
rv.Expl = scoreExplanation
|
||||
}
|
||||
|
||||
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
|
||||
locs := make([]search.Location, len(termMatch.Vectors))
|
||||
locsUsed := 0
|
||||
|
||||
totalPositions := 0
|
||||
for _, v := range termMatch.Vectors {
|
||||
totalPositions += len(v.ArrayPositions)
|
||||
}
|
||||
positions := make(search.ArrayPositions, totalPositions)
|
||||
positionsUsed := 0
|
||||
|
||||
rv.Locations = make(search.FieldTermLocationMap)
|
||||
for _, v := range termMatch.Vectors {
|
||||
tlm := rv.Locations[v.Field]
|
||||
if tlm == nil {
|
||||
tlm = make(search.TermLocationMap)
|
||||
rv.Locations[v.Field] = tlm
|
||||
if len(termMatch.Vectors) > 0 {
|
||||
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
|
||||
rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
|
||||
}
|
||||
|
||||
loc := &locs[locsUsed]
|
||||
locsUsed++
|
||||
|
||||
loc.Pos = v.Pos
|
||||
loc.Start = v.Start
|
||||
loc.End = v.End
|
||||
|
||||
for _, v := range termMatch.Vectors {
|
||||
var ap search.ArrayPositions
|
||||
if len(v.ArrayPositions) > 0 {
|
||||
loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)]
|
||||
for i, ap := range v.ArrayPositions {
|
||||
loc.ArrayPositions[i] = ap
|
||||
n := len(rv.FieldTermLocations)
|
||||
if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
|
||||
ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
|
||||
}
|
||||
positionsUsed += len(v.ArrayPositions)
|
||||
ap = append(ap, v.ArrayPositions...)
|
||||
}
|
||||
|
||||
tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc)
|
||||
rv.FieldTermLocations =
|
||||
append(rv.FieldTermLocations, search.FieldTermLocation{
|
||||
Field: v.Field,
|
||||
Term: s.queryTerm,
|
||||
Location: search.Location{
|
||||
Pos: v.Pos,
|
||||
Start: v.Start,
|
||||
End: v.End,
|
||||
ArrayPositions: ap,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -16,11 +16,25 @@ package search
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/document"
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDocumentMatch int
|
||||
var reflectStaticSizeSearchContext int
|
||||
var reflectStaticSizeLocation int
|
||||
|
||||
func init() {
|
||||
var dm DocumentMatch
|
||||
reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size())
|
||||
var sc SearchContext
|
||||
reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size())
|
||||
var l Location
|
||||
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
|
||||
}
|
||||
|
||||
type ArrayPositions []uint64
|
||||
|
||||
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
|
||||
|
@ -47,6 +61,11 @@ type Location struct {
|
|||
ArrayPositions ArrayPositions `json:"array_positions"`
|
||||
}
|
||||
|
||||
func (l *Location) Size() int {
|
||||
return reflectStaticSizeLocation + size.SizeOfPtr +
|
||||
len(l.ArrayPositions)*size.SizeOfUint64
|
||||
}
|
||||
|
||||
type Locations []*Location
|
||||
|
||||
type TermLocationMap map[string]Locations
|
||||
|
@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) {
|
|||
|
||||
type FieldTermLocationMap map[string]TermLocationMap
|
||||
|
||||
type FieldTermLocation struct {
|
||||
Field string
|
||||
Term string
|
||||
Location Location
|
||||
}
|
||||
|
||||
type FieldFragmentMap map[string][]string
|
||||
|
||||
type DocumentMatch struct {
|
||||
|
@ -74,11 +99,14 @@ type DocumentMatch struct {
|
|||
// fields as float64s and date fields as time.RFC3339 formatted strings.
|
||||
Fields map[string]interface{} `json:"fields,omitempty"`
|
||||
|
||||
// if we load the document for this hit, remember it so we dont load again
|
||||
Document *document.Document `json:"-"`
|
||||
|
||||
// used to maintain natural index order
|
||||
HitNumber uint64 `json:"-"`
|
||||
|
||||
// used to temporarily hold field term location information during
|
||||
// search processing in an efficient, recycle-friendly manner, to
|
||||
// be later incorporated into the Locations map when search
|
||||
// results are completed
|
||||
FieldTermLocations []FieldTermLocation `json:"-"`
|
||||
}
|
||||
|
||||
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
|
||||
|
@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch {
|
|||
indexInternalID := dm.IndexInternalID
|
||||
// remember the []interface{} used for sort
|
||||
sort := dm.Sort
|
||||
// remember the FieldTermLocations backing array
|
||||
ftls := dm.FieldTermLocations
|
||||
for i := range ftls { // recycle the ArrayPositions of each location
|
||||
ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0]
|
||||
}
|
||||
// idiom to copy over from empty DocumentMatch (0 allocations)
|
||||
*dm = DocumentMatch{}
|
||||
// reuse the []byte already allocated (and reset len to 0)
|
||||
dm.IndexInternalID = indexInternalID[:0]
|
||||
// reuse the []interface{} already allocated (and reset len to 0)
|
||||
dm.Sort = sort[:0]
|
||||
// reuse the FieldTermLocations already allocated (and reset len to 0)
|
||||
dm.FieldTermLocations = ftls[:0]
|
||||
return dm
|
||||
}
|
||||
|
||||
func (dm *DocumentMatch) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr +
|
||||
len(dm.Index) +
|
||||
len(dm.ID) +
|
||||
len(dm.IndexInternalID)
|
||||
|
||||
if dm.Expl != nil {
|
||||
sizeInBytes += dm.Expl.Size()
|
||||
}
|
||||
|
||||
for k, v := range dm.Locations {
|
||||
sizeInBytes += size.SizeOfString + len(k)
|
||||
for k1, v1 := range v {
|
||||
sizeInBytes += size.SizeOfString + len(k1) +
|
||||
size.SizeOfSlice
|
||||
for _, entry := range v1 {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for k, v := range dm.Fragments {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfSlice
|
||||
|
||||
for _, entry := range v {
|
||||
sizeInBytes += size.SizeOfString + len(entry)
|
||||
}
|
||||
}
|
||||
|
||||
for _, entry := range dm.Sort {
|
||||
sizeInBytes += size.SizeOfString + len(entry)
|
||||
}
|
||||
|
||||
for k, _ := range dm.Fields {
|
||||
sizeInBytes += size.SizeOfString + len(k) +
|
||||
size.SizeOfPtr
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
// Complete performs final preparation & transformation of the
|
||||
// DocumentMatch at the end of search processing, also allowing the
|
||||
// caller to provide an optional preallocated locations slice
|
||||
func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
|
||||
// transform the FieldTermLocations slice into the Locations map
|
||||
nlocs := len(dm.FieldTermLocations)
|
||||
if nlocs > 0 {
|
||||
if cap(prealloc) < nlocs {
|
||||
prealloc = make([]Location, nlocs)
|
||||
}
|
||||
prealloc = prealloc[:nlocs]
|
||||
|
||||
var lastField string
|
||||
var tlm TermLocationMap
|
||||
|
||||
for i, ftl := range dm.FieldTermLocations {
|
||||
if lastField != ftl.Field {
|
||||
lastField = ftl.Field
|
||||
|
||||
if dm.Locations == nil {
|
||||
dm.Locations = make(FieldTermLocationMap)
|
||||
}
|
||||
|
||||
tlm = dm.Locations[ftl.Field]
|
||||
if tlm == nil {
|
||||
tlm = make(TermLocationMap)
|
||||
dm.Locations[ftl.Field] = tlm
|
||||
}
|
||||
}
|
||||
|
||||
loc := &prealloc[i]
|
||||
*loc = ftl.Location
|
||||
|
||||
if len(loc.ArrayPositions) > 0 { // copy
|
||||
loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
|
||||
}
|
||||
|
||||
tlm[ftl.Term] = append(tlm[ftl.Term], loc)
|
||||
|
||||
dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
|
||||
Location: Location{
|
||||
ArrayPositions: ftl.Location.ArrayPositions[:0],
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle
|
||||
|
||||
return prealloc
|
||||
}
|
||||
|
||||
func (dm *DocumentMatch) String() string {
|
||||
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
|
||||
}
|
||||
|
@ -135,6 +264,7 @@ type Searcher interface {
|
|||
SetQueryNorm(float64)
|
||||
Count() uint64
|
||||
Min() int
|
||||
Size() int
|
||||
|
||||
DocumentMatchPoolSize() int
|
||||
}
|
||||
|
@ -142,9 +272,26 @@ type Searcher interface {
|
|||
type SearcherOptions struct {
|
||||
Explain bool
|
||||
IncludeTermVectors bool
|
||||
Score string
|
||||
}
|
||||
|
||||
// SearchContext represents the context around a single search
|
||||
type SearchContext struct {
|
||||
DocumentMatchPool *DocumentMatchPool
|
||||
Collector Collector
|
||||
}
|
||||
|
||||
func (sc *SearchContext) Size() int {
|
||||
sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr +
|
||||
reflectStaticSizeDocumentMatchPool + size.SizeOfPtr
|
||||
|
||||
if sc.DocumentMatchPool != nil {
|
||||
for _, entry := range sc.DocumentMatchPool.avail {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
|
|
@ -16,12 +16,21 @@ package searcher
|
|||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeBooleanSearcher int
|
||||
|
||||
func init() {
|
||||
var bs BooleanSearcher
|
||||
reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size())
|
||||
}
|
||||
|
||||
type BooleanSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
mustSearcher search.Searcher
|
||||
|
@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc
|
|||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *BooleanSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr
|
||||
|
||||
if s.mustSearcher != nil {
|
||||
sizeInBytes += s.mustSearcher.Size()
|
||||
}
|
||||
|
||||
if s.shouldSearcher != nil {
|
||||
sizeInBytes += s.shouldSearcher.Size()
|
||||
}
|
||||
|
||||
if s.mustNotSearcher != nil {
|
||||
sizeInBytes += s.mustNotSearcher.Size()
|
||||
}
|
||||
|
||||
sizeInBytes += s.scorer.Size()
|
||||
|
||||
for _, entry := range s.matches {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *BooleanSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
|
@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch
|
|||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
|
@ -296,6 +332,14 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
|||
}
|
||||
}
|
||||
|
||||
// Advance the searchers only if the currentID cursor is trailing the lookup ID,
|
||||
// additionally if the mustNotSearcher has been initialized, ensure that the
|
||||
// cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by
|
||||
// currentID) is trailing the lookup ID as well - for in the case where currentID
|
||||
// is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT
|
||||
// advance the currentID or the currMustNot cursors.
|
||||
if (s.currentID == nil || s.currentID.Compare(ID) < 0) &&
|
||||
(s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) {
|
||||
var err error
|
||||
if s.mustSearcher != nil {
|
||||
if s.currMust != nil {
|
||||
|
@ -306,6 +350,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
|||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if s.shouldSearcher != nil {
|
||||
if s.currShould != nil {
|
||||
ctx.DocumentMatchPool.Put(s.currShould)
|
||||
|
@ -315,6 +360,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
|||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
if s.mustNotSearcher != nil {
|
||||
if s.currMustNot != nil {
|
||||
ctx.DocumentMatchPool.Put(s.currMustNot)
|
||||
|
@ -332,6 +378,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
|||
} else {
|
||||
s.currentID = nil
|
||||
}
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
|
|
@ -16,13 +16,22 @@ package searcher
|
|||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeConjunctionSearcher int
|
||||
|
||||
func init() {
|
||||
var cs ConjunctionSearcher
|
||||
reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size())
|
||||
}
|
||||
|
||||
type ConjunctionSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
searchers OrderedSearcherList
|
||||
|
@ -34,14 +43,27 @@ type ConjunctionSearcher struct {
|
|||
options search.SearcherOptions
|
||||
}
|
||||
|
||||
func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) {
|
||||
// build the downstream searchers
|
||||
func NewConjunctionSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, options search.SearcherOptions) (
|
||||
search.Searcher, error) {
|
||||
// build the sorted downstream searchers
|
||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||
for i, searcher := range qsearchers {
|
||||
searchers[i] = searcher
|
||||
}
|
||||
// sort the searchers
|
||||
sort.Sort(searchers)
|
||||
|
||||
// attempt the "unadorned" conjunction optimization only when we
|
||||
// do not need extra information like freq-norm's or term vectors
|
||||
if len(searchers) > 1 &&
|
||||
options.Score == "none" && !options.IncludeTermVectors {
|
||||
rv, err := optimizeCompositeSearcher("conjunction:unadorned",
|
||||
indexReader, searchers, options)
|
||||
if err != nil || rv != nil {
|
||||
return rv, err
|
||||
}
|
||||
}
|
||||
|
||||
// build our searcher
|
||||
rv := ConjunctionSearcher{
|
||||
indexReader: indexReader,
|
||||
|
@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
|
|||
scorer: scorer.NewConjunctionQueryScorer(options),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
|
||||
// attempt push-down conjunction optimization when there's >1 searchers
|
||||
if len(searchers) > 1 {
|
||||
rv, err := optimizeCompositeSearcher("conjunction",
|
||||
indexReader, searchers, options)
|
||||
if err != nil || rv != nil {
|
||||
return rv, err
|
||||
}
|
||||
}
|
||||
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *ConjunctionSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr +
|
||||
s.scorer.Size()
|
||||
|
||||
for _, entry := range s.searchers {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range s.currs {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *ConjunctionSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
|
@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM
|
|||
var rv *search.DocumentMatch
|
||||
var err error
|
||||
OUTER:
|
||||
for s.currs[s.maxIDIdx] != nil {
|
||||
for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil {
|
||||
maxID := s.currs[s.maxIDIdx].IndexInternalID
|
||||
|
||||
i := 0
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -16,12 +16,9 @@ package searcher
|
|||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
)
|
||||
|
||||
// DisjunctionMaxClauseCount is a compile time setting that applications can
|
||||
|
@ -29,17 +26,74 @@ import (
|
|||
// error instead of exeucting searches when the size exceeds this value.
|
||||
var DisjunctionMaxClauseCount = 0
|
||||
|
||||
type DisjunctionSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
searchers OrderedSearcherList
|
||||
numSearchers int
|
||||
queryNorm float64
|
||||
currs []*search.DocumentMatch
|
||||
scorer *scorer.DisjunctionQueryScorer
|
||||
min int
|
||||
matching []*search.DocumentMatch
|
||||
matchingIdxs []int
|
||||
initialized bool
|
||||
// DisjunctionHeapTakeover is a compile time setting that applications can
|
||||
// adjust to control when the DisjunctionSearcher will switch from a simple
|
||||
// slice implementation to a heap implementation.
|
||||
var DisjunctionHeapTakeover = 10
|
||||
|
||||
func NewDisjunctionSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
|
||||
search.Searcher, error) {
|
||||
return newDisjunctionSearcher(indexReader, qsearchers, min, options, true)
|
||||
}
|
||||
|
||||
func newDisjunctionSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (search.Searcher, error) {
|
||||
// attempt the "unadorned" disjunction optimization only when we
|
||||
// do not need extra information like freq-norm's or term vectors
|
||||
// and the requested min is simple
|
||||
if len(qsearchers) > 1 && min <= 1 &&
|
||||
options.Score == "none" && !options.IncludeTermVectors {
|
||||
rv, err := optimizeCompositeSearcher("disjunction:unadorned",
|
||||
indexReader, qsearchers, options)
|
||||
if err != nil || rv != nil {
|
||||
return rv, err
|
||||
}
|
||||
}
|
||||
|
||||
if len(qsearchers) > DisjunctionHeapTakeover {
|
||||
return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options,
|
||||
limit)
|
||||
}
|
||||
return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options,
|
||||
limit)
|
||||
}
|
||||
|
||||
func optimizeCompositeSearcher(optimizationKind string,
|
||||
indexReader index.IndexReader, qsearchers []search.Searcher,
|
||||
options search.SearcherOptions) (search.Searcher, error) {
|
||||
var octx index.OptimizableContext
|
||||
|
||||
for _, searcher := range qsearchers {
|
||||
o, ok := searcher.(index.Optimizable)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var err error
|
||||
octx, err = o.Optimize(optimizationKind, octx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if octx == nil {
|
||||
return nil, nil
|
||||
}
|
||||
}
|
||||
|
||||
optimized, err := octx.Finish()
|
||||
if err != nil || optimized == nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
tfr, ok := optimized.(index.TermFieldReader)
|
||||
if !ok {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return newTermSearcherFromReader(indexReader, tfr,
|
||||
[]byte(optimizationKind), "*", 1.0, options)
|
||||
}
|
||||
|
||||
func tooManyClauses(count int) bool {
|
||||
|
@ -49,226 +103,7 @@ func tooManyClauses(count int) bool {
|
|||
return false
|
||||
}
|
||||
|
||||
func tooManyClausesErr() error {
|
||||
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]",
|
||||
DisjunctionMaxClauseCount)
|
||||
}
|
||||
|
||||
func NewDisjunctionSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
|
||||
*DisjunctionSearcher, error) {
|
||||
return newDisjunctionSearcher(indexReader, qsearchers, min, options,
|
||||
true)
|
||||
}
|
||||
|
||||
func newDisjunctionSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (
|
||||
*DisjunctionSearcher, error) {
|
||||
if limit && tooManyClauses(len(qsearchers)) {
|
||||
return nil, tooManyClausesErr()
|
||||
}
|
||||
// build the downstream searchers
|
||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||
for i, searcher := range qsearchers {
|
||||
searchers[i] = searcher
|
||||
}
|
||||
// sort the searchers
|
||||
sort.Sort(sort.Reverse(searchers))
|
||||
// build our searcher
|
||||
rv := DisjunctionSearcher{
|
||||
indexReader: indexReader,
|
||||
searchers: searchers,
|
||||
numSearchers: len(searchers),
|
||||
currs: make([]*search.DocumentMatch, len(searchers)),
|
||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||
min: int(min),
|
||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||
matchingIdxs: make([]int, len(searchers)),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
for _, searcher := range s.searchers {
|
||||
sumOfSquaredWeights += searcher.Weight()
|
||||
}
|
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(s.queryNorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||
var err error
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
s.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) updateMatches() error {
|
||||
matching := s.matching[:0]
|
||||
matchingIdxs := s.matchingIdxs[:0]
|
||||
|
||||
for i := 0; i < len(s.currs); i++ {
|
||||
curr := s.currs[i]
|
||||
if curr == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(matching) > 0 {
|
||||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
|
||||
if cmp > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if cmp < 0 {
|
||||
matching = matching[:0]
|
||||
matchingIdxs = matchingIdxs[:0]
|
||||
}
|
||||
}
|
||||
|
||||
matching = append(matching, curr)
|
||||
matchingIdxs = append(matchingIdxs, i)
|
||||
}
|
||||
|
||||
s.matching = matching
|
||||
s.matchingIdxs = matchingIdxs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Weight() float64 {
|
||||
var rv float64
|
||||
for _, searcher := range s.searchers {
|
||||
rv += searcher.Weight()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) {
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(qnorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) (
|
||||
*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
var err error
|
||||
var rv *search.DocumentMatch
|
||||
|
||||
found := false
|
||||
for !found && len(s.matching) > 0 {
|
||||
if len(s.matching) >= s.min {
|
||||
found = true
|
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||
}
|
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, i := range s.matchingIdxs {
|
||||
searcher := s.searchers[i]
|
||||
if s.currs[i] != rv {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
|
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// get all searchers pointing at their first match
|
||||
var err error
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
|
||||
continue
|
||||
}
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Advance(ctx, ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Count() uint64 {
|
||||
// for now return a worst case
|
||||
var sum uint64
|
||||
for _, searcher := range s.searchers {
|
||||
sum += searcher.Count()
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Close() (rv error) {
|
||||
for _, searcher := range s.searchers {
|
||||
err := searcher.Close()
|
||||
if err != nil && rv == nil {
|
||||
rv = err
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) Min() int {
|
||||
return s.min
|
||||
}
|
||||
|
||||
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int {
|
||||
rv := len(s.currs)
|
||||
for _, s := range s.searchers {
|
||||
rv += s.DocumentMatchPoolSize()
|
||||
}
|
||||
return rv
|
||||
func tooManyClausesErr(count int) error {
|
||||
return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]",
|
||||
count, DisjunctionMaxClauseCount)
|
||||
}
|
||||
|
|
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
Normal file
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
Normal file
|
@ -0,0 +1,343 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/heap"
|
||||
"math"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDisjunctionHeapSearcher int
|
||||
var reflectStaticSizeSearcherCurr int
|
||||
|
||||
func init() {
|
||||
var dhs DisjunctionHeapSearcher
|
||||
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size())
|
||||
|
||||
var sc SearcherCurr
|
||||
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size())
|
||||
}
|
||||
|
||||
type SearcherCurr struct {
|
||||
searcher search.Searcher
|
||||
curr *search.DocumentMatch
|
||||
}
|
||||
|
||||
type DisjunctionHeapSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
|
||||
numSearchers int
|
||||
scorer *scorer.DisjunctionQueryScorer
|
||||
min int
|
||||
queryNorm float64
|
||||
initialized bool
|
||||
searchers []search.Searcher
|
||||
heap []*SearcherCurr
|
||||
|
||||
matching []*search.DocumentMatch
|
||||
matchingCurrs []*SearcherCurr
|
||||
}
|
||||
|
||||
func newDisjunctionHeapSearcher(indexReader index.IndexReader,
|
||||
searchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (
|
||||
*DisjunctionHeapSearcher, error) {
|
||||
if limit && tooManyClauses(len(searchers)) {
|
||||
return nil, tooManyClausesErr(len(searchers))
|
||||
}
|
||||
|
||||
// build our searcher
|
||||
rv := DisjunctionHeapSearcher{
|
||||
indexReader: indexReader,
|
||||
searchers: searchers,
|
||||
numSearchers: len(searchers),
|
||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||
min: int(min),
|
||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||
matchingCurrs: make([]*SearcherCurr, len(searchers)),
|
||||
heap: make([]*SearcherCurr, 0, len(searchers)),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr +
|
||||
s.scorer.Size()
|
||||
|
||||
for _, entry := range s.searchers {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range s.matching {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
// for matchingCurrs and heap, just use static size * len
|
||||
// since searchers and document matches already counted above
|
||||
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
|
||||
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
for _, searcher := range s.searchers {
|
||||
sumOfSquaredWeights += searcher.Weight()
|
||||
}
|
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(s.queryNorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||
// alloc a single block of SearcherCurrs
|
||||
block := make([]SearcherCurr, len(s.searchers))
|
||||
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers {
|
||||
curr, err := searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if curr != nil {
|
||||
block[i].searcher = searcher
|
||||
block[i].curr = curr
|
||||
heap.Push(s, &block[i])
|
||||
}
|
||||
}
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
s.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) updateMatches() error {
|
||||
matching := s.matching[:0]
|
||||
matchingCurrs := s.matchingCurrs[:0]
|
||||
|
||||
if len(s.heap) > 0 {
|
||||
|
||||
// top of the heap is our next hit
|
||||
next := heap.Pop(s).(*SearcherCurr)
|
||||
matching = append(matching, next.curr)
|
||||
matchingCurrs = append(matchingCurrs, next)
|
||||
|
||||
// now as long as top of heap matches, keep popping
|
||||
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 {
|
||||
next = heap.Pop(s).(*SearcherCurr)
|
||||
matching = append(matching, next.curr)
|
||||
matchingCurrs = append(matchingCurrs, next)
|
||||
}
|
||||
}
|
||||
|
||||
s.matching = matching
|
||||
s.matchingCurrs = matchingCurrs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Weight() float64 {
|
||||
var rv float64
|
||||
for _, searcher := range s.searchers {
|
||||
rv += searcher.Weight()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) {
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(qnorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
|
||||
*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
var rv *search.DocumentMatch
|
||||
found := false
|
||||
for !found && len(s.matching) > 0 {
|
||||
if len(s.matching) >= s.min {
|
||||
found = true
|
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||
}
|
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
if matchingCurr.curr != rv {
|
||||
ctx.DocumentMatchPool.Put(matchingCurr.curr)
|
||||
}
|
||||
curr, err := matchingCurr.searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if curr != nil {
|
||||
matchingCurr.curr = curr
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
}
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
|
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// if there is anything in matching, toss it back onto the heap
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
s.matching = s.matching[:0]
|
||||
s.matchingCurrs = s.matchingCurrs[:0]
|
||||
|
||||
// find all searchers that actually need to be advanced
|
||||
// advance them, using s.matchingCurrs as temp storage
|
||||
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
|
||||
searcherCurr := heap.Pop(s).(*SearcherCurr)
|
||||
ctx.DocumentMatchPool.Put(searcherCurr.curr)
|
||||
curr, err := searcherCurr.searcher.Advance(ctx, ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if curr != nil {
|
||||
searcherCurr.curr = curr
|
||||
s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
|
||||
}
|
||||
}
|
||||
// now all of the searchers that we advanced have to be pushed back
|
||||
for _, matchingCurr := range s.matchingCurrs {
|
||||
heap.Push(s, matchingCurr)
|
||||
}
|
||||
// reset our temp space
|
||||
s.matchingCurrs = s.matchingCurrs[:0]
|
||||
|
||||
err := s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Count() uint64 {
|
||||
// for now return a worst case
|
||||
var sum uint64
|
||||
for _, searcher := range s.searchers {
|
||||
sum += searcher.Count()
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Close() (rv error) {
|
||||
for _, searcher := range s.searchers {
|
||||
err := searcher.Close()
|
||||
if err != nil && rv == nil {
|
||||
rv = err
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Min() int {
|
||||
return s.min
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int {
|
||||
rv := len(s.searchers)
|
||||
for _, s := range s.searchers {
|
||||
rv += s.DocumentMatchPoolSize()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||
index.OptimizableContext, error) {
|
||||
if len(s.searchers) == 1 {
|
||||
o, ok := s.searchers[0].(index.Optimizable)
|
||||
if ok {
|
||||
return o.Optimize(kind, octx)
|
||||
}
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
||||
|
||||
// heap impl
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) }
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Less(i, j int) bool {
|
||||
if s.heap[i].curr == nil {
|
||||
return true
|
||||
} else if s.heap[j].curr == nil {
|
||||
return false
|
||||
}
|
||||
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
|
||||
s.heap[i], s.heap[j] = s.heap[j], s.heap[i]
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
|
||||
s.heap = append(s.heap, x.(*SearcherCurr))
|
||||
}
|
||||
|
||||
func (s *DisjunctionHeapSearcher) Pop() interface{} {
|
||||
old := s.heap
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
s.heap = old[0 : n-1]
|
||||
return x
|
||||
}
|
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
Normal file
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
Normal file
|
@ -0,0 +1,298 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package searcher
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"sort"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDisjunctionSliceSearcher int
|
||||
|
||||
func init() {
|
||||
var ds DisjunctionSliceSearcher
|
||||
reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
|
||||
}
|
||||
|
||||
type DisjunctionSliceSearcher struct {
|
||||
indexReader index.IndexReader
|
||||
searchers OrderedSearcherList
|
||||
numSearchers int
|
||||
queryNorm float64
|
||||
currs []*search.DocumentMatch
|
||||
scorer *scorer.DisjunctionQueryScorer
|
||||
min int
|
||||
matching []*search.DocumentMatch
|
||||
matchingIdxs []int
|
||||
initialized bool
|
||||
}
|
||||
|
||||
func newDisjunctionSliceSearcher(indexReader index.IndexReader,
|
||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||
limit bool) (
|
||||
*DisjunctionSliceSearcher, error) {
|
||||
if limit && tooManyClauses(len(qsearchers)) {
|
||||
return nil, tooManyClausesErr(len(qsearchers))
|
||||
}
|
||||
// build the downstream searchers
|
||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||
for i, searcher := range qsearchers {
|
||||
searchers[i] = searcher
|
||||
}
|
||||
// sort the searchers
|
||||
sort.Sort(sort.Reverse(searchers))
|
||||
// build our searcher
|
||||
rv := DisjunctionSliceSearcher{
|
||||
indexReader: indexReader,
|
||||
searchers: searchers,
|
||||
numSearchers: len(searchers),
|
||||
currs: make([]*search.DocumentMatch, len(searchers)),
|
||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||
min: int(min),
|
||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||
matchingIdxs: make([]int, len(searchers)),
|
||||
}
|
||||
rv.computeQueryNorm()
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Size() int {
|
||||
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
|
||||
s.scorer.Size()
|
||||
|
||||
for _, entry := range s.searchers {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
|
||||
for _, entry := range s.currs {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
for _, entry := range s.matching {
|
||||
if entry != nil {
|
||||
sizeInBytes += entry.Size()
|
||||
}
|
||||
}
|
||||
|
||||
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
|
||||
|
||||
return sizeInBytes
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
|
||||
// first calculate sum of squared weights
|
||||
sumOfSquaredWeights := 0.0
|
||||
for _, searcher := range s.searchers {
|
||||
sumOfSquaredWeights += searcher.Weight()
|
||||
}
|
||||
// now compute query norm from this
|
||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||
// finally tell all the downstream searchers the norm
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(s.queryNorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||
var err error
|
||||
// get all searchers pointing at their first match
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
s.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) updateMatches() error {
|
||||
matching := s.matching[:0]
|
||||
matchingIdxs := s.matchingIdxs[:0]
|
||||
|
||||
for i := 0; i < len(s.currs); i++ {
|
||||
curr := s.currs[i]
|
||||
if curr == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if len(matching) > 0 {
|
||||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
|
||||
if cmp > 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
if cmp < 0 {
|
||||
matching = matching[:0]
|
||||
matchingIdxs = matchingIdxs[:0]
|
||||
}
|
||||
}
|
||||
|
||||
matching = append(matching, curr)
|
||||
matchingIdxs = append(matchingIdxs, i)
|
||||
}
|
||||
|
||||
s.matching = matching
|
||||
s.matchingIdxs = matchingIdxs
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Weight() float64 {
|
||||
var rv float64
|
||||
for _, searcher := range s.searchers {
|
||||
rv += searcher.Weight()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
|
||||
for _, searcher := range s.searchers {
|
||||
searcher.SetQueryNorm(qnorm)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
|
||||
*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
var err error
|
||||
var rv *search.DocumentMatch
|
||||
|
||||
found := false
|
||||
for !found && len(s.matching) > 0 {
|
||||
if len(s.matching) >= s.min {
|
||||
found = true
|
||||
// score this match
|
||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||
}
|
||||
|
||||
// invoke next on all the matching searchers
|
||||
for _, i := range s.matchingIdxs {
|
||||
searcher := s.searchers[i]
|
||||
if s.currs[i] != rv {
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Next(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
|
||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||
if !s.initialized {
|
||||
err := s.initSearchers(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// get all searchers pointing at their first match
|
||||
var err error
|
||||
for i, searcher := range s.searchers {
|
||||
if s.currs[i] != nil {
|
||||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
|
||||
continue
|
||||
}
|
||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||
}
|
||||
s.currs[i], err = searcher.Advance(ctx, ID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
err = s.updateMatches()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return s.Next(ctx)
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Count() uint64 {
|
||||
// for now return a worst case
|
||||
var sum uint64
|
||||
for _, searcher := range s.searchers {
|
||||
sum += searcher.Count()
|
||||
}
|
||||
return sum
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Close() (rv error) {
|
||||
for _, searcher := range s.searchers {
|
||||
err := searcher.Close()
|
||||
if err != nil && rv == nil {
|
||||
rv = err
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) Min() int {
|
||||
return s.min
|
||||
}
|
||||
|
||||
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
|
||||
rv := len(s.currs)
|
||||
for _, s := range s.searchers {
|
||||
rv += s.DocumentMatchPoolSize()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// a disjunction searcher implements the index.Optimizable interface
|
||||
// but only activates on an edge case where the disjunction is a
|
||||
// wrapper around a single Optimizable child searcher
|
||||
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||
index.OptimizableContext, error) {
|
||||
if len(s.searchers) == 1 {
|
||||
o, ok := s.searchers[0].(index.Optimizable)
|
||||
if ok {
|
||||
return o.Optimize(kind, octx)
|
||||
}
|
||||
}
|
||||
|
||||
return octx, nil
|
||||
}
|
|
@ -15,11 +15,21 @@
|
|||
package searcher
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/search/scorer"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeDocIDSearcher int
|
||||
|
||||
func init() {
|
||||
var ds DocIDSearcher
|
||||
reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size())
|
||||
}
|
||||
|
||||
// DocIDSearcher returns documents matching a predefined set of identifiers.
|
||||
type DocIDSearcher struct {
|
||||
reader index.DocIDReader
|
||||
|
@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (s *DocIDSearcher) Size() int {
|
||||
return reflectStaticSizeDocIDSearcher + size.SizeOfPtr +
|
||||
s.reader.Size() +
|
||||
s.scorer.Size()
|
||||
}
|
||||
|
||||
func (s *DocIDSearcher) Count() uint64 {
|
||||
return uint64(s.count)
|
||||
}
|
||||
|
|
|
@ -15,10 +15,20 @@
|
|||
package searcher
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
"github.com/blevesearch/bleve/size"
|
||||
)
|
||||
|
||||
var reflectStaticSizeFilteringSearcher int
|
||||
|
||||
func init() {
|
||||
var fs FilteringSearcher
|
||||
reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size())
|
||||
}
|
||||
|
||||
// FilterFunc defines a function which can filter documents
|
||||
// returning true means keep the document
|
||||
// returning false means do not keep the document
|
||||
|
@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch
|
|||
}
|
||||
}
|
||||
|
||||
func (f *FilteringSearcher) Size() int {
|
||||
return reflectStaticSizeFilteringSearcher + size.SizeOfPtr +
|
||||
f.child.Size()
|
||||
}
|
||||
|
||||
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
|
||||
next, err := f.child.Next(ctx)
|
||||
for next != nil && err == nil {
|
||||
|
|
|
@ -15,13 +15,26 @@
|
|||
package searcher
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/index"
|
||||
"github.com/blevesearch/bleve/search"
|
||||
)
|
||||
|
||||
var MaxFuzziness = 2
|
||||
|
||||
func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
||||
prefix, fuzziness int, field string, boost float64,
|
||||
options search.SearcherOptions) (search.Searcher, error) {
|
||||
|
||||
if fuzziness > MaxFuzziness {
|
||||
return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
|
||||
}
|
||||
|
||||
if fuzziness < 0 {
|
||||
return nil, fmt.Errorf("invalid fuzziness, negative")
|
||||
}
|
||||
|
||||
// Note: we don't byte slice the term for a prefix because of runes.
|
||||
prefixTerm := ""
|
||||
for i, r := range term {
|
||||
|
@ -31,7 +44,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
|||
break
|
||||
}
|
||||
}
|
||||
|
||||
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
||||
field, prefixTerm)
|
||||
if err != nil {
|
||||
|
@ -45,12 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
|||
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
||||
fuzziness int, field, prefixTerm string) (rv []string, err error) {
|
||||
rv = make([]string, 0)
|
||||
|
||||
// in case of advanced reader implementations directly call
|
||||
// the levenshtein automaton based iterator to collect the
|
||||
// candidate terms
|
||||
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
|
||||
fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||
err = cerr
|
||||
}
|
||||
}()
|
||||
tfd, err := fieldDict.Next()
|
||||
for err == nil && tfd != nil {
|
||||
rv = append(rv, tfd.Term)
|
||||
if tooManyClauses(len(rv)) {
|
||||
return nil, tooManyClausesErr(len(rv))
|
||||
}
|
||||
tfd, err = fieldDict.Next()
|
||||
}
|
||||
return rv, err
|
||||
}
|
||||
|
||||
var fieldDict index.FieldDict
|
||||
if len(prefixTerm) > 0 {
|
||||
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
||||
} else {
|
||||
fieldDict, err = indexReader.FieldDict(field)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() {
|
||||
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||
err = cerr
|
||||
|
@ -58,13 +98,16 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
|||
}()
|
||||
|
||||
// enumerate terms and check levenshtein distance
|
||||
var reuse []int
|
||||
tfd, err := fieldDict.Next()
|
||||
for err == nil && tfd != nil {
|
||||
ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness)
|
||||
var ld int
|
||||
var exceeded bool
|
||||
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
|
||||
if !exceeded && ld <= fuzziness {
|
||||
rv = append(rv, tfd.Term)
|
||||
if tooManyClauses(len(rv)) {
|
||||
return rv, tooManyClausesErr()
|
||||
return nil, tooManyClausesErr(len(rv))
|
||||
}
|
||||
}
|
||||
tfd, err = fieldDict.Next()
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue