123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909 |
- /*
- * EliasDB
- *
- * Copyright 2016 Matthias Ladkau. All rights reserved.
- *
- * This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/.
- */
- package util
- import (
- "bytes"
- "crypto/md5"
- "encoding/gob"
- "fmt"
- "math"
- "sort"
- "strings"
- "unicode"
- "devt.de/krotik/common/bitutil"
- "devt.de/krotik/common/sortutil"
- "devt.de/krotik/common/stringutil"
- "devt.de/krotik/eliasdb/hash"
- )
- /*
- MaxKeysetSize is the maximum number of keys for a single word lookup.
- */
- const MaxKeysetSize = 1000
- /*
- CaseSensitiveWordIndex is a flag to indicate if the index should be case sensitive.
- */
- var CaseSensitiveWordIndex = false
- /*
- PrefixAttrWord is the prefix used for word index entries
- */
- const PrefixAttrWord = string(0x01)
- /*
- PrefixAttrHash is the prefix used for hashes of attribute values
- */
- const PrefixAttrHash = string(0x02)
- /*
- IndexManager data structure
- */
- type IndexManager struct {
- htree *hash.HTree // Persistent HTree which stores this index
- }
- /*
- indexEntry data structure
- */
- type indexEntry struct {
- WordPos map[string]string // Node id to word position array
- }
- func init() {
- // Make sure we can use indexEntry in a gob operation
- gob.Register(&indexEntry{})
- }
- /*
- NewIndexManager creates a new index manager instance.
- */
- func NewIndexManager(htree *hash.HTree) *IndexManager {
- return &IndexManager{htree}
- }
- /*
- Index indexes (inserts) a given object.
- */
- func (im *IndexManager) Index(key string, obj map[string]string) error {
- return im.updateIndex(key, obj, nil)
- }
- /*
- Reindex reindexes (updates) a given object.
- */
- func (im *IndexManager) Reindex(key string, newObj map[string]string,
- oldObj map[string]string) error {
- return im.updateIndex(key, newObj, oldObj)
- }
- /*
- Deindex deindexes (removes) a given object.
- */
- func (im *IndexManager) Deindex(key string, obj map[string]string) error {
- return im.updateIndex(key, nil, obj)
- }
- /*
- LookupPhrase finds all nodes where an attribute contains a certain phrase. This
- call returns a list of node keys which contain the phrase at least once.
- */
- func (im *IndexManager) LookupPhrase(attr, phrase string) ([]string, error) {
- // Chop up the phrase into words
- phraseWords := strings.FieldsFunc(phrase, func(r rune) bool {
- return !stringutil.IsAlphaNumeric(string(r)) && (unicode.IsSpace(r) || unicode.IsControl(r) || unicode.IsPunct(r))
- })
- // Lookup every phrase word
- results := make([]map[string][]uint64, len(phraseWords))
- for i, phraseWord := range phraseWords {
- res, err := im.LookupWord(attr, phraseWord)
- if err != nil {
- return nil, &GraphError{ErrIndexError, err.Error()}
- }
- results[i] = res
- }
- if len(results) == 0 || len(results[0]) == 0 {
- return nil, nil
- }
- ret := make([]string, 0, len(results[0]))
- // Go through all found nodes and try to find a path
- path := make([]uint64, 0, len(phraseWords))
- for key := range results[0] {
- path = path[:0]
- foundWords := im.findPhrasePath(key, 0, path, phraseWords, results)
- if foundWords == len(phraseWords) {
- // Add key to results if a path was found
- ret = append(ret, key)
- }
- }
- // Guarantee a stable result
- sort.StringSlice(ret).Sort()
- return ret, nil
- }
- /*
- findPhrasePath tries to find a phrase in a given set of lookup results.
- */
- func (im *IndexManager) findPhrasePath(key string, index int, path []uint64,
- phraseWords []string, results []map[string][]uint64) int {
- // Get the results for this word index
- result := results[index]
- // Check if there is a result for the given key
- if posArr, ok := result[key]; ok {
- // Check if any of the positions is at the right place
- if index > 0 {
- // Check with previous result
- for _, pos := range posArr {
- // Check if the position array contains the expected next word position
- if pos == path[index-1]+1 {
- path = append(path, pos)
- break
- }
- // Abort if the expected position cannot be there
- if pos > path[index-1] {
- return len(path)
- }
- }
- // Do the next iteration if a position was found and
- // there are more words in the phrase to match
- if len(path) == index+1 && index < len(phraseWords)-1 {
- return im.findPhrasePath(key, index+1, path, phraseWords, results)
- }
- return index + 1
- }
- // Try every position as start position in the first iteration
- for _, pos := range posArr {
- path = path[:0]
- path = append(path, pos)
- // Test if the phrase only contained one word
- if len(phraseWords) == 1 {
- return 1
- }
- // Find the rest
- ret := im.findPhrasePath(key, 1, path, phraseWords, results)
- if ret == len(phraseWords) {
- return ret
- }
- }
- }
- return len(path)
- }
- /*
- LookupWord finds all nodes where an attribute contains a certain word. This call returns
- a map which maps node key to a list of word positions.
- */
- func (im *IndexManager) LookupWord(attr, word string) (map[string][]uint64, error) {
- var s string
- if CaseSensitiveWordIndex {
- s = word
- } else {
- s = strings.ToLower(word)
- }
- entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s))
- if err != nil {
- return nil, &GraphError{ErrIndexError, err.Error()}
- } else if entry == nil {
- return nil, nil
- }
- ret := make(map[string][]uint64)
- for k, l := range entry.(*indexEntry).WordPos {
- ret[k] = bitutil.UnpackList(l)
- }
- return ret, nil
- }
- /*
- LookupValue finds all nodes where an attribute has a certain value. This call
- returns a list of node keys.
- */
- func (im *IndexManager) LookupValue(attr, value string) ([]string, error) {
- var entry *indexEntry
- var sum [16]byte
- if CaseSensitiveWordIndex {
- sum = md5.Sum([]byte(value))
- } else {
- sum = md5.Sum([]byte(strings.ToLower(value)))
- }
- indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))
- // Retrieve index entry
- obj, err := im.htree.Get(indexkey)
- if err != nil {
- return nil, &GraphError{ErrIndexError, err.Error()}
- }
- if obj == nil {
- return nil, nil
- }
- entry = obj.(*indexEntry)
- ret := make([]string, 0, len(entry.WordPos))
- for key := range entry.WordPos {
- ret = append(ret, key)
- }
- sort.StringSlice(ret).Sort()
- return ret, nil
- }
- /*
- Count returns the number of found nodes for a given word in a given attribute.
- */
- func (im *IndexManager) Count(attr, word string) (int, error) {
- var s string
- if CaseSensitiveWordIndex {
- s = word
- } else {
- s = strings.ToLower(word)
- }
- entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s))
- if err != nil {
- return 0, &GraphError{ErrIndexError, err.Error()}
- } else if entry == nil {
- return 0, nil
- }
- return len(entry.(*indexEntry).WordPos), nil
- }
- /*
- updateIndex updates the index for a specific object. Depending on the
- new and old arguments being set a given object is either indexed/added
- (only new is set), deindexted/removed (only old is set) or reindexted/updated
- (new and old are set).
- */
- func (im *IndexManager) updateIndex(key string, newObj map[string]string,
- oldObj map[string]string) error {
- attrMap := make(map[string][]byte)
- if newObj != nil && oldObj == nil {
- // Insert case
- for attr := range newObj {
- attrMap[attr] = nil
- }
- } else if newObj == nil && oldObj != nil {
- // Remove case
- for attr := range oldObj {
- attrMap[attr] = nil
- }
- } else {
- // Update case
- for attr := range newObj {
- attrMap[attr] = nil
- }
- for attr := range oldObj {
- attrMap[attr] = nil
- }
- }
- emptyws := newWordSet(1)
- for attr := range attrMap {
- var newwords, toadd, oldwords, toremove *wordSet
- newval, newok := newObj[attr]
- oldval, oldok := oldObj[attr]
- // Calculate which words to add or remove
- newwords = emptyws
- oldwords = emptyws
- if newok {
- newwords = extractWords(newval)
- }
- // At this point we have only words to add
- toadd = newwords
- toremove = emptyws
- if oldok {
- oldwords = extractWords(oldval)
- if !oldwords.Empty() && !newwords.Empty() {
- // Here a diff is necessary
- toadd = copyWordSet(newwords)
- toadd.RemoveAll(oldwords)
- toremove = oldwords
- toremove.RemoveAll(newwords)
- } else {
- // Either no new words or no old words
- toremove = oldwords
- }
- }
- // Add and remove index entries
- for w, p := range toremove.set {
- if err := im.removeIndexEntry(key, attr, w, p); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- }
- }
- for w, p := range toadd.set {
- if err := im.addIndexEntry(key, attr, w, p); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- }
- }
- // Update hash lookup
- if newok && oldok {
- // Update hash entry
- if err := im.removeIndexHashEntry(key, attr, oldval); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- } else if err := im.addIndexHashEntry(key, attr, newval); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- }
- } else if newok && !oldok {
- // Insert hash entry
- if err := im.addIndexHashEntry(key, attr, newval); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- }
- } else if oldok {
- // Delete old hash entry
- if err := im.removeIndexHashEntry(key, attr, oldval); err != nil {
- return &GraphError{ErrIndexError, err.Error()}
- }
- }
- }
- return nil
- }
- /*
- addIndexHashEntry add a hash entry from the index. A hash entry stores a whole
- value as MD5 sum.
- */
- func (im *IndexManager) addIndexHashEntry(key string, attr string, value string) error {
- var entry *indexEntry
- var sum [16]byte
- if CaseSensitiveWordIndex {
- sum = md5.Sum([]byte(value))
- } else {
- sum = md5.Sum([]byte(strings.ToLower(value)))
- }
- indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))
- // Retrieve index entry
- obj, err := im.htree.Get(indexkey)
- if err != nil {
- return err
- }
- if obj == nil {
- entry = &indexEntry{make(map[string]string)}
- } else {
- entry = obj.(*indexEntry)
- }
- entry.WordPos[key] = ""
- _, err = im.htree.Put(indexkey, entry)
- return err
- }
- /*
- removeIndexHashEntry removes a hash entry from the index. A hash entry stores a whole
- value as MD5 sum.
- */
- func (im *IndexManager) removeIndexHashEntry(key string, attr string, value string) error {
- var entry *indexEntry
- var sum [16]byte
- if CaseSensitiveWordIndex {
- sum = md5.Sum([]byte(value))
- } else {
- sum = md5.Sum([]byte(strings.ToLower(value)))
- }
- indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))
- // Retrieve index entry
- obj, err := im.htree.Get(indexkey)
- if err != nil {
- return err
- }
- if obj == nil {
- return nil
- }
- entry = obj.(*indexEntry)
- delete(entry.WordPos, key)
- if len(entry.WordPos) == 0 {
- im.htree.Remove(indexkey)
- } else {
- im.htree.Put(indexkey, entry)
- }
- return err
- }
- /*
- removeIndexEntry removes an entry from the index.
- */
- func (im *IndexManager) removeIndexEntry(key string, attr string, word string, pos []uint64) error {
- var entry *indexEntry
- indexkey := []byte(PrefixAttrWord + attr + word)
- // Retrieve index entry
- obj, err := im.htree.Get(indexkey)
- if err != nil {
- return err
- }
- if obj == nil {
- return nil
- }
- entry = obj.(*indexEntry)
- // Remove given pos from existing pos information
- if keyentry, ok := entry.WordPos[key]; ok {
- keyentrylist := bitutil.UnpackList(keyentry)
- res := make([]uint64, 0, len(keyentrylist))
- remLookup := make(map[uint64]bool)
- for _, item := range pos {
- remLookup[item] = true
- }
- for _, item := range keyentrylist {
- if _, ok := remLookup[item]; !ok {
- res = append(res, item)
- }
- }
- if len(res) == 0 {
- delete(entry.WordPos, key)
- } else {
- entry.WordPos[key] = bitutil.PackList(res, res[len(res)-1])
- }
- }
- if len(entry.WordPos) == 0 {
- _, err = im.htree.Remove(indexkey)
- } else {
- _, err = im.htree.Put(indexkey, entry)
- }
- return err
- }
- /*
- addIndexEntry adds an entry to the index.
- */
- func (im *IndexManager) addIndexEntry(key string, attr string, word string, pos []uint64) error {
- var entry *indexEntry
- indexkey := []byte(PrefixAttrWord + attr + word)
- // Retrieve or create index entry
- obj, err := im.htree.Get(indexkey)
- if err != nil {
- return err
- }
- if obj == nil {
- entry = &indexEntry{make(map[string]string)}
- } else {
- entry = obj.(*indexEntry)
- }
- // Create position string
- if len(pos) == 0 {
- panic("Trying to add index entry without position information")
- }
- // Mix in given pos with existing pos information
- if keyentry, ok := entry.WordPos[key]; ok {
- pos = append(bitutil.UnpackList(keyentry), pos...)
- sortutil.UInt64s(pos)
- pos = removeDuplicates(pos)
- }
- // Rely on the fact that position arrays are ordered in ascending order
- maxpos := pos[len(pos)-1]
- // Fill the entry and store it
- entry.WordPos[key] = bitutil.PackList(pos, maxpos)
- _, err = im.htree.Put(indexkey, entry)
- return err
- }
- /*
- Remove all duplicates from a given sorted list.
- */
- func removeDuplicates(list []uint64) []uint64 {
- if len(list) == 0 {
- return list
- }
- res := make([]uint64, 1, len(list))
- res[0] = list[0]
- last := list[0]
- for _, item := range list[1:] {
- if item != last {
- res = append(res, item)
- last = item
- }
- }
- return res
- }
- /*
- String returns a string representation of this index manager.
- */
- func (im *IndexManager) String() string {
- var buf bytes.Buffer
- buf.WriteString(fmt.Sprintf("IndexManager: %v\n", im.htree.Location()))
- it := hash.NewHTreeIterator(im.htree)
- for it.HasNext() {
- key, value := it.Next()
- posmap := make(map[string][]uint64)
- for k, v := range value.(*indexEntry).WordPos {
- posmap[k] = bitutil.UnpackList(v)
- }
- buf.WriteString(fmt.Sprintf(" %v%q %v\n", key[0], string(key[1:]), posmap))
- }
- return buf.String()
- }
- /*
- extractWords extracts all words from a given string and return a wordSet which contains
- all words and their positions.
- */
- func extractWords(s string) *wordSet {
- var text string
- if CaseSensitiveWordIndex {
- text = s
- } else {
- text = strings.ToLower(s)
- }
- initArrCap := int(math.Ceil(float64(len(text)) * 0.01))
- if initArrCap < 4 {
- initArrCap = 4
- }
- ws := newWordSet(initArrCap)
- var pos uint64
- wstart := -1
- for i, rune := range text {
- if !stringutil.IsAlphaNumeric(string(rune)) && (unicode.IsSpace(rune) || unicode.IsControl(rune) || unicode.IsPunct(rune)) {
- if wstart >= 0 {
- ws.Add(text[wstart:i], pos+1)
- pos++
- wstart = -1
- }
- } else if wstart == -1 {
- wstart = i
- }
- }
- if wstart >= 0 {
- ws.Add(text[wstart:], pos+1)
- }
- return ws
- }
- /*
- Internal data structure for sets of words and their positions.
- */
- type wordSet struct {
- set map[string][]uint64 // Map which holds the data
- initArrCap int // Initial capacity for the position array
- }
- /*
- newWordSet creates a new word set.
- */
- func newWordSet(initArrCap int) *wordSet {
- return &wordSet{make(map[string][]uint64), initArrCap}
- }
- /*
- copyWordSet creates a new word set from a present one.
- */
- func copyWordSet(ws *wordSet) *wordSet {
- ret := &wordSet{make(map[string][]uint64), ws.initArrCap}
- ret.AddAll(ws)
- return ret
- }
- /*
- Add adds a word to the word set. Returns true if the word was added and false
- if an existing entry was updated.
- */
- func (ws *wordSet) Add(s string, pos uint64) bool {
- v, ok := ws.set[s]
- if !ok {
- ws.set[s] = make([]uint64, 1, ws.initArrCap)
- ws.set[s][0] = pos
- } else {
- // Make sure the largest entry is always last
- l := len(ws.set[s])
- if ws.set[s][l-1] < pos {
- ws.set[s] = append(v, pos)
- } else {
- // Make sure there is no double entry
- for _, ex := range v {
- if ex == pos {
- return !ok
- }
- }
- ws.set[s] = append(v, pos)
- sortutil.UInt64s(ws.set[s])
- }
- }
- return !ok
- }
- /*
- AddAll adds all words from another word set to this word set.
- */
- func (ws *wordSet) AddAll(ws2 *wordSet) {
- for s, val := range ws2.set {
- for _, v := range val {
- ws.Add(s, v)
- }
- }
- }
- /*
- Empty checks if this word set is empty.
- */
- func (ws *wordSet) Empty() bool {
- return len(ws.set) == 0
- }
- /*
- Has checks if this word set has a certain word.
- */
- func (ws *wordSet) Has(s string) bool {
- _, ok := ws.set[s]
- return ok
- }
- /*
- Pos returns the positions of a certain word.
- */
- func (ws *wordSet) Pos(s string) []uint64 {
- if pos, ok := ws.set[s]; ok {
- return pos
- }
- return nil
- }
- /*
- Remove removes a word from the word set.
- */
- func (ws *wordSet) Remove(s string, pos uint64) {
- if posArr, ok := ws.set[s]; ok {
- // Look for the position
- for i, p := range posArr {
- if p == pos {
- posArr := append(posArr[:i], posArr[i+1:]...)
- ws.set[s] = posArr
- break
- }
- }
- // Remove the word if no more positions are left
- if len(ws.set[s]) == 0 {
- delete(ws.set, s)
- }
- }
- }
- /*
- RemoveAll removes all words from another word set from this word set.
- */
- func (ws *wordSet) RemoveAll(ws2 *wordSet) {
- for s, posArr2 := range ws2.set {
- if posArr, ok := ws.set[s]; ok {
- j := 0
- for i := 0; i < len(posArr2); i++ {
- for ; j < len(posArr); j++ {
- if posArr[j] == posArr2[i] {
- // If a matching entry was found remove it
- posArr = append(posArr[:j], posArr[j+1:]...)
- ws.set[s] = posArr
- break
- } else if posArr[j] > posArr2[i] {
- // Skip over if a position is not in the current posArr
- break
- }
- }
- }
- }
- // Remove the word if no more positions are left
- if len(ws.set[s]) == 0 {
- delete(ws.set, s)
- }
- }
- }
- /*
- String returns a string representation of this word set.
- */
- func (ws *wordSet) String() string {
- var buf bytes.Buffer
- c := make([]string, 0, len(ws.set))
- for s := range ws.set {
- c = append(c, s)
- }
- sort.StringSlice(c).Sort()
- buf.WriteString("WordSet:\n")
- for _, k := range c {
- buf.WriteString(fmt.Sprintf(" %v %v\n", k, ws.set[k]))
- }
- return buf.String()
- }
|