/* * EliasDB * * Copyright 2016 Matthias Ladkau. All rights reserved. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ package util import ( "bytes" "crypto/md5" "encoding/gob" "fmt" "math" "sort" "strings" "unicode" "devt.de/krotik/common/bitutil" "devt.de/krotik/common/sortutil" "devt.de/krotik/common/stringutil" "devt.de/krotik/eliasdb/hash" ) /* MaxKeysetSize is the maximum number of keys for a single word lookup. */ const MaxKeysetSize = 1000 /* CaseSensitiveWordIndex is a flag to indicate if the index should be case sensitive. */ var CaseSensitiveWordIndex = false /* PrefixAttrWord is the prefix used for word index entries */ const PrefixAttrWord = "\x01" /* PrefixAttrHash is the prefix used for hashes of attribute values */ const PrefixAttrHash = "\x01" /* IndexManager data structure */ type IndexManager struct { htree *hash.HTree // Persistent HTree which stores this index } /* indexEntry data structure */ type indexEntry struct { WordPos map[string]string // Node id to word position array } func init() { // Make sure we can use indexEntry in a gob operation gob.Register(&indexEntry{}) } /* NewIndexManager creates a new index manager instance. */ func NewIndexManager(htree *hash.HTree) *IndexManager { return &IndexManager{htree} } /* Index indexes (inserts) a given object. */ func (im *IndexManager) Index(key string, obj map[string]string) error { return im.updateIndex(key, obj, nil) } /* Reindex reindexes (updates) a given object. */ func (im *IndexManager) Reindex(key string, newObj map[string]string, oldObj map[string]string) error { return im.updateIndex(key, newObj, oldObj) } /* Deindex deindexes (removes) a given object. */ func (im *IndexManager) Deindex(key string, obj map[string]string) error { return im.updateIndex(key, nil, obj) } /* LookupPhrase finds all nodes where an attribute contains a certain phrase. This call returns a list of node keys which contain the phrase at least once. */ func (im *IndexManager) LookupPhrase(attr, phrase string) ([]string, error) { // Chop up the phrase into words phraseWords := strings.FieldsFunc(phrase, func(r rune) bool { return !stringutil.IsAlphaNumeric(string(r)) && (unicode.IsSpace(r) || unicode.IsControl(r) || unicode.IsPunct(r)) }) // Lookup every phrase word results := make([]map[string][]uint64, len(phraseWords)) for i, phraseWord := range phraseWords { res, err := im.LookupWord(attr, phraseWord) if err != nil { return nil, &GraphError{ErrIndexError, err.Error()} } results[i] = res } if len(results) == 0 || len(results[0]) == 0 { return nil, nil } ret := make([]string, 0, len(results[0])) // Go through all found nodes and try to find a path path := make([]uint64, 0, len(phraseWords)) for key := range results[0] { path = path[:0] foundWords := im.findPhrasePath(key, 0, path, phraseWords, results) if foundWords == len(phraseWords) { // Add key to results if a path was found ret = append(ret, key) } } // Guarantee a stable result sort.StringSlice(ret).Sort() return ret, nil } /* findPhrasePath tries to find a phrase in a given set of lookup results. */ func (im *IndexManager) findPhrasePath(key string, index int, path []uint64, phraseWords []string, results []map[string][]uint64) int { // Get the results for this word index result := results[index] // Check if there is a result for the given key if posArr, ok := result[key]; ok { // Check if any of the positions is at the right place if index > 0 { // Check with previous result for _, pos := range posArr { // Check if the position array contains the expected next word position if pos == path[index-1]+1 { path = append(path, pos) break } // Abort if the expected position cannot be there if pos > path[index-1] { return len(path) } } // Do the next iteration if a position was found and // there are more words in the phrase to match if len(path) == index+1 && index < len(phraseWords)-1 { return im.findPhrasePath(key, index+1, path, phraseWords, results) } return index + 1 } // Try every position as start position in the first iteration for _, pos := range posArr { path = path[:0] path = append(path, pos) // Test if the phrase only contained one word if len(phraseWords) == 1 { return 1 } // Find the rest ret := im.findPhrasePath(key, 1, path, phraseWords, results) if ret == len(phraseWords) { return ret } } } return len(path) } /* LookupWord finds all nodes where an attribute contains a certain word. This call returns a map which maps node key to a list of word positions. */ func (im *IndexManager) LookupWord(attr, word string) (map[string][]uint64, error) { var s string if CaseSensitiveWordIndex { s = word } else { s = strings.ToLower(word) } entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s)) if err != nil { return nil, &GraphError{ErrIndexError, err.Error()} } else if entry == nil { return nil, nil } ret := make(map[string][]uint64) for k, l := range entry.(*indexEntry).WordPos { ret[k] = bitutil.UnpackList(l) } return ret, nil } /* LookupValue finds all nodes where an attribute has a certain value. This call returns a list of node keys. */ func (im *IndexManager) LookupValue(attr, value string) ([]string, error) { var entry *indexEntry var sum [16]byte if CaseSensitiveWordIndex { sum = md5.Sum([]byte(value)) } else { sum = md5.Sum([]byte(strings.ToLower(value))) } indexkey := []byte(PrefixAttrHash + attr + string(sum[:16])) // Retrieve index entry obj, err := im.htree.Get(indexkey) if err != nil { return nil, &GraphError{ErrIndexError, err.Error()} } if obj == nil { return nil, nil } entry = obj.(*indexEntry) ret := make([]string, 0, len(entry.WordPos)) for key := range entry.WordPos { ret = append(ret, key) } sort.StringSlice(ret).Sort() return ret, nil } /* Count returns the number of found nodes for a given word in a given attribute. */ func (im *IndexManager) Count(attr, word string) (int, error) { var s string if CaseSensitiveWordIndex { s = word } else { s = strings.ToLower(word) } entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s)) if err != nil { return 0, &GraphError{ErrIndexError, err.Error()} } else if entry == nil { return 0, nil } return len(entry.(*indexEntry).WordPos), nil } /* updateIndex updates the index for a specific object. Depending on the new and old arguments being set a given object is either indexed/added (only new is set), deindexted/removed (only old is set) or reindexted/updated (new and old are set). */ func (im *IndexManager) updateIndex(key string, newObj map[string]string, oldObj map[string]string) error { attrMap := make(map[string][]byte) if newObj != nil && oldObj == nil { // Insert case for attr := range newObj { attrMap[attr] = nil } } else if newObj == nil && oldObj != nil { // Remove case for attr := range oldObj { attrMap[attr] = nil } } else { // Update case for attr := range newObj { attrMap[attr] = nil } for attr := range oldObj { attrMap[attr] = nil } } emptyws := newWordSet(1) for attr := range attrMap { var newwords, toadd, oldwords, toremove *wordSet newval, newok := newObj[attr] oldval, oldok := oldObj[attr] // Calculate which words to add or remove newwords = emptyws oldwords = emptyws if newok { newwords = extractWords(newval) } // At this point we have only words to add toadd = newwords toremove = emptyws if oldok { oldwords = extractWords(oldval) if !oldwords.Empty() && !newwords.Empty() { // Here a diff is necessary toadd = copyWordSet(newwords) toadd.RemoveAll(oldwords) toremove = oldwords toremove.RemoveAll(newwords) } else { // Either no new words or no old words toremove = oldwords } } // Add and remove index entries for w, p := range toremove.set { if err := im.removeIndexEntry(key, attr, w, p); err != nil { return &GraphError{ErrIndexError, err.Error()} } } for w, p := range toadd.set { if err := im.addIndexEntry(key, attr, w, p); err != nil { return &GraphError{ErrIndexError, err.Error()} } } // Update hash lookup if newok && oldok { // Update hash entry if err := im.removeIndexHashEntry(key, attr, oldval); err != nil { return &GraphError{ErrIndexError, err.Error()} } else if err := im.addIndexHashEntry(key, attr, newval); err != nil { return &GraphError{ErrIndexError, err.Error()} } } else if newok && !oldok { // Insert hash entry if err := im.addIndexHashEntry(key, attr, newval); err != nil { return &GraphError{ErrIndexError, err.Error()} } } else if oldok { // Delete old hash entry if err := im.removeIndexHashEntry(key, attr, oldval); err != nil { return &GraphError{ErrIndexError, err.Error()} } } } return nil } /* addIndexHashEntry add a hash entry from the index. A hash entry stores a whole value as MD5 sum. */ func (im *IndexManager) addIndexHashEntry(key string, attr string, value string) error { var entry *indexEntry var sum [16]byte if CaseSensitiveWordIndex { sum = md5.Sum([]byte(value)) } else { sum = md5.Sum([]byte(strings.ToLower(value))) } indexkey := []byte(PrefixAttrHash + attr + string(sum[:16])) // Retrieve index entry obj, err := im.htree.Get(indexkey) if err != nil { return err } if obj == nil { entry = &indexEntry{make(map[string]string)} } else { entry = obj.(*indexEntry) } entry.WordPos[key] = "" _, err = im.htree.Put(indexkey, entry) return err } /* removeIndexHashEntry removes a hash entry from the index. A hash entry stores a whole value as MD5 sum. */ func (im *IndexManager) removeIndexHashEntry(key string, attr string, value string) error { var entry *indexEntry var sum [16]byte if CaseSensitiveWordIndex { sum = md5.Sum([]byte(value)) } else { sum = md5.Sum([]byte(strings.ToLower(value))) } indexkey := []byte(PrefixAttrHash + attr + string(sum[:16])) // Retrieve index entry obj, err := im.htree.Get(indexkey) if err != nil { return err } if obj == nil { return nil } entry = obj.(*indexEntry) delete(entry.WordPos, key) if len(entry.WordPos) == 0 { im.htree.Remove(indexkey) } else { im.htree.Put(indexkey, entry) } return err } /* removeIndexEntry removes an entry from the index. */ func (im *IndexManager) removeIndexEntry(key string, attr string, word string, pos []uint64) error { var entry *indexEntry indexkey := []byte(PrefixAttrWord + attr + word) // Retrieve index entry obj, err := im.htree.Get(indexkey) if err != nil { return err } if obj == nil { return nil } entry = obj.(*indexEntry) // Remove given pos from existing pos information if keyentry, ok := entry.WordPos[key]; ok { keyentrylist := bitutil.UnpackList(keyentry) res := make([]uint64, 0, len(keyentrylist)) remLookup := make(map[uint64]bool) for _, item := range pos { remLookup[item] = true } for _, item := range keyentrylist { if _, ok := remLookup[item]; !ok { res = append(res, item) } } if len(res) == 0 { delete(entry.WordPos, key) } else { entry.WordPos[key] = bitutil.PackList(res, res[len(res)-1]) } } if len(entry.WordPos) == 0 { _, err = im.htree.Remove(indexkey) } else { _, err = im.htree.Put(indexkey, entry) } return err } /* addIndexEntry adds an entry to the index. */ func (im *IndexManager) addIndexEntry(key string, attr string, word string, pos []uint64) error { var entry *indexEntry indexkey := []byte(PrefixAttrWord + attr + word) // Retrieve or create index entry obj, err := im.htree.Get(indexkey) if err != nil { return err } if obj == nil { entry = &indexEntry{make(map[string]string)} } else { entry = obj.(*indexEntry) } // Create position string if len(pos) == 0 { panic("Trying to add index entry without position information") } // Mix in given pos with existing pos information if keyentry, ok := entry.WordPos[key]; ok { pos = append(bitutil.UnpackList(keyentry), pos...) sortutil.UInt64s(pos) pos = removeDuplicates(pos) } // Rely on the fact that position arrays are ordered in ascending order maxpos := pos[len(pos)-1] // Fill the entry and store it entry.WordPos[key] = bitutil.PackList(pos, maxpos) _, err = im.htree.Put(indexkey, entry) return err } /* Remove all duplicates from a given sorted list. */ func removeDuplicates(list []uint64) []uint64 { if len(list) == 0 { return list } res := make([]uint64, 1, len(list)) res[0] = list[0] last := list[0] for _, item := range list[1:] { if item != last { res = append(res, item) last = item } } return res } /* String returns a string representation of this index manager. */ func (im *IndexManager) String() string { var buf bytes.Buffer buf.WriteString(fmt.Sprintf("IndexManager: %v\n", im.htree.Location())) it := hash.NewHTreeIterator(im.htree) for it.HasNext() { key, value := it.Next() posmap := make(map[string][]uint64) for k, v := range value.(*indexEntry).WordPos { posmap[k] = bitutil.UnpackList(v) } buf.WriteString(fmt.Sprintf(" %v%q %v\n", key[0], string(key[1:]), posmap)) } return buf.String() } /* extractWords extracts all words from a given string and return a wordSet which contains all words and their positions. */ func extractWords(s string) *wordSet { var text string if CaseSensitiveWordIndex { text = s } else { text = strings.ToLower(s) } initArrCap := int(math.Ceil(float64(len(text)) * 0.01)) if initArrCap < 4 { initArrCap = 4 } ws := newWordSet(initArrCap) var pos uint64 wstart := -1 for i, rune := range text { if !stringutil.IsAlphaNumeric(string(rune)) && (unicode.IsSpace(rune) || unicode.IsControl(rune) || unicode.IsPunct(rune)) { if wstart >= 0 { ws.Add(text[wstart:i], pos+1) pos++ wstart = -1 } } else if wstart == -1 { wstart = i } } if wstart >= 0 { ws.Add(text[wstart:], pos+1) } return ws } /* Internal data structure for sets of words and their positions. */ type wordSet struct { set map[string][]uint64 // Map which holds the data initArrCap int // Initial capacity for the position array } /* newWordSet creates a new word set. */ func newWordSet(initArrCap int) *wordSet { return &wordSet{make(map[string][]uint64), initArrCap} } /* copyWordSet creates a new word set from a present one. */ func copyWordSet(ws *wordSet) *wordSet { ret := &wordSet{make(map[string][]uint64), ws.initArrCap} ret.AddAll(ws) return ret } /* Add adds a word to the word set. Returns true if the word was added and false if an existing entry was updated. */ func (ws *wordSet) Add(s string, pos uint64) bool { v, ok := ws.set[s] if !ok { ws.set[s] = make([]uint64, 1, ws.initArrCap) ws.set[s][0] = pos } else { // Make sure the largest entry is always last l := len(ws.set[s]) if ws.set[s][l-1] < pos { ws.set[s] = append(v, pos) } else { // Make sure there is no double entry for _, ex := range v { if ex == pos { return !ok } } ws.set[s] = append(v, pos) sortutil.UInt64s(ws.set[s]) } } return !ok } /* AddAll adds all words from another word set to this word set. */ func (ws *wordSet) AddAll(ws2 *wordSet) { for s, val := range ws2.set { for _, v := range val { ws.Add(s, v) } } } /* Empty checks if this word set is empty. */ func (ws *wordSet) Empty() bool { return len(ws.set) == 0 } /* Has checks if this word set has a certain word. */ func (ws *wordSet) Has(s string) bool { _, ok := ws.set[s] return ok } /* Pos returns the positions of a certain word. */ func (ws *wordSet) Pos(s string) []uint64 { if pos, ok := ws.set[s]; ok { return pos } return nil } /* Remove removes a word from the word set. */ func (ws *wordSet) Remove(s string, pos uint64) { if posArr, ok := ws.set[s]; ok { // Look for the position for i, p := range posArr { if p == pos { posArr := append(posArr[:i], posArr[i+1:]...) ws.set[s] = posArr break } } // Remove the word if no more positions are left if len(ws.set[s]) == 0 { delete(ws.set, s) } } } /* RemoveAll removes all words from another word set from this word set. */ func (ws *wordSet) RemoveAll(ws2 *wordSet) { for s, posArr2 := range ws2.set { if posArr, ok := ws.set[s]; ok { j := 0 for i := 0; i < len(posArr2); i++ { for ; j < len(posArr); j++ { if posArr[j] == posArr2[i] { // If a matching entry was found remove it posArr = append(posArr[:j], posArr[j+1:]...) ws.set[s] = posArr break } else if posArr[j] > posArr2[i] { // Skip over if a position is not in the current posArr break } } } } // Remove the word if no more positions are left if len(ws.set[s]) == 0 { delete(ws.set, s) } } } /* String returns a string representation of this word set. */ func (ws *wordSet) String() string { var buf bytes.Buffer c := make([]string, 0, len(ws.set)) for s := range ws.set { c = append(c, s) } sort.StringSlice(c).Sort() buf.WriteString("WordSet:\n") for _, k := range c { buf.WriteString(fmt.Sprintf(" %v %v\n", k, ws.set[k])) } return buf.String() }