| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909 | /* * EliasDB * * Copyright 2016 Matthias Ladkau. All rights reserved. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */package utilimport (	"bytes"	"crypto/md5"	"encoding/gob"	"fmt"	"math"	"sort"	"strings"	"unicode"	"devt.de/krotik/common/bitutil"	"devt.de/krotik/common/sortutil"	"devt.de/krotik/common/stringutil"	"devt.de/krotik/eliasdb/hash")/*MaxKeysetSize is the maximum number of keys for a single word lookup.*/const MaxKeysetSize = 1000/*CaseSensitiveWordIndex is a flag to indicate if the index should be case sensitive.*/var CaseSensitiveWordIndex = false/*PrefixAttrWord is the prefix used for word index entries*/const PrefixAttrWord = string(0x01)/*PrefixAttrHash is the prefix used for hashes of attribute values*/const PrefixAttrHash = string(0x02)/*IndexManager data structure*/type IndexManager struct {	htree *hash.HTree // Persistent HTree which stores this index}/*indexEntry data structure*/type indexEntry struct {	WordPos map[string]string // Node id to word position array}func init() {	// Make sure we can use indexEntry in a gob operation	gob.Register(&indexEntry{})}/*NewIndexManager creates a new index manager instance.*/func NewIndexManager(htree *hash.HTree) *IndexManager {	return &IndexManager{htree}}/*Index indexes (inserts) a given object.*/func (im *IndexManager) Index(key string, obj map[string]string) error {	return im.updateIndex(key, obj, nil)}/*Reindex reindexes (updates) a given object.*/func (im *IndexManager) Reindex(key string, newObj map[string]string,	oldObj map[string]string) error {	return im.updateIndex(key, newObj, oldObj)}/*Deindex deindexes (removes) a given object.*/func (im *IndexManager) Deindex(key string, obj map[string]string) error {	return im.updateIndex(key, nil, obj)}/*LookupPhrase finds all nodes where an attribute contains a certain phrase. Thiscall returns a list of node keys which contain the phrase at least once.*/func (im *IndexManager) LookupPhrase(attr, phrase string) ([]string, error) {	// Chop up the phrase into words	phraseWords := strings.FieldsFunc(phrase, func(r rune) bool {		return !stringutil.IsAlphaNumeric(string(r)) && (unicode.IsSpace(r) || unicode.IsControl(r) || unicode.IsPunct(r))	})	// Lookup every phrase word	results := make([]map[string][]uint64, len(phraseWords))	for i, phraseWord := range phraseWords {		res, err := im.LookupWord(attr, phraseWord)		if err != nil {			return nil, &GraphError{ErrIndexError, err.Error()}		}		results[i] = res	}	if len(results) == 0 || len(results[0]) == 0 {		return nil, nil	}	ret := make([]string, 0, len(results[0]))	// Go through all found nodes and try to find a path	path := make([]uint64, 0, len(phraseWords))	for key := range results[0] {		path = path[:0]		foundWords := im.findPhrasePath(key, 0, path, phraseWords, results)		if foundWords == len(phraseWords) {			// Add key to results if a path was found			ret = append(ret, key)		}	}	// Guarantee a stable result	sort.StringSlice(ret).Sort()	return ret, nil}/*findPhrasePath tries to find a phrase in a given set of lookup results.*/func (im *IndexManager) findPhrasePath(key string, index int, path []uint64,	phraseWords []string, results []map[string][]uint64) int {	// Get the results for this word index	result := results[index]	// Check if there is a result for the given key	if posArr, ok := result[key]; ok {		// Check if any of the positions is at the right place		if index > 0 {			// Check with previous result			for _, pos := range posArr {				// Check if the position array contains the expected next word position				if pos == path[index-1]+1 {					path = append(path, pos)					break				}				// Abort if the expected position cannot be there				if pos > path[index-1] {					return len(path)				}			}			// Do the next iteration if a position was found and			// there are more words in the phrase to match			if len(path) == index+1 && index < len(phraseWords)-1 {				return im.findPhrasePath(key, index+1, path, phraseWords, results)			}			return index + 1		}		// Try every position as start position in the first iteration		for _, pos := range posArr {			path = path[:0]			path = append(path, pos)			// Test if the phrase only contained one word			if len(phraseWords) == 1 {				return 1			}			// Find the rest			ret := im.findPhrasePath(key, 1, path, phraseWords, results)			if ret == len(phraseWords) {				return ret			}		}	}	return len(path)}/*LookupWord finds all nodes where an attribute contains a certain word. This call returnsa map which maps node key to a list of word positions.*/func (im *IndexManager) LookupWord(attr, word string) (map[string][]uint64, error) {	var s string	if CaseSensitiveWordIndex {		s = word	} else {		s = strings.ToLower(word)	}	entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s))	if err != nil {		return nil, &GraphError{ErrIndexError, err.Error()}	} else if entry == nil {		return nil, nil	}	ret := make(map[string][]uint64)	for k, l := range entry.(*indexEntry).WordPos {		ret[k] = bitutil.UnpackList(l)	}	return ret, nil}/*LookupValue finds all nodes where an attribute has a certain value. This callreturns a list of node keys.*/func (im *IndexManager) LookupValue(attr, value string) ([]string, error) {	var entry *indexEntry	var sum [16]byte	if CaseSensitiveWordIndex {		sum = md5.Sum([]byte(value))	} else {		sum = md5.Sum([]byte(strings.ToLower(value)))	}	indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))	// Retrieve index entry	obj, err := im.htree.Get(indexkey)	if err != nil {		return nil, &GraphError{ErrIndexError, err.Error()}	}	if obj == nil {		return nil, nil	}	entry = obj.(*indexEntry)	ret := make([]string, 0, len(entry.WordPos))	for key := range entry.WordPos {		ret = append(ret, key)	}	sort.StringSlice(ret).Sort()	return ret, nil}/*Count returns the number of found nodes for a given word in a given attribute.*/func (im *IndexManager) Count(attr, word string) (int, error) {	var s string	if CaseSensitiveWordIndex {		s = word	} else {		s = strings.ToLower(word)	}	entry, err := im.htree.Get([]byte(PrefixAttrWord + attr + s))	if err != nil {		return 0, &GraphError{ErrIndexError, err.Error()}	} else if entry == nil {		return 0, nil	}	return len(entry.(*indexEntry).WordPos), nil}/*updateIndex updates the index for a specific object. Depending on thenew and old arguments being set a given object is either indexed/added(only new is set), deindexted/removed (only old is set) or reindexted/updated(new and old are set).*/func (im *IndexManager) updateIndex(key string, newObj map[string]string,	oldObj map[string]string) error {	attrMap := make(map[string][]byte)	if newObj != nil && oldObj == nil {		// Insert case		for attr := range newObj {			attrMap[attr] = nil		}	} else if newObj == nil && oldObj != nil {		// Remove case		for attr := range oldObj {			attrMap[attr] = nil		}	} else {		// Update case		for attr := range newObj {			attrMap[attr] = nil		}		for attr := range oldObj {			attrMap[attr] = nil		}	}	emptyws := newWordSet(1)	for attr := range attrMap {		var newwords, toadd, oldwords, toremove *wordSet		newval, newok := newObj[attr]		oldval, oldok := oldObj[attr]		// Calculate which words to add or remove		newwords = emptyws		oldwords = emptyws		if newok {			newwords = extractWords(newval)		}		// At this point we have only words to add		toadd = newwords		toremove = emptyws		if oldok {			oldwords = extractWords(oldval)			if !oldwords.Empty() && !newwords.Empty() {				// Here a diff is necessary				toadd = copyWordSet(newwords)				toadd.RemoveAll(oldwords)				toremove = oldwords				toremove.RemoveAll(newwords)			} else {				// Either no new words or no old words				toremove = oldwords			}		}		// Add and remove index entries		for w, p := range toremove.set {			if err := im.removeIndexEntry(key, attr, w, p); err != nil {				return &GraphError{ErrIndexError, err.Error()}			}		}		for w, p := range toadd.set {			if err := im.addIndexEntry(key, attr, w, p); err != nil {				return &GraphError{ErrIndexError, err.Error()}			}		}		// Update hash lookup		if newok && oldok {			// Update hash entry			if err := im.removeIndexHashEntry(key, attr, oldval); err != nil {				return &GraphError{ErrIndexError, err.Error()}			} else if err := im.addIndexHashEntry(key, attr, newval); err != nil {				return &GraphError{ErrIndexError, err.Error()}			}		} else if newok && !oldok {			// Insert hash entry			if err := im.addIndexHashEntry(key, attr, newval); err != nil {				return &GraphError{ErrIndexError, err.Error()}			}		} else if oldok {			// Delete old hash entry			if err := im.removeIndexHashEntry(key, attr, oldval); err != nil {				return &GraphError{ErrIndexError, err.Error()}			}		}	}	return nil}/*addIndexHashEntry add a hash entry from the index. A hash entry stores a wholevalue as MD5 sum.*/func (im *IndexManager) addIndexHashEntry(key string, attr string, value string) error {	var entry *indexEntry	var sum [16]byte	if CaseSensitiveWordIndex {		sum = md5.Sum([]byte(value))	} else {		sum = md5.Sum([]byte(strings.ToLower(value)))	}	indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))	// Retrieve index entry	obj, err := im.htree.Get(indexkey)	if err != nil {		return err	}	if obj == nil {		entry = &indexEntry{make(map[string]string)}	} else {		entry = obj.(*indexEntry)	}	entry.WordPos[key] = ""	_, err = im.htree.Put(indexkey, entry)	return err}/*removeIndexHashEntry removes a hash entry from the index. A hash entry stores a wholevalue as MD5 sum.*/func (im *IndexManager) removeIndexHashEntry(key string, attr string, value string) error {	var entry *indexEntry	var sum [16]byte	if CaseSensitiveWordIndex {		sum = md5.Sum([]byte(value))	} else {		sum = md5.Sum([]byte(strings.ToLower(value)))	}	indexkey := []byte(PrefixAttrHash + attr + string(sum[:16]))	// Retrieve index entry	obj, err := im.htree.Get(indexkey)	if err != nil {		return err	}	if obj == nil {		return nil	}	entry = obj.(*indexEntry)	delete(entry.WordPos, key)	if len(entry.WordPos) == 0 {		im.htree.Remove(indexkey)	} else {		im.htree.Put(indexkey, entry)	}	return err}/*removeIndexEntry removes an entry from the index.*/func (im *IndexManager) removeIndexEntry(key string, attr string, word string, pos []uint64) error {	var entry *indexEntry	indexkey := []byte(PrefixAttrWord + attr + word)	// Retrieve index entry	obj, err := im.htree.Get(indexkey)	if err != nil {		return err	}	if obj == nil {		return nil	}	entry = obj.(*indexEntry)	// Remove given pos from existing pos information	if keyentry, ok := entry.WordPos[key]; ok {		keyentrylist := bitutil.UnpackList(keyentry)		res := make([]uint64, 0, len(keyentrylist))		remLookup := make(map[uint64]bool)		for _, item := range pos {			remLookup[item] = true		}		for _, item := range keyentrylist {			if _, ok := remLookup[item]; !ok {				res = append(res, item)			}		}		if len(res) == 0 {			delete(entry.WordPos, key)		} else {			entry.WordPos[key] = bitutil.PackList(res, res[len(res)-1])		}	}	if len(entry.WordPos) == 0 {		_, err = im.htree.Remove(indexkey)	} else {		_, err = im.htree.Put(indexkey, entry)	}	return err}/*addIndexEntry adds an entry to the index.*/func (im *IndexManager) addIndexEntry(key string, attr string, word string, pos []uint64) error {	var entry *indexEntry	indexkey := []byte(PrefixAttrWord + attr + word)	// Retrieve or create index entry	obj, err := im.htree.Get(indexkey)	if err != nil {		return err	}	if obj == nil {		entry = &indexEntry{make(map[string]string)}	} else {		entry = obj.(*indexEntry)	}	// Create position string	if len(pos) == 0 {		panic("Trying to add index entry without position information")	}	// Mix in given pos with existing pos information	if keyentry, ok := entry.WordPos[key]; ok {		pos = append(bitutil.UnpackList(keyentry), pos...)		sortutil.UInt64s(pos)		pos = removeDuplicates(pos)	}	// Rely on the fact that position arrays are ordered in ascending order	maxpos := pos[len(pos)-1]	// Fill the entry and store it	entry.WordPos[key] = bitutil.PackList(pos, maxpos)	_, err = im.htree.Put(indexkey, entry)	return err}/*Remove all duplicates from a given sorted list.*/func removeDuplicates(list []uint64) []uint64 {	if len(list) == 0 {		return list	}	res := make([]uint64, 1, len(list))	res[0] = list[0]	last := list[0]	for _, item := range list[1:] {		if item != last {			res = append(res, item)			last = item		}	}	return res}/*String returns a string representation of this index manager.*/func (im *IndexManager) String() string {	var buf bytes.Buffer	buf.WriteString(fmt.Sprintf("IndexManager: %v\n", im.htree.Location()))	it := hash.NewHTreeIterator(im.htree)	for it.HasNext() {		key, value := it.Next()		posmap := make(map[string][]uint64)		for k, v := range value.(*indexEntry).WordPos {			posmap[k] = bitutil.UnpackList(v)		}		buf.WriteString(fmt.Sprintf("    %v%q %v\n", key[0], string(key[1:]), posmap))	}	return buf.String()}/*extractWords extracts all words from a given string and return a wordSet which containsall words and their positions.*/func extractWords(s string) *wordSet {	var text string	if CaseSensitiveWordIndex {		text = s	} else {		text = strings.ToLower(s)	}	initArrCap := int(math.Ceil(float64(len(text)) * 0.01))	if initArrCap < 4 {		initArrCap = 4	}	ws := newWordSet(initArrCap)	var pos uint64	wstart := -1	for i, rune := range text {		if !stringutil.IsAlphaNumeric(string(rune)) && (unicode.IsSpace(rune) || unicode.IsControl(rune) || unicode.IsPunct(rune)) {			if wstart >= 0 {				ws.Add(text[wstart:i], pos+1)				pos++				wstart = -1			}		} else if wstart == -1 {			wstart = i		}	}	if wstart >= 0 {		ws.Add(text[wstart:], pos+1)	}	return ws}/*Internal data structure for sets of words and their positions.*/type wordSet struct {	set        map[string][]uint64 // Map which holds the data	initArrCap int                 // Initial capacity for the position array}/*newWordSet creates a new word set.*/func newWordSet(initArrCap int) *wordSet {	return &wordSet{make(map[string][]uint64), initArrCap}}/*copyWordSet creates a new word set from a present one.*/func copyWordSet(ws *wordSet) *wordSet {	ret := &wordSet{make(map[string][]uint64), ws.initArrCap}	ret.AddAll(ws)	return ret}/*Add adds a word to the word set. Returns true if the word was added and falseif an existing entry was updated.*/func (ws *wordSet) Add(s string, pos uint64) bool {	v, ok := ws.set[s]	if !ok {		ws.set[s] = make([]uint64, 1, ws.initArrCap)		ws.set[s][0] = pos	} else {		// Make sure the largest entry is always last		l := len(ws.set[s])		if ws.set[s][l-1] < pos {			ws.set[s] = append(v, pos)		} else {			// Make sure there is no double entry			for _, ex := range v {				if ex == pos {					return !ok				}			}			ws.set[s] = append(v, pos)			sortutil.UInt64s(ws.set[s])		}	}	return !ok}/*AddAll adds all words from another word set to this word set.*/func (ws *wordSet) AddAll(ws2 *wordSet) {	for s, val := range ws2.set {		for _, v := range val {			ws.Add(s, v)		}	}}/*Empty checks if this word set is empty.*/func (ws *wordSet) Empty() bool {	return len(ws.set) == 0}/*Has checks if this word set has a certain word.*/func (ws *wordSet) Has(s string) bool {	_, ok := ws.set[s]	return ok}/*Pos returns the positions of a certain word.*/func (ws *wordSet) Pos(s string) []uint64 {	if pos, ok := ws.set[s]; ok {		return pos	}	return nil}/*Remove removes a word from the word set.*/func (ws *wordSet) Remove(s string, pos uint64) {	if posArr, ok := ws.set[s]; ok {		// Look for the position		for i, p := range posArr {			if p == pos {				posArr := append(posArr[:i], posArr[i+1:]...)				ws.set[s] = posArr				break			}		}		// Remove the word if no more positions are left		if len(ws.set[s]) == 0 {			delete(ws.set, s)		}	}}/*RemoveAll removes all words from another word set from this word set.*/func (ws *wordSet) RemoveAll(ws2 *wordSet) {	for s, posArr2 := range ws2.set {		if posArr, ok := ws.set[s]; ok {			j := 0			for i := 0; i < len(posArr2); i++ {				for ; j < len(posArr); j++ {					if posArr[j] == posArr2[i] {						// If a matching entry was found remove it						posArr = append(posArr[:j], posArr[j+1:]...)						ws.set[s] = posArr						break					} else if posArr[j] > posArr2[i] {						// Skip over if a position is not in the current posArr						break					}				}			}		}		// Remove the word if no more positions are left		if len(ws.set[s]) == 0 {			delete(ws.set, s)		}	}}/*String returns a string representation of this word set.*/func (ws *wordSet) String() string {	var buf bytes.Buffer	c := make([]string, 0, len(ws.set))	for s := range ws.set {		c = append(c, s)	}	sort.StringSlice(c).Sort()	buf.WriteString("WordSet:\n")	for _, k := range c {		buf.WriteString(fmt.Sprintf("    %v %v\n", k, ws.set[k]))	}	return buf.String()}
 |