| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558 | /* * EliasDB * * Copyright 2016 Matthias Ladkau. All rights reserved. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */package parserimport (	"fmt"	"strconv"	"strings"	"unicode"	"unicode/utf8"	"devt.de/krotik/common/stringutil")/*LexToken represents a token which is returned by the lexer.*/type LexToken struct {	ID    LexTokenID // Token kind	Pos   int        // Starting position (in runes)	Val   string     // Token value	Lline int        // Line in the input this token appears	Lpos  int        // Position in the input line this token appears}/*PosString returns the position of this token in the origianl input as a string.*/func (t LexToken) PosString() string {	return fmt.Sprintf("Line %v, Pos %v", t.Lline, t.Lpos)}/*String returns a string representation of a token.*/func (t LexToken) String() string {	switch {	case t.ID == TokenEOF:		return "EOF"	case t.ID == TokenError:		return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())	case t.ID > TOKENodeSYMBOLS && t.ID < TOKENodeKEYWORDS:		return fmt.Sprintf("%s", strings.ToUpper(t.Val))	case t.ID > TOKENodeKEYWORDS:		return fmt.Sprintf("<%s>", strings.ToUpper(t.Val))	case len(t.Val) > 10:		// Special case for very long values		return fmt.Sprintf("%.10q...", t.Val)	}	return fmt.Sprintf("%q", t.Val)}/*Map of keywords - these require spaces between them*/var keywordMap = map[string]LexTokenID{	"get":           TokenGET,	"lookup":        TokenLOOKUP,	"from":          TokenFROM,	"group":         TokenGROUP,	"with":          TokenWITH,	"filtering":     TokenFILTERING,	"ordering":      TokenORDERING,	"nulltraversal": TokenNULLTRAVERSAL,	"where":         TokenWHERE,	"traverse":      TokenTRAVERSE,	"end":           TokenEND,	"primary":       TokenPRIMARY,	"show":          TokenSHOW,	"as":            TokenAS,	"format":        TokenFORMAT,	"and":           TokenAND,	"or":            TokenOR,	"like":          TokenLIKE,	"in":            TokenIN,	"contains":      TokenCONTAINS,	"beginswith":    TokenBEGINSWITH,	"endswith":      TokenENDSWITH,	"containsnot":   TokenCONTAINSNOT,	"not":           TokenNOT,	"notin":         TokenNOTIN,	"false":         TokenFALSE,	"true":          TokenTRUE,	"unique":        TokenUNIQUE,	"uniquecount":   TokenUNIQUECOUNT,	"null":          TokenNULL,	"isnotnull":     TokenISNOTNULL,	"ascending":     TokenASCENDING,	"descending":    TokenDESCENDING,}/*Special symbols which will always be unique - these will separate unquoted strings*/var symbolMap = map[string]LexTokenID{	"@":  TokenAT,	">=": TokenGEQ,	"<=": TokenLEQ,	"!=": TokenNEQ,	"=":  TokenEQ,	">":  TokenGT,	"<":  TokenLT,	"(":  TokenLPAREN,	")":  TokenRPAREN,	"[":  TokenLBRACK,	"]":  TokenRBRACK,	",":  TokenCOMMA,	"+":  TokenPLUS,	"-":  TokenMINUS,	"*":  TokenTIMES,	"/":  TokenDIV,	"//": TokenDIVINT,	"%":  TokenMODINT,}// Lexer// =====/*RuneEOF is a special rune which represents the end of the input*/const RuneEOF = -1/*Function which represents the current state of the lexer and returns the next state*/type lexFunc func(*lexer) lexFunc/*Lexer data structure*/type lexer struct {	name   string        // Name to identify the input	input  string        // Input string of the lexer	pos    int           // Current rune pointer	line   int           // Current line pointer	lastnl int           // Last newline position	width  int           // Width of last rune	start  int           // Start position of the current read token	scope  LexTokenID    // Current scope	tokens chan LexToken // Channel for lexer output}/*FirstWord returns the first word of a given input.*/func FirstWord(input string) string {	var word string	l := &lexer{"", input, 0, 0, 0, 0, 0, -1, nil}	if skipWhiteSpace(l) {		l.startNew()		lexTextBlock(l, false)		word = input[l.start:l.pos]	}	return word}/*Lex lexes a given input. Returns a channel which contains tokens.*/func Lex(name string, input string) chan LexToken {	l := &lexer{name, input, 0, 0, 0, 0, 0, -1, make(chan LexToken)}	go l.run()	return l.tokens}/*LexToList lexes a given input. Returns a list of tokens.*/func LexToList(name string, input string) []LexToken {	var tokens []LexToken	for t := range Lex(name, input) {		tokens = append(tokens, t)	}	return tokens}/*Main look of the lexer.*/func (l *lexer) run() {	if skipWhiteSpace(l) {		for state := lexToken; state != nil; {			state = state(l)			if !skipWhiteSpace(l) {				break			}		}	}	close(l.tokens)}/*next returns the next rune in the input and advances the current rune pointerif the peek flag is not set. If the peek flag is set then the rune pointeris not advanced.*/func (l *lexer) next(peek bool) rune {	// Check if we reached the end	if int(l.pos) >= len(l.input) {		return RuneEOF	}	// Decode the next rune	r, w := utf8.DecodeRuneInString(l.input[l.pos:])	if !peek {		l.width = w		l.pos += l.width	}	return r}/*backup sets the pointer one rune back. Can only be called once per next call.*/func (l *lexer) backup() {	if l.width == -1 {		panic("Can only backup once per next call")	}	l.pos -= l.width	l.width = -1}/*startNew starts a new token.*/func (l *lexer) startNew() {	l.start = l.pos}/*emitToken passes a token back to the client.*/func (l *lexer) emitToken(t LexTokenID) {	if t == TokenEOF {		l.emitTokenAndValue(t, "")		return	}	if l.tokens != nil {		l.tokens <- LexToken{t, l.start, l.input[l.start:l.pos],			l.line + 1, l.start - l.lastnl + 1}	}}/*emitTokenAndValue passes a token with a given value back to the client.*/func (l *lexer) emitTokenAndValue(t LexTokenID, val string) {	if l.tokens != nil {		l.tokens <- LexToken{t, l.start, val, l.line + 1, l.start - l.lastnl + 1}	}}/*emitError passes an error token back to the client.*/func (l *lexer) emitError(msg string) {	if l.tokens != nil {		l.tokens <- LexToken{TokenError, l.start, msg, l.line + 1, l.start - l.lastnl + 1}	}}// State functions// ===============/*lexToken is the main entry function for the lexer.*/func lexToken(l *lexer) lexFunc {	// Check if we got a quoted value or a comment	n1 := l.next(false)	n2 := l.next(true)	l.backup()	if n1 == '#' {		return skipRestOfLine	}	if (n1 == '"' || n1 == '\'') || (n1 == 'r' && (n2 == '"' || n2 == '\'')) {		return lexValue	}	// Lex a block of text and emit any found tokens	l.startNew()	lexTextBlock(l, true)	// Try to lookup the keyword or an unquoted value	keywordCandidate := strings.ToLower(l.input[l.start:l.pos])	token, ok := keywordMap[keywordCandidate]	if !ok {		token, ok = symbolMap[keywordCandidate]	}	if ok {		// Special start token was found		l.emitToken(token)		switch token {		case TokenGET:			l.scope = token			return lexNodeKind		case TokenLOOKUP:			l.scope = token			return lexNodeKind		}	} else {		// An unknown token was found - it must be an unquoted value		// emit and continue		l.emitToken(TokenVALUE)	}	return lexToken}/*skipRestOfLine skips all characters until the next newline character.*/func skipRestOfLine(l *lexer) lexFunc {	r := l.next(false)	for r != '\n' && r != RuneEOF {		r = l.next(false)	}	if r == RuneEOF {		return nil	}	return lexToken}/*lexNodeKind lexes a node kind string.*/func lexNodeKind(l *lexer) lexFunc {	l.startNew()	lexTextBlock(l, false)	nodeKindCandidate := strings.ToLower(l.input[l.start:l.pos])	if !stringutil.IsAlphaNumeric(nodeKindCandidate) {		l.emitError("Invalid node kind " + fmt.Sprintf("'%v'", nodeKindCandidate) +			" - can only contain [a-zA-Z0-9_]")		return nil	}	l.emitToken(TokenNODEKIND)	if l.scope == TokenGET {		return lexToken	}	// In a lookup scope more values are following	return lexValue}/*lexValue lexes a value which can describe names, values, regexes, etc ...Values can be declared in different ways:' ... ' or " ... "Characters are parsed between quotes (escape sequences are interpreted)r' ... ' or r" ... "Characters are parsed plain between quote*/func lexValue(l *lexer) lexFunc {	var endToken rune	l.startNew()	allowEscapes := false	r := l.next(false)	// Check if we have a raw quoted string	if q := l.next(true); r == 'r' && (q == '"' || q == '\'') {		endToken = q		l.next(false)	} else if r == '"' || r == '\'' {		allowEscapes = true		endToken = r	} else {		l.emitError("Value expected")		return nil	}	r = l.next(false)	rprev := ' '	lLine := l.line	lLastnl := l.lastnl	for (!allowEscapes && r != endToken) ||		(allowEscapes && (r != endToken || rprev == '\\')) {		if r == '\n' {			lLine++			lLastnl = l.pos		}		rprev = r		r = l.next(false)		if r == RuneEOF {			l.emitError("Unexpected end while reading value")			return nil		}	}	if allowEscapes {		val := l.input[l.start+1 : l.pos-1]		// Interpret escape sequences right away		if endToken == '\'' {			// Escape double quotes in a single quoted string			val = strings.Replace(val, "\"", "\\\"", -1)		}		s, err := strconv.Unquote("\"" + val + "\"")		if err != nil {			l.emitError(err.Error() + " while parsing escape sequences")			return nil		}		l.emitTokenAndValue(TokenVALUE, s)	} else {		l.emitTokenAndValue(TokenVALUE, l.input[l.start+2:l.pos-1])	}	//  Set newline	l.line = lLine	l.lastnl = lLastnl	return lexToken}// Helper functions// ================/*skipWhiteSpace skips any number of whitespace characters. Returns false if the parserreaches EOF while skipping whitespaces.*/func skipWhiteSpace(l *lexer) bool {	r := l.next(false)	for unicode.IsSpace(r) || unicode.IsControl(r) || r == RuneEOF {		if r == '\n' {			l.line++			l.lastnl = l.pos		}		r = l.next(false)		if r == RuneEOF {			l.emitToken(TokenEOF)			return false		}	}	l.backup()	return true}/*lexTextBlock lexes a block of text without whitespaces. Interpretsoptionally all one or two letter tokens.*/func lexTextBlock(l *lexer, interpretToken bool) {	r := l.next(false)	if interpretToken {		// Check if we start with a known symbol		nr := l.next(true)		if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {			l.next(false)			return		}		if _, ok := symbolMap[strings.ToLower(string(r))]; ok {			return		}	}	for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {		if interpretToken {			// Check if we find a token in the block			if _, ok := symbolMap[strings.ToLower(string(r))]; ok {				l.backup()				return			}			nr := l.next(true)			if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {				l.backup()				return			}		}		r = l.next(false)	}	if r != RuneEOF {		l.backup()	}}
 |