krotik
/
eliasdb


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558
							/*
 * EliasDB
 *
 * Copyright 2016 Matthias Ladkau. All rights reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

package parser

import (
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"devt.de/krotik/common/stringutil"
)

/*
LexToken represents a token which is returned by the lexer.
*/
type LexToken struct {
	ID    LexTokenID // Token kind
	Pos   int        // Starting position (in runes)
	Val   string     // Token value
	Lline int        // Line in the input this token appears
	Lpos  int        // Position in the input line this token appears
}

/*
PosString returns the position of this token in the origianl input as a string.
*/
func (t LexToken) PosString() string {
	return fmt.Sprintf("Line %v, Pos %v", t.Lline, t.Lpos)
}

/*
String returns a string representation of a token.
*/
func (t LexToken) String() string {

	switch {

	case t.ID == TokenEOF:
		return "EOF"

	case t.ID == TokenError:
		return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())

	case t.ID > TOKENodeSYMBOLS && t.ID < TOKENodeKEYWORDS:
		return fmt.Sprintf("%s", strings.ToUpper(t.Val))

	case t.ID > TOKENodeKEYWORDS:
		return fmt.Sprintf("<%s>", strings.ToUpper(t.Val))

	case len(t.Val) > 10:

		// Special case for very long values

		return fmt.Sprintf("%.10q...", t.Val)
	}

	return fmt.Sprintf("%q", t.Val)
}

/*
Map of keywords - these require spaces between them
*/
var keywordMap = map[string]LexTokenID{
	"get":           TokenGET,
	"lookup":        TokenLOOKUP,
	"from":          TokenFROM,
	"group":         TokenGROUP,
	"with":          TokenWITH,
	"filtering":     TokenFILTERING,
	"ordering":      TokenORDERING,
	"nulltraversal": TokenNULLTRAVERSAL,
	"where":         TokenWHERE,
	"traverse":      TokenTRAVERSE,
	"end":           TokenEND,
	"primary":       TokenPRIMARY,
	"show":          TokenSHOW,
	"as":            TokenAS,
	"format":        TokenFORMAT,
	"and":           TokenAND,
	"or":            TokenOR,
	"like":          TokenLIKE,
	"in":            TokenIN,
	"contains":      TokenCONTAINS,
	"beginswith":    TokenBEGINSWITH,
	"endswith":      TokenENDSWITH,
	"containsnot":   TokenCONTAINSNOT,
	"not":           TokenNOT,
	"notin":         TokenNOTIN,
	"false":         TokenFALSE,
	"true":          TokenTRUE,
	"unique":        TokenUNIQUE,
	"uniquecount":   TokenUNIQUECOUNT,
	"null":          TokenNULL,
	"isnotnull":     TokenISNOTNULL,
	"ascending":     TokenASCENDING,
	"descending":    TokenDESCENDING,
}

/*
Special symbols which will always be unique - these will separate unquoted strings
*/
var symbolMap = map[string]LexTokenID{
	"@":  TokenAT,
	">=": TokenGEQ,
	"<=": TokenLEQ,
	"!=": TokenNEQ,
	"=":  TokenEQ,
	">":  TokenGT,
	"<":  TokenLT,
	"(":  TokenLPAREN,
	")":  TokenRPAREN,
	"[":  TokenLBRACK,
	"]":  TokenRBRACK,
	",":  TokenCOMMA,
	"+":  TokenPLUS,
	"-":  TokenMINUS,
	"*":  TokenTIMES,
	"/":  TokenDIV,
	"//": TokenDIVINT,
	"%":  TokenMODINT,
}

// Lexer
// =====

/*
RuneEOF is a special rune which represents the end of the input
*/
const RuneEOF = -1

/*
Function which represents the current state of the lexer and returns the next state
*/
type lexFunc func(*lexer) lexFunc

/*
Lexer data structure
*/
type lexer struct {
	name   string        // Name to identify the input
	input  string        // Input string of the lexer
	pos    int           // Current rune pointer
	line   int           // Current line pointer
	lastnl int           // Last newline position
	width  int           // Width of last rune
	start  int           // Start position of the current read token
	scope  LexTokenID    // Current scope
	tokens chan LexToken // Channel for lexer output
}

/*
FirstWord returns the first word of a given input.
*/
func FirstWord(input string) string {
	var word string
	l := &lexer{"", input, 0, 0, 0, 0, 0, -1, nil}

	if skipWhiteSpace(l) {
		l.startNew()
		lexTextBlock(l, false)
		word = input[l.start:l.pos]
	}

	return word
}

/*
Lex lexes a given input. Returns a channel which contains tokens.
*/
func Lex(name string, input string) chan LexToken {
	l := &lexer{name, input, 0, 0, 0, 0, 0, -1, make(chan LexToken)}
	go l.run()
	return l.tokens
}

/*
LexToList lexes a given input. Returns a list of tokens.
*/
func LexToList(name string, input string) []LexToken {
	var tokens []LexToken

	for t := range Lex(name, input) {
		tokens = append(tokens, t)
	}

	return tokens
}

/*
Main look of the lexer.
*/
func (l *lexer) run() {

	if skipWhiteSpace(l) {
		for state := lexToken; state != nil; {
			state = state(l)

			if !skipWhiteSpace(l) {
				break
			}
		}
	}

	close(l.tokens)
}

/*
next returns the next rune in the input and advances the current rune pointer
if the peek flag is not set. If the peek flag is set then the rune pointer
is not advanced.
*/
func (l *lexer) next(peek bool) rune {

	// Check if we reached the end

	if int(l.pos) >= len(l.input) {
		return RuneEOF
	}

	// Decode the next rune

	r, w := utf8.DecodeRuneInString(l.input[l.pos:])

	if !peek {
		l.width = w
		l.pos += l.width
	}

	return r
}

/*
backup sets the pointer one rune back. Can only be called once per next call.
*/
func (l *lexer) backup() {
	if l.width == -1 {
		panic("Can only backup once per next call")
	}

	l.pos -= l.width
	l.width = -1
}

/*
startNew starts a new token.
*/
func (l *lexer) startNew() {
	l.start = l.pos
}

/*
emitToken passes a token back to the client.
*/
func (l *lexer) emitToken(t LexTokenID) {
	if t == TokenEOF {
		l.emitTokenAndValue(t, "")
		return
	}

	if l.tokens != nil {
		l.tokens <- LexToken{t, l.start, l.input[l.start:l.pos],
			l.line + 1, l.start - l.lastnl + 1}
	}
}

/*
emitTokenAndValue passes a token with a given value back to the client.
*/
func (l *lexer) emitTokenAndValue(t LexTokenID, val string) {
	if l.tokens != nil {
		l.tokens <- LexToken{t, l.start, val, l.line + 1, l.start - l.lastnl + 1}
	}
}

/*
emitError passes an error token back to the client.
*/
func (l *lexer) emitError(msg string) {
	if l.tokens != nil {
		l.tokens <- LexToken{TokenError, l.start, msg, l.line + 1, l.start - l.lastnl + 1}
	}
}

// State functions
// ===============

/*
lexToken is the main entry function for the lexer.
*/
func lexToken(l *lexer) lexFunc {

	// Check if we got a quoted value or a comment

	n1 := l.next(false)
	n2 := l.next(true)
	l.backup()

	if n1 == '#' {
		return skipRestOfLine
	}

	if (n1 == '"' || n1 == '\'') || (n1 == 'r' && (n2 == '"' || n2 == '\'')) {
		return lexValue
	}

	// Lex a block of text and emit any found tokens

	l.startNew()
	lexTextBlock(l, true)

	// Try to lookup the keyword or an unquoted value

	keywordCandidate := strings.ToLower(l.input[l.start:l.pos])

	token, ok := keywordMap[keywordCandidate]

	if !ok {
		token, ok = symbolMap[keywordCandidate]
	}

	if ok {

		// Special start token was found

		l.emitToken(token)

		switch token {
		case TokenGET:
			l.scope = token
			return lexNodeKind
		case TokenLOOKUP:
			l.scope = token
			return lexNodeKind
		}

	} else {

		// An unknown token was found - it must be an unquoted value
		// emit and continue

		l.emitToken(TokenVALUE)
	}

	return lexToken
}

/*
skipRestOfLine skips all characters until the next newline character.
*/
func skipRestOfLine(l *lexer) lexFunc {
	r := l.next(false)

	for r != '\n' && r != RuneEOF {
		r = l.next(false)
	}

	if r == RuneEOF {
		return nil
	}

	return lexToken
}

/*
lexNodeKind lexes a node kind string.
*/
func lexNodeKind(l *lexer) lexFunc {
	l.startNew()
	lexTextBlock(l, false)

	nodeKindCandidate := strings.ToLower(l.input[l.start:l.pos])
	if !stringutil.IsAlphaNumeric(nodeKindCandidate) {
		l.emitError("Invalid node kind " + fmt.Sprintf("'%v'", nodeKindCandidate) +
			" - can only contain [a-zA-Z0-9_]")
		return nil
	}

	l.emitToken(TokenNODEKIND)

	if l.scope == TokenGET {
		return lexToken
	}

	// In a lookup scope more values are following

	return lexValue
}

/*
lexValue lexes a value which can describe names, values, regexes, etc ...

Values can be declared in different ways:

' ... ' or " ... "
Characters are parsed between quotes (escape sequences are interpreted)

r' ... ' or r" ... "
Characters are parsed plain between quote
*/
func lexValue(l *lexer) lexFunc {
	var endToken rune

	l.startNew()
	allowEscapes := false

	r := l.next(false)

	// Check if we have a raw quoted string

	if q := l.next(true); r == 'r' && (q == '"' || q == '\'') {
		endToken = q
		l.next(false)
	} else if r == '"' || r == '\'' {
		allowEscapes = true
		endToken = r
	} else {
		l.emitError("Value expected")
		return nil
	}

	r = l.next(false)
	rprev := ' '
	lLine := l.line
	lLastnl := l.lastnl

	for (!allowEscapes && r != endToken) ||
		(allowEscapes && (r != endToken || rprev == '\\')) {

		if r == '\n' {
			lLine++
			lLastnl = l.pos
		}
		rprev = r
		r = l.next(false)

		if r == RuneEOF {
			l.emitError("Unexpected end while reading value")
			return nil
		}
	}

	if allowEscapes {
		val := l.input[l.start+1 : l.pos-1]

		// Interpret escape sequences right away

		if endToken == '\'' {

			// Escape double quotes in a single quoted string

			val = strings.Replace(val, "\"", "\\\"", -1)
		}

		s, err := strconv.Unquote("\"" + val + "\"")
		if err != nil {
			l.emitError(err.Error() + " while parsing escape sequences")
			return nil
		}

		l.emitTokenAndValue(TokenVALUE, s)

	} else {
		l.emitTokenAndValue(TokenVALUE, l.input[l.start+2:l.pos-1])

	}

	//  Set newline

	l.line = lLine
	l.lastnl = lLastnl

	return lexToken
}

// Helper functions
// ================

/*
skipWhiteSpace skips any number of whitespace characters. Returns false if the parser
reaches EOF while skipping whitespaces.
*/
func skipWhiteSpace(l *lexer) bool {
	r := l.next(false)
	for unicode.IsSpace(r) || unicode.IsControl(r) || r == RuneEOF {
		if r == '\n' {
			l.line++
			l.lastnl = l.pos
		}
		r = l.next(false)

		if r == RuneEOF {
			l.emitToken(TokenEOF)
			return false
		}
	}

	l.backup()
	return true
}

/*
lexTextBlock lexes a block of text without whitespaces. Interprets
optionally all one or two letter tokens.
*/
func lexTextBlock(l *lexer, interpretToken bool) {

	r := l.next(false)

	if interpretToken {

		// Check if we start with a known symbol

		nr := l.next(true)
		if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {
			l.next(false)
			return
		}

		if _, ok := symbolMap[strings.ToLower(string(r))]; ok {
			return
		}
	}

	for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {

		if interpretToken {

			// Check if we find a token in the block

			if _, ok := symbolMap[strings.ToLower(string(r))]; ok {
				l.backup()
				return
			}

			nr := l.next(true)
			if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {
				l.backup()
				return
			}
		}

		r = l.next(false)
	}

	if r != RuneEOF {
		l.backup()
	}
}