krotik
/
common


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
							/*
 * Public Domain Software
 *
 * I (Matthias Ladkau) am the author of the source code in this file.
 * I have placed the source code in this file in the public domain.
 *
 * For further information see: http://creativecommons.org/publicdomain/zero/1.0/
 */

package parser

import (
	"fmt"
	"regexp"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"devt.de/krotik/common/stringutil"
)

/*
LexToken represents a single token which is returned by the lexer.

Positions are derived from the lexer's internal offsets, which advance by the
encoded width of each rune; they are therefore byte based, not rune based.
*/
type LexToken struct {
	ID    LexTokenID // Token kind
	Pos   int        // Starting position in the input (byte offset)
	Val   string     // Token value
	Lline int        // Line in the input this token appears (the first line is 1)
	Lpos  int        // Position in the input line this token appears (byte based)
}

/*
PosString returns the position of this token in the original input as a
string of the form "Line <line>, Pos <position>".
*/
func (t LexToken) PosString() string {
	line, column := t.Lline, t.Lpos

	return fmt.Sprintf("Line %v, Pos %v", line, column)
}

/*
String returns a string representation of a token. Tokens with a dedicated
representation (EOF, errors, names, string, int and float values) are
decorated accordingly; every other token is rendered as its plain value.
*/
func (t LexToken) String() string {

	switch t.ID {

	case TokenEOF:
		return "EOF"

	case TokenError:
		// Errors carry the position so that they can be reported to a user
		return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())

	case TokenName:
		return fmt.Sprintf("<%s>", t.Val)

	case TokenStringValue:
		return fmt.Sprintf("\"%s\"", t.Val)

	case TokenIntValue:
		return fmt.Sprintf("int(%s)", t.Val)

	case TokenFloatValue:
		return fmt.Sprintf("flt(%s)", t.Val)

	default:
		return t.Val
	}
}

/*
SymbolMap is a map of the single-character special symbols (punctuators)
known to the lexer. The lexer uses it to end a text block when one of these
symbols is encountered and to classify a token as a punctuator.
*/
var SymbolMap = map[string]LexTokenID{
	"!": TokenPunctuator,
	"$": TokenPunctuator,
	"(": TokenPunctuator,
	")": TokenPunctuator,
	":": TokenPunctuator,
	"=": TokenPunctuator,
	"@": TokenPunctuator,
	"[": TokenPunctuator,
	"]": TokenPunctuator,
	"{": TokenPunctuator,
	"|": TokenPunctuator,
	"}": TokenPunctuator,
	// The three-character punctuator "..." is not listed here - the lexer
	// checks for it explicitly as a special case
}

// Lexer
// =====

/*
RuneEOF is a special rune which represents the end of the input. It is
returned by the lexer's rune reader once the read position has reached the
end of the input string.
*/
const RuneEOF = -1

/*
RuneComma is the rune for a comma. Commas are treated like whitespace by the
lexer (see isIgnoredRune).
*/
const RuneComma = ','

/*
lexFunc is a function which represents the current state of the lexer and
returns the next state. A nil return value ends the main loop of the lexer.
*/
type lexFunc func() lexFunc

/*
lexer is the data structure holding the state of a single lexing run.

All positions are byte offsets into input: the read position advances by the
encoded width of each consumed rune.
*/
type lexer struct {
	name   string        // Name to identify the input
	input  string        // Input string of the lexer
	pos    int           // Current read position (byte offset of the next rune)
	line   int           // Current line pointer (0-based; tokens report line + 1)
	lastnl int           // Position recorded for the last newline
	width  int           // Width in bytes of the last consumed rune
	start  int           // Start position of the token currently being read
	tokens chan LexToken // Channel for lexer output
}

/*
Lex lexes a given input. Returns a channel which contains tokens.

The lexing itself happens in a separate goroutine which sends every token on
the returned unbuffered channel and closes the channel once it is done. The
goroutine can only make progress while the caller keeps receiving.
*/
func Lex(name string, input string) chan LexToken {

	l := &lexer{
		name:   name,
		input:  input,
		tokens: make(chan LexToken),
	}

	go l.run()

	return l.tokens
}

/*
LexToList lexes a given input. Returns a list of tokens.

The function drains the channel returned by Lex until it is closed. The
result is nil if no token was produced.
*/
func LexToList(name string, input string) []LexToken {
	var collected []LexToken

	stream := Lex(name, input)

	for {
		token, open := <-stream
		if !open {
			return collected
		}

		collected = append(collected, token)
	}
}

/*
run is the main loop of the lexer. It alternates between skipping ignored
characters and executing the current state function until either the input
is exhausted or a state function returns nil. The token channel is closed
when the loop ends.
*/
func (l *lexer) run() {
	var state lexFunc = l.lexToken

	// Whitespace is skipped before the first and after every state function -
	// this is also done after the final state function returned nil

	hasInput := l.skipWhiteSpace()

	for hasInput && state != nil {
		state = state()
		hasInput = l.skipWhiteSpace()
	}

	close(l.tokens)
}

/*
next reads a rune from the input.

If the peek value is -1 or smaller then the next rune is returned and the
current read position is advanced past it; the width of the consumed rune is
recorded. If the peek value is 0 or greater then the nth rune from the current
position is returned without advancing the current read position (0 being the
rune which would be consumed next).

RuneEOF is returned if the requested rune lies beyond the end of the input.
*/
func (l *lexer) next(peek int) rune {

	// Check if we reached the end

	if l.pos >= len(l.input) {
		return RuneEOF
	}

	// Consume the next rune

	if peek < 0 {
		r, w := utf8.DecodeRuneInString(l.input[l.pos:])

		l.width = w
		l.pos += w

		return r
	}

	// Peek ahead - step over the runes in front of the requested one

	offset := l.pos

	for skipped := 0; skipped < peek; skipped++ {
		_, w := utf8.DecodeRuneInString(l.input[offset:])
		offset += w

		// Report the end of the input instead of decoding an empty string
		// (which would yield a misleading utf8.RuneError)

		if offset >= len(l.input) {
			return RuneEOF
		}
	}

	r, _ := utf8.DecodeRuneInString(l.input[offset:])

	return r
}

/*
hasSequence checks if the upcoming runes of the input match the given
sequence. The input is only peeked at - the current read position is not
changed.
*/
func (l *lexer) hasSequence(s string) bool {
	for offset, want := range stringutil.StringToRuneSlice(s) {
		if got := l.next(offset); got != want {
			return false
		}
	}

	return true
}

/*
startNew starts a new token by recording the current read position as the
start position of the token.
*/
func (l *lexer) startNew() {
	l.start = l.pos
}

/*
emitToken passes a token with a given kind and value back to the client. The
token position is taken from the current token start; the reported line is
1-based. Nothing is sent if the lexer has no output channel.
*/
func (l *lexer) emitToken(i LexTokenID, val string) {
	if l.tokens == nil {
		return
	}

	l.tokens <- LexToken{
		ID:    i,
		Pos:   l.start,
		Val:   val,
		Lline: l.line + 1,
		Lpos:  l.start - l.lastnl + 1,
	}
}

// State functions
// ===============

/*
Patterns used by lexToken to classify a text block. They are compiled once
instead of on every token.
*/
var (
	// Name - @spec 2.1.9
	lexerNameRe = regexp.MustCompile(`^[_A-Za-z][_0-9A-Za-z]*$`)

	// IntValue - @spec 2.9.1 (optional sign, no leading zeros)
	lexerIntRe = regexp.MustCompile(`^-?(?:0|[1-9][0-9]*)$`)

	// FloatValue - @spec 2.9.2. Either a mantissa with a decimal point and an
	// optional exponent or an integer with an exponent. An exponent always
	// needs at least one digit and the mantissa needs at least one digit.
	// Forms like "1." and ".5" are tolerated as they have been accepted before.
	lexerFloatRe = regexp.MustCompile(
		`^-?(?:(?:[0-9]+\.[0-9]*|\.[0-9]+)(?:[eE][+-]?[0-9]+)?|(?:0|[1-9][0-9]*)[eE][+-]?[0-9]+)$`)
)

/*
lexToken is the main entry function for the lexer. It reads the next text
block and either hands over to a specialised state function (comments and
strings) or emits exactly one token for the block:

  - a punctuator (a symbol of SymbolMap or "...")
  - a name
  - an int value
  - a float value (the value is emitted in lower case)
  - an error token carrying the unrecognised text

Returns the next state function.
*/
func (l *lexer) lexToken() lexFunc {

	l.startNew()
	l.lexTextBlock()

	token := l.input[l.start:l.pos]

	switch token {

	// Check for Comment - @spec 2.1.4, 2.1.7

	case "#":
		return l.skipRestOfLine()

	// Lexical tokens - @spec 2.1.6

	// Check for String

	case "\"":
		return l.lexStringValue()
	}

	_, isSymbol := SymbolMap[token]

	switch {

	// Check for Punctuator - @spec 2.1.8

	case isSymbol || token == "...":
		l.emitToken(TokenPunctuator, token)

	// Check for Name - @spec 2.1.9

	case lexerNameRe.MatchString(token):
		l.emitToken(TokenName, token)

	// Check for IntValue - @spec 2.9.1

	case lexerIntRe.MatchString(token):
		l.emitToken(TokenIntValue, token)

	// Check for FloatValue - @spec 2.9.2

	case lexerFloatRe.MatchString(token):
		l.emitToken(TokenFloatValue, strings.ToLower(token))

	// Everything else is an error

	default:
		l.emitToken(TokenError, token)
	}

	return l.lexToken
}

/*
lexTextBlock lexes a block of text without whitespaces. The block ends before
an ignored rune or before the start of another token (a symbol of SymbolMap,
a comment '#', a string quote or the sequence "..."). If the block itself
starts with one of these then only that symbol is consumed.
*/
func (l *lexer) lexTextBlock() {

	// startsSymbol checks for runes which form a token on their own

	startsSymbol := func(c rune) bool {
		_, known := SymbolMap[strings.ToLower(string(c))]
		return known || c == '#' || c == '"'
	}

	// startsSpread checks for the beginning of the "..." punctuator

	startsSpread := func(c rune) bool {
		return c == '.' && l.hasSequence("...")
	}

	current := l.next(0)

	// Check if we start with a known symbol

	switch {

	case startsSymbol(current):
		l.next(-1)
		return

	case startsSpread(current):
		for consumed := 0; consumed < 3; consumed++ {
			l.next(-1)
		}
		return
	}

	for !l.isIgnoredRune(current) {
		l.next(-1)

		current = l.next(0)

		// Check if we find a token in the block

		if startsSymbol(current) || startsSpread(current) {
			return
		}
	}
}

/*
lexStringValue lexes a string value either as a simple string or a block string.
The opening quote is expected to be consumed already.

Values can be declared in different ways:

" ... " A normal string (escape sequences are interpreted)

""" ... """ A multi-line string (escape sequences are not interpreted)

A backslash always protects the rune following it from being treated as the
end of the string - this includes a backslash directly after the opening
quote and consecutive escape sequences such as "\\".

Emits a string value token and returns the next state function. If the input
ends inside the string or an escape sequence cannot be interpreted then an
error token is emitted and nil is returned.
*/
func (l *lexer) lexStringValue() lexFunc {

	// String value lexing - @spec 2.9.4

	// Lookahead 2 runes - two further quotes start a block string

	isBlockString := l.next(0) == '"' && l.next(1) == '"'

	if isBlockString {

		// Consume the initial quotes for blockstrings

		l.next(-1)
		l.next(-1)
	}

	// isEnd checks if the just consumed rune closes the string. A block
	// string needs two further quotes after the consumed one.

	isEnd := func(r rune) bool {
		if r != '"' {
			return false
		}
		return !isBlockString || (l.next(0) == '"' && l.next(1) == '"')
	}

	// Newlines are tracked locally so the emitted token still reports the
	// line on which the string started

	lLine := l.line
	lLastnl := l.lastnl

	for {
		r := l.next(-1)

		if r == RuneEOF {
			l.emitToken(TokenError, "EOF inside quotes")
			return nil
		}

		if r == '\\' {

			// Consume the escaped rune - it can never end the string. The
			// rune after it is examined normally by the next iteration.

			if l.next(-1) == '\n' {
				lLine++
				lLastnl = l.pos
			}

			continue
		}

		if r == '\n' {
			lLine++
			lLastnl = l.pos
		}

		if isEnd(r) {
			break
		}
	}

	if !isBlockString {
		val := l.input[l.start+1 : l.pos-1]

		s, err := strconv.Unquote("\"" + val + "\"")
		if err != nil {
			l.emitToken(TokenError, "Could not interpret escape sequence: "+err.Error())
			return nil
		}

		l.emitToken(TokenStringValue, s)

	} else {

		// Consume the final quotes for blockstrings

		l.next(-1)
		l.next(-1)

		token := l.input[l.start+3 : l.pos-3]

		// Since block strings represent freeform text often used in indented
		// positions, the string value semantics of a block string excludes uniform
		// indentation and blank initial and trailing lines
		// (from spec about 'Block Strings')

		token = stringutil.StripUniformIndentation(token)
		token = stringutil.TrimBlankLines(token)

		l.emitToken(TokenStringValue, token)
	}

	//  Set newline

	l.line = lLine
	l.lastnl = lLastnl

	return l.lexToken
}

/*
isIgnoredRune checks if a given rune should be ignored. Ignored are
whitespace and control characters, commas, the byte order mark and the end
of the input.
*/
func (l *lexer) isIgnoredRune(r rune) bool {

	// Ignored tokens - @spec 2.1.1, 2.1.2, 2.1.3, 2.1.3, 2.1.5, 2.1.7

	switch r {
	case RuneEOF, RuneComma, '\ufeff':
		return true
	}

	return unicode.IsSpace(r) || unicode.IsControl(r)
}

/*
skipWhiteSpace skips any number of ignored characters (see isIgnoredRune) and
keeps track of newlines while doing so. Returns false if the lexer reaches
EOF while skipping; in this case an EOF token is emitted.
*/
func (l *lexer) skipWhiteSpace() bool {

	for r := l.next(0); l.isIgnoredRune(r); r = l.next(0) {

		// Record a newline before it is consumed

		if r == '\n' {
			l.line++
			l.lastnl = l.pos
		}

		l.next(-1)

		if r == RuneEOF {

			// The EOF token starts one position before the current one

			l.start = l.pos - 1
			l.emitToken(TokenEOF, "")

			return false
		}
	}

	return true
}

/*
skipRestOfLine skips all characters until the next newline character. The
newline itself is consumed and recorded. Returns the next state function or
nil if the input ended before a newline was found.
*/
func (l *lexer) skipRestOfLine() lexFunc {

	for {
		switch l.next(-1) {

		case RuneEOF:
			return nil

		case '\n':

			// The newline has been consumed already - it sits one
			// position before the current one

			l.line++
			l.lastnl = l.pos - 1

			return l.lexToken
		}
	}
}