
feat: Adding lexer for ECAL.

Matthias Ladkau 3 years ago
parent
commit
5740c81f96
5 changed files with 1404 additions and 6 deletions
  1. 1 1
      httputil/util_test.go
  2. 221 5
      lang/ecal/README.md
  3. 247 0
      lang/ecal/parser/const.go
  4. 664 0
      lang/ecal/parser/lexer.go
  5. 271 0
      lang/ecal/parser/lexer_test.go

+ 1 - 1
httputil/util_test.go

@@ -70,7 +70,7 @@ func TestCheckLocalRedirect(t *testing.T) {
 		return
 	}
 
-	if err := CheckLocalRedirect("://hans.foo/bla"); err == nil || err.Error() != "parse ://hans.foo/bla: missing protocol scheme" {
+	if err := CheckLocalRedirect("://hans.foo/bla"); err == nil || err.Error() != "parse \"://hans.foo/bla\": missing protocol scheme" {
 		t.Error(err)
 		return
 	}

+ 221 - 5
lang/ecal/README.md

@@ -1,16 +1,38 @@
 ECAL - Event Condition Action Language
 --
-ECAL is a language to create rule based system which react to events provided that a defined condition holds:
+ECAL is a language to create a rule-based system which reacts to events provided that a defined condition holds:
 
 Event -> Condition -> Action
 
-Rules are defined as event sinks and have the following form:
+The condition and action parts are defined by rules called event sinks, which are the core constructs of ECAL.
 
-sink "mysink" 
-    "
+Notation
+--
+Source code is Unicode text encoded in UTF-8. Individual statements are separated by a semicolon or a newline.
+
+Constant values are usually enclosed in double quotes "" or single quotes '', both supporting escape sequences. Constant values can also be provided as raw strings by prefixing a single or double quote with an 'r'. A raw string can contain any character including newlines and does not interpret escape sequences.
+
+Blocks are denoted with curly brackets. Most language constructs (conditions, loops, etc.) are very similar to other languages.
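+
+A small example illustrating these rules (using the assignment and comment syntax described below):
+```
+a := 1; b := 2  # Two statements on one line
+c := "a\tstring"  # Escape sequences are interpreted
+d := r"a raw \t string
+spanning two lines"
+```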
+
+Event Sinks
+--
+Sinks should have unique names which identify them and can have the following attributes:
+
+Attribute | Description
+-|-
+kindmatch  | Matching condition for the event kind e.g. db.op.TableInsert. A list of strings in dot notation which describe event kinds. May contain `*` characters as wildcards.
+scopematch | Matching condition for the event cascade scope e.g. db.dbRead db.dbWrite. A list of strings in dot notation which describe the scopes which are required for this sink to trigger.
+statematch | Match on the event state: a simple map of required key / value states in the event state. `NULL` values can be used as wildcards (i.e. the match is only on the key).
+priority | Priority of the sink. Sinks of higher priority are executed first. The higher the number the lower the priority; 0 is the highest priority.
+suppresses | A list of sink names which should be suppressed if this sink is executed.
+
+Example:
+```
+sink "mysink"
+    r"
     A comment describing the sink.
     "
-    kindmatch [ "foo", a.b.bar ],
+    kindmatch [ foo.bar.* ],
     scopematch [ "data.read", "data.write" ],
     statematch { a : 1, b : NULL },
     priority 0,
@@ -18,4 +40,198 @@ sink "mysink"
     {
       <ECAL Code>
     }
+```
+
+Events which match
+...
+
+Events which don't match
+...
+
+Functions
+--
+Functions define reusable pieces of code dedicated to performing a particular task based on a set of given input values. In ECAL functions are first-class citizens in that they can be assigned to variables, passed as arguments, immediately invoked or deferred for later execution. Each parameter can have a default value, which is NULL unless specified otherwise.
+
+Example:
+```
+func myfunc(a, b, c=1) {
+  <ECAL Code>
+}
+```
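+
+Since functions are first-class citizens they can, for example, be assigned to a variable and invoked through it later. A sketch based on the definition above (the exact reference syntax is assumed here):
+```
+f := myfunc  # Assign the function to a variable
+f(1, 2)      # Invoke it; c defaults to 1
+```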
+
+Comments
+--
+Comments are defined with `#` for single-line comments and `/*` `*/` for multi-line comments.
+A single-line comment covers all characters after the `#` until the next newline.
+```
+/*
+  Multi line comment
+  Some comment text
+*/
+
+# Single line comment
+
+a := 1 # Single line comment after a statement
+```
+
+Constant Values
+--
+Constant values are used to initialize variables or as operands in expressions.
+
+Numbers can be expressed in all common notations:
+Formatting|Description
+-|-
+123|Normal integer
+123.456|With decimal point
+1.234560e+02|Scientific notation
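+
+For example, all of the following assign valid numbers:
+```
+a := 123           # Normal integer
+b := 123.456       # With decimal point
+c := 1.234560e+02  # Scientific notation (equal to 123.456)
+```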
+
+Strings can be normal quoted strings which interpret backslash escape characters:
+```
+\a → U+0007 alert or bell
+\b → U+0008 backspace
+\f → U+000C form feed
+\n → U+000A line feed or newline
+\r → U+000D carriage return
+\t → U+0009 horizontal tab
+\v → U+000B vertical tab
+\\ → U+005C backslash
+\" → U+0022 double quote
+\uhhhh → a Unicode character whose codepoint can be expressed in 4 hexadecimal digits (padded with 0 in front)
+```
+
+Normal quoted strings also interpret inline expressions enclosed in `{}`:
+```
+"Foo bar {1+2}"
+```
+Inline expressions may also specify number formatting:
+```
+"Foo bar {1+2}.2f"
+```
+Formatting|Description
+-|-
+{}.f|With decimal point full precision
+{}.3f|Decimal point with precision 3
+{}.5w3f|5 Width with decimal point with precision 3
+{}.e|Scientific notation
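+
+For example (a sketch of the formatting directives above):
+```
+"Result: {10 / 3}.2f"  # e.g. "Result: 3.33"
+```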
+
+Strings can also be expressed in raw form which will not interpret any escape characters.
+```
+r"Foo bar {1+2}"
+```
+
+Expression|Value
+-|-
+`"foo'bar"`| `foo'bar`
+`'foo"bar'`| `foo"bar`
+`'foo\u0028bar'`| `foo(bar`
+`"foo\u0028bar"`| `foo(bar`
+`"Foo bar {1+2}"`| `Foo bar 3`
+`r"Foo bar {1+2}"`| `Foo bar {1+2}`
+
+Variable Assignments
+--
+A variable is a storage location for holding a value. Variables can hold single values (strings and numbers) or structures like an array or a map. Variable names can only contain [a-zA-Z] and [a-zA-Z0-9] from the second character.
+
+A variable is assigned with the assignment operator ':=':
+```
+a := 1
+b := "test"
+c := [1,2,3]
+d := {1:2,3:4}
+```
+Multi-assignments are possible using lists:
+```
+[a, b] := [1, 2]
+```
+
+Expressions
+--
+Variables and constants can be combined with operators to form expressions. Boolean expressions can also be formed with variables:
+```
+a := 1 + 2 * 5
+b := a > 10
+c := a == 11
+d := false or c
+```
+
+Operators
+--
+The following operators are available:
+
+Boolean: `and`, `or`, `not`, `>`, `>=`, `<`, `<=`, `==`, `!=`
+
+Arithmetic: `+`, `-`, `*`, `/`, `//` (integer division), `%` (integer modulo)
+
+String:
+Operator|Description|Example
+-|-|-
+like|Regex match|`"Hans" like "H??s"`
+hasPrefix|prefix match|`"Hans" hasPrefix "Ha"`
+hasSuffix|suffix match|`"Hans" hasSuffix "ns"`
+
+List:
+Operator|Description|Example
+-|-|-
+in|Item is in list|`6 in [1, 6, 7]`
+notin|Item is not in list|`6 notin [1, 6, 7]`
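+
+These operators can be combined into larger boolean expressions, e.g.:
+```
+a := "Hans" like "H??s" and "Hans" hasSuffix "ns"  # true
+b := 6 in [1, 6, 7] or 6 notin [1, 6, 7]           # true
+```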
+
+Composition structure access
+--
+Composition structures like lists and maps can be accessed with access operators:
+
+Structure|Accessor|Description
+-|-|-
+List|variable[index]|Access the n-th element starting from 0.
+Map|variable[field]|Access a map element by field name.
+Map|variable.field|Access a map element (the field name can only contain [a-zA-Z] and [a-zA-Z0-9] from the second character).
+```
+a := [1, 2, 3]
+b := a[1] # b has the value 2
+
+c := { "foo" : 2 }
+d := c["foo"]
+e := c.foo
+```
+
+Loop statements
+---------------
+All loops are defined as a 'for' block statement. Counting loops are defined with the 'range' function. The following code iterates from 2 until 10 in steps of 2:
+```
+for a in range(2, 10, 2) {
+  <ECAL Code>
+}
+```
+
+Conditional loops use a condition after the for statement:
+```
+for a > 0 {
+  <ECAL Code>
+}
+```
+
+It is possible to loop over lists and even have multiple assignments:
+```
+for [a, b] in [[1, 1], [2, 2], [3, 3]] {
+
+}
+```
+or
+```
+x := { "c" : 0, "a" : 2, "b" : 4}
+for [a, b] in x {
+  <ECAL Code>
+}
+```
 
+Conditional statements
+----------------------
+The "if" statement specifies the conditional execution of multiple branches based on defined conditions:
+```
+if a == 1 {
+    a := a + 1
+} elif a == 2 {
+    a := a + 2
+} else {
+    a := 99
+}
+```

+ 247 - 0
lang/ecal/parser/const.go

@@ -0,0 +1,247 @@
+/*
+ * Public Domain Software
+ *
+ * I (Matthias Ladkau) am the author of the source code in this file.
+ * I have placed the source code in this file in the public domain.
+ *
+ * For further information see: http://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+/*
+Package parser contains an ECAL parser.
+
+Lexer for Source Text
+
+Lex() is a lexer function to convert a given input text into a list of tokens.
+
+Based on a talk by Rob Pike: Lexical Scanning in Go
+
+https://www.youtube.com/watch?v=HxaD_trXwRE
+
+The lexer's output is pushed into a channel which is consumed by the parser.
+This design enables the concurrent processing of the input text by lexer and
+parser.
+
+Parser
+
+Parse() is a parser which produces a parse tree from a given set of lexer tokens.
+
+Based on an article by Douglas Crockford: Top Down Operator Precedence
+
+http://crockford.com/javascript/tdop/tdop.html
+
+which is based on the ideas of Vaughan Pratt and his paper: Top Down Operator Precedence
+
+http://portal.acm.org/citation.cfm?id=512931
+https://tdop.github.io/
+
+ParseWithRuntime() parses a given input and decorates the resulting parse tree
+with runtime components which can be used to interpret the parsed query.
+*/
+package parser
+
+/*
+LexTokenID represents a unique lexer token ID
+*/
+type LexTokenID int
+
+/*
+Available lexer token types
+*/
+const (
+	TokenError LexTokenID = iota // Lexing error token with a message as val
+	TokenEOF                     // End-of-file token
+	TokenAny                     // Unspecified token (used when building an AST from a Go map structure)
+
+	TokenCOMMENT    // Comment
+	TokenSTRING     // String constant
+	TokenNUMBER     // Number constant
+	TokenIDENTIFIER // Identifier
+
+	// Constructed tokens which are generated by the parser not the lexer
+
+	TokenSTATEMENTS // A code block
+	TokenLIST       // List value
+	TokenMAP        // Map value
+	TokenGUARD      // Guard expressions for conditional statements
+
+	TOKENodeSYMBOLS // Used to separate symbols from other tokens in this list
+
+	// Condition operators
+
+	TokenGEQ
+	TokenLEQ
+	TokenNEQ
+	TokenEQ
+	TokenGT
+	TokenLT
+
+	// Grouping symbols
+
+	TokenLPAREN
+	TokenRPAREN
+	TokenLBRACK
+	TokenRBRACK
+	TokenLBRACE
+	TokenRBRACE
+
+	// Separators
+
+	TokenDOT
+	TokenCOMMA
+	TokenCOLON
+	TokenSEMICOLON
+
+	// Arithmetic operators
+
+	TokenPLUS
+	TokenMINUS
+	TokenTIMES
+	TokenDIV
+	TokenDIVINT
+	TokenMODINT
+
+	// Assignment statement
+
+	TokenASSIGN
+
+	// Data structure access
+
+	TokenACCESS
+
+	// The colon ':' has a context specific meaning and is checked by the parser
+
+	TOKENodeKEYWORDS // Used to separate keywords from other tokens in this list
+
+	// Sink definition
+
+	TokenSINK
+	TokenKINDMATCH
+	TokenSCOPEMATCH
+	TokenSTATEMATCH
+	TokenPRIORITY
+	TokenSUPPRESSES
+
+	// Function definition
+
+	TokenFUNC
+
+	// Boolean operators
+
+	TokenAND
+	TokenOR
+	TokenNOT
+
+	// Condition operators
+
+	TokenLIKE
+	TokenIN
+	TokenHASPREFIX
+	TokenHASSUFFIX
+	TokenNOTIN
+
+	// Constant terminals
+
+	TokenFALSE
+	TokenTRUE
+	TokenNULL
+
+	// Conditional statements
+
+	TokenIF
+	TokenELIF
+	TokenELSE
+
+	// Loop statements
+
+	TokenFOR
+	TokenBREAK
+	TokenCONTINUE
+)
+
+/*
+Available parser AST node types
+*/
+const (
+	NodeEOF = "EOF"
+
+	NodeVALUE = "value" // Simple value
+
+	// Constructed tokens
+
+	NodeSTATEMENTS = "statements" // List of statements
+	NodeLIST       = "list"       // List value
+	NodeMAP        = "map"        // Map value
+	NodeGUARD      = "guard"      // Guard expressions for conditional statements
+
+	// Map entries
+
+	NodeMAPENTRY = "entry" // Map entry value
+
+	// Boolean operators
+
+	NodeOR  = "or"
+	NodeAND = "and"
+	NodeNOT = "not"
+
+	// Condition operators
+
+	NodeLIKE       = "like"
+	NodeIN         = "in"
+	NodeBEGINSWITH = "beginswith"
+	NodeENDSWITH   = "endswith"
+	NodeNOTIN      = "notin"
+
+	NodeGEQ = ">="
+	NodeLEQ = "<="
+	NodeNEQ = "!="
+	NodeEQ  = "=="
+	NodeGT  = ">"
+	NodeLT  = "<"
+
+	// Constants
+
+	NodeTRUE  = "true"
+	NodeFALSE = "false"
+	NodeNULL  = "null"
+
+	// Arithmetic operators
+
+	NodePLUS   = "plus"
+	NodeMINUS  = "minus"
+	NodeTIMES  = "times"
+	NodeDIV    = "div"
+	NodeMODINT = "modint"
+	NodeDIVINT = "divint"
+
+	// Assignment statement
+
+	NodeASSIGN = ":="
+
+	// Function call statement
+
+	NodeFUNCCALL = "funccall"
+
+	// Data structure access
+
+	NodeACCESS = "access"
+
+	// Sink definition
+
+	NodeSINK       = "sink"
+	NodeKINDMATCH  = "kindmatch"
+	NodeSCOPEMATCH = "scopematch"
+	NodeSTATEMATCH = "statematch"
+	NodePRIORITY   = "priority"
+	NodeSUPPRESSES = "suppresses"
+
+	// Block statements
+
+	NodeCOND = "cond"
+	NodeLOOP = "loop"
+
+	// Single statements
+
+	NodeBREAK    = "break"
+	NodeCONTINUE = "continue"
+)

+ 664 - 0
lang/ecal/parser/lexer.go

@@ -0,0 +1,664 @@
+/*
+ * Public Domain Software
+ *
+ * I (Matthias Ladkau) am the author of the source code in this file.
+ * I have placed the source code in this file in the public domain.
+ *
+ * For further information see: http://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+package parser
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
+
+var namePattern = regexp.MustCompile("^[A-Za-z][A-Za-z0-9]*$")
+var numberPattern = regexp.MustCompile("^[0-9].*$")
+
+/*
+LexToken represents a token which is returned by the lexer.
+*/
+type LexToken struct {
+	ID         LexTokenID // Token kind
+	Pos        int        // Starting position (in bytes)
+	Val        string     // Token value
+	Identifier bool       // Flag if the value is an identifier (not quoted and not a number)
+	Lline      int        // Line in the input this token appears
+	Lpos       int        // Position in the input line this token appears
+}
+
+/*
+PosString returns the position of this token in the original input as a string.
+*/
+func (t LexToken) PosString() string {
+	return fmt.Sprintf("Line %v, Pos %v", t.Lline, t.Lpos)
+}
+
+/*
+String returns a string representation of a token.
+*/
+func (t LexToken) String() string {
+
+	prefix := ""
+
+	if !t.Identifier {
+		prefix = "v:" // Value is not an identifier
+	}
+
+	switch {
+
+	case t.ID == TokenEOF:
+		return "EOF"
+
+	case t.ID == TokenError:
+		return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())
+
+	case t.ID == TokenCOMMENT:
+		return fmt.Sprintf("c:'%s'", t.Val)
+
+	case t.ID > TOKENodeSYMBOLS && t.ID < TOKENodeKEYWORDS:
+		return strings.ToUpper(t.Val)
+
+	case t.ID > TOKENodeKEYWORDS:
+		return fmt.Sprintf("<%s>", strings.ToUpper(t.Val))
+
+	case len(t.Val) > 20:
+
+		// Special case for very long values
+
+		return fmt.Sprintf("%s%.10q...", prefix, t.Val)
+	}
+
+	return fmt.Sprintf("%s%q", prefix, t.Val)
+}
+
+/*
+KeywordMap is a map of keywords - these require spaces between them
+*/
+var KeywordMap = map[string]LexTokenID{
+
+	// Sink definition
+
+	"sink":       TokenSINK,
+	"kindmatch":  TokenKINDMATCH,
+	"scopematch": TokenSCOPEMATCH,
+	"statematch": TokenSTATEMATCH,
+	"priority":   TokenPRIORITY,
+	"suppresses": TokenSUPPRESSES,
+
+	// Function definition
+
+	"func": TokenFUNC,
+
+	// Boolean operators
+
+	"and": TokenAND,
+	"or":  TokenOR,
+	"not": TokenNOT,
+
+	// String operators
+
+	"like":      TokenLIKE,
+	"hasPrefix": TokenHASPREFIX,
+	"hasSuffix": TokenHASSUFFIX,
+
+	// List operators
+
+	"in":    TokenIN,
+	"notin": TokenNOTIN,
+
+	// Constant terminals
+
+	"false": TokenFALSE,
+	"true":  TokenTRUE,
+	"null":  TokenNULL,
+
+	// Conditional statements
+
+	"if":   TokenIF,
+	"elif": TokenELIF,
+	"else": TokenELSE,
+
+	// Loop statements
+
+	"for":      TokenFOR,
+	"break":    TokenBREAK,
+	"continue": TokenCONTINUE,
+}
+
+/*
+SymbolMap is a map of special symbols which will always be unique - these will separate unquoted strings.
+Symbols can be at most 2 characters long.
+*/
+var SymbolMap = map[string]LexTokenID{
+
+	// Condition operators
+
+	">=": TokenGEQ,
+	"<=": TokenLEQ,
+	"!=": TokenNEQ,
+	"==": TokenEQ,
+	">":  TokenGT,
+	"<":  TokenLT,
+
+	// Grouping symbols
+
+	"(": TokenLPAREN,
+	")": TokenRPAREN,
+	"[": TokenLBRACK,
+	"]": TokenRBRACK,
+	"{": TokenLBRACE,
+	"}": TokenRBRACE,
+
+	// Sequence symbols
+
+	".": TokenDOT,
+	",": TokenCOMMA,
+	":": TokenCOLON,
+	";": TokenSEMICOLON,
+
+	// Arithmetic operators
+
+	"+":  TokenPLUS,
+	"-":  TokenMINUS,
+	"*":  TokenTIMES,
+	"/":  TokenDIV,
+	"//": TokenDIVINT,
+	"%":  TokenMODINT,
+
+	// Assignment statement
+
+	":=": TokenASSIGN,
+}
+
+// Lexer
+// =====
+
+/*
+RuneEOF is a special rune which represents the end of the input
+*/
+const RuneEOF = -1
+
+/*
+Function which represents the current state of the lexer and returns the next state
+*/
+type lexFunc func(*lexer) lexFunc
+
+/*
+Lexer data structure
+*/
+type lexer struct {
+	name   string        // Name to identify the input
+	input  string        // Input string of the lexer
+	pos    int           // Current rune pointer
+	line   int           // Current line pointer
+	lastnl int           // Last newline position
+	width  int           // Width of last rune
+	start  int           // Start position of the current read token
+	tokens chan LexToken // Channel for lexer output
+}
+
+/*
+Lex lexes a given input. Returns a channel which contains tokens.
+*/
+func Lex(name string, input string) chan LexToken {
+	l := &lexer{name, input, 0, 0, 0, 0, 0, make(chan LexToken)}
+	go l.run()
+	return l.tokens
+}
+
+/*
+LexToList lexes a given input. Returns a list of tokens.
+*/
+func LexToList(name string, input string) []LexToken {
+	var tokens []LexToken
+
+	for t := range Lex(name, input) {
+		tokens = append(tokens, t)
+	}
+
+	return tokens
+}
+
+/*
+Main loop of the lexer.
+*/
+func (l *lexer) run() {
+
+	if skipWhiteSpace(l) {
+		for state := lexToken; state != nil; {
+			state = state(l)
+
+			if !skipWhiteSpace(l) {
+				break
+			}
+		}
+	}
+
+	close(l.tokens)
+}
+
+/*
+next returns the next rune in the input and advances the current rune pointer
+if peek is 0. If peek is >0 then the nth character is returned without advancing
+the rune pointer.
+*/
+func (l *lexer) next(peek int) rune {
+
+	// Check if we reached the end
+
+	if l.pos >= len(l.input) {
+		return RuneEOF
+	}
+
+	// Decode the next rune
+
+	pos := l.pos
+	if peek > 0 {
+		pos += peek - 1
+	}
+
+	r, w := utf8.DecodeRuneInString(l.input[pos:])
+
+	if peek == 0 {
+		l.width = w
+		l.pos += l.width
+	}
+
+	return r
+}
+
+/*
+backup sets the pointer back by the given width. A width of 0 steps back by the width of the last read rune. Can only be called once per next call.
+*/
+func (l *lexer) backup(width int) {
+	if width == 0 {
+		width = l.width
+	}
+	l.pos -= width
+}
+
+/*
+startNew starts a new token.
+*/
+func (l *lexer) startNew() {
+	l.start = l.pos
+}
+
+/*
+emitToken passes a token back to the client.
+*/
+func (l *lexer) emitToken(t LexTokenID) {
+	if t == TokenEOF {
+		l.emitTokenAndValue(t, "", false)
+		return
+	}
+
+	if l.tokens != nil {
+		l.tokens <- LexToken{t, l.start, l.input[l.start:l.pos], false,
+			l.line + 1, l.start - l.lastnl + 1}
+	}
+}
+
+/*
+emitTokenAndValue passes a token with a given value back to the client.
+*/
+func (l *lexer) emitTokenAndValue(t LexTokenID, val string, identifier bool) {
+	if l.tokens != nil {
+		l.tokens <- LexToken{t, l.start, val, identifier, l.line + 1, l.start - l.lastnl + 1}
+	}
+}
+
+/*
+emitError passes an error token back to the client.
+*/
+func (l *lexer) emitError(msg string) {
+	if l.tokens != nil {
+		l.tokens <- LexToken{TokenError, l.start, msg, false, l.line + 1, l.start - l.lastnl + 1}
+	}
+}
+
+// Helper functions
+// ================
+
+/*
+skipWhiteSpace skips any number of whitespace characters. Returns false if the lexer
+reaches EOF while skipping whitespaces.
+*/
+func skipWhiteSpace(l *lexer) bool {
+	r := l.next(0)
+
+	for unicode.IsSpace(r) || unicode.IsControl(r) || r == RuneEOF {
+		if r == '\n' {
+			l.line++
+			l.lastnl = l.pos
+		}
+		r = l.next(0)
+
+		if r == RuneEOF {
+			l.emitToken(TokenEOF)
+			return false
+		}
+	}
+
+	l.backup(0)
+	return true
+}
+
+/*
+lexTextBlock lexes a block of text without whitespaces. Optionally interprets
+all one or two character symbol tokens.
+*/
+func lexTextBlock(l *lexer, interpretToken bool) {
+
+	r := l.next(0)
+
+	if interpretToken {
+
+		// Check if we start with a known symbol
+
+		nr := l.next(1)
+		if _, ok := SymbolMap[strings.ToLower(string(r)+string(nr))]; ok {
+			l.next(0)
+			return
+		}
+
+		if _, ok := SymbolMap[strings.ToLower(string(r))]; ok {
+			return
+		}
+	}
+
+	for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {
+
+		if interpretToken {
+
+			// Check if we find a token in the block
+
+			if _, ok := SymbolMap[strings.ToLower(string(r))]; ok {
+				l.backup(0)
+				return
+			}
+
+			nr := l.next(1)
+			if _, ok := SymbolMap[strings.ToLower(string(r)+string(nr))]; ok {
+				l.backup(0)
+				return
+			}
+		}
+
+		r = l.next(0)
+	}
+
+	if r != RuneEOF {
+		l.backup(0)
+	}
+}
+
+/*
+lexNumberBlock lexes a block potentially containing a number.
+*/
+func lexNumberBlock(l *lexer) {
+
+	r := l.next(0)
+
+	for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {
+
+		if !unicode.IsNumber(r) && r != '.' {
+			if r == 'e' {
+
+				l1 := l.next(1)
+				l2 := l.next(2)
+				if l1 != '+' || !unicode.IsNumber(l2) {
+					break
+				}
+				l.next(0)
+				l.next(0)
+			} else {
+				break
+			}
+		}
+		r = l.next(0)
+	}
+
+	if r != RuneEOF {
+		l.backup(0)
+	}
+}
+
+// State functions
+// ===============
+
+/*
+lexToken is the main entry function for the lexer.
+*/
+func lexToken(l *lexer) lexFunc {
+
+	// Check if we got a quoted value or a comment
+
+	n1 := l.next(1)
+	n2 := l.next(2)
+
+	// Parse comments
+
+	if (n1 == '/' && n2 == '*') || n1 == '#' {
+		return lexComment
+	}
+
+	// Parse strings
+
+	if (n1 == '"' || n1 == '\'') || (n1 == 'r' && (n2 == '"' || n2 == '\'')) {
+		return lexValue
+	}
+
+	// Lex a block of text and emit any found tokens
+
+	l.startNew()
+
+	// First try to parse a number
+
+	lexNumberBlock(l)
+	keywordCandidate := strings.ToLower(l.input[l.start:l.pos])
+
+	// Check for number
+
+	if numberPattern.MatchString(keywordCandidate) {
+		_, err := strconv.ParseFloat(keywordCandidate, 64)
+
+		if err == nil {
+			l.emitTokenAndValue(TokenNUMBER, keywordCandidate, false)
+			return lexToken
+		}
+	}
+
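+	// The block is not a number - backtrack and lex it again as a text block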
+	if len(keywordCandidate) > 0 {
+		l.backup(l.pos - l.start)
+	}
+	lexTextBlock(l, true)
+	keywordCandidate = strings.ToLower(l.input[l.start:l.pos])
+
+	// Check for keyword
+
+	token, ok := KeywordMap[keywordCandidate]
+
+	if !ok {
+
+		// Check for symbol
+
+		token, ok = SymbolMap[keywordCandidate]
+	}
+
+	if ok {
+
+		// A known token was found
+
+		l.emitToken(token)
+
+	} else {
+
+		if !namePattern.MatchString(keywordCandidate) {
+			l.emitError(fmt.Sprintf("Cannot parse identifier '%v'. Identifiers may only contain [a-zA-Z] and [a-zA-Z0-9] from the second character", keywordCandidate))
+			return nil
+		}
+
+		// An identifier was found
+
+		l.emitTokenAndValue(TokenIDENTIFIER, keywordCandidate, true)
+	}
+
+	return lexToken
+}
+
+/*
+lexValue lexes a string value.
+
+Values can be declared in different ways:
+
+' ... ' or " ... "
+Characters are parsed between quotes (escape sequences are interpreted)
+
+r' ... ' or r" ... "
+Characters are parsed plain between quotes (no escape sequences are interpreted)
+*/
+func lexValue(l *lexer) lexFunc {
+	var endToken rune
+
+	l.startNew()
+
+	allowEscapes := false
+
+	r := l.next(0)
+
+	// Check if we have a raw quoted string
+
+	if q := l.next(1); r == 'r' && (q == '"' || q == '\'') {
+		endToken = q
+		l.next(0)
+	} else {
+		allowEscapes = true
+		endToken = r
+	}
+
+	r = l.next(0)
+	rprev := ' '
+	lLine := l.line
+	lLastnl := l.lastnl
+
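+	// Consume runes until the end token is found; with escapes enabled an
+	// end token which is preceded by a backslash does not terminate the string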
+	for (!allowEscapes && r != endToken) ||
+		(allowEscapes && (r != endToken || rprev == '\\')) {
+
+		if r == '\n' {
+			lLine++
+			lLastnl = l.pos
+		}
+		rprev = r
+		r = l.next(0)
+
+		if r == RuneEOF {
+			l.emitError("Unexpected end while reading string value (unclosed quotes)")
+			return nil
+		}
+	}
+
+	if allowEscapes {
+		val := l.input[l.start+1 : l.pos-1]
+
+		// Interpret escape sequences right away
+
+		if endToken == '\'' {
+
+			// Escape double quotes in a single quoted string
+
+			val = strings.Replace(val, "\"", "\\\"", -1)
+		}
+
+		s, err := strconv.Unquote("\"" + val + "\"")
+		if err != nil {
+			l.emitError(err.Error() + " while parsing string")
+			return nil
+		}
+
+		l.emitTokenAndValue(TokenSTRING, s, true)
+
+	} else {
+		l.emitTokenAndValue(TokenSTRING, l.input[l.start+2:l.pos-1], true)
+	}
+
+	// Update the line pointers to the values tracked while reading the string
+
+	l.line = lLine
+	l.lastnl = lLastnl
+
+	return lexToken
+}
+
+/*
+lexComment lexes comments.
+*/
+func lexComment(l *lexer) lexFunc {
+
+	// Consume initial /*
+
+	r := l.next(0)
+
+	if r == '#' {
+
+		l.startNew()
+
+		for r != '\n' && r != RuneEOF {
+			r = l.next(0)
+		}
+
+		if r == RuneEOF {
+
+			// The comment reaches until the end of the input - there is no newline to exclude
+
+			l.emitTokenAndValue(TokenCOMMENT, l.input[l.start:l.pos], false)
+			return nil
+		}
+
+		l.emitTokenAndValue(TokenCOMMENT, l.input[l.start:l.pos-1], false)
+
+		l.line++
+
+	} else {
+
+		l.next(0)
+
+		lLine := l.line
+		lLastnl := l.lastnl
+
+		l.startNew()
+
+		r = l.next(0)
+
+		for !(r == '*' && l.next(1) == '/') {
+
+			if r == '\n' {
+				lLine++
+				lLastnl = l.pos
+			}
+			r = l.next(0)
+
+			if r == RuneEOF {
+				l.emitError("Unexpected end while reading comment")
+				return nil
+			}
+		}
+
+		l.emitTokenAndValue(TokenCOMMENT, l.input[l.start:l.pos-1], false)
+
+		// Consume final /
+
+		l.next(0)
+
+		// Update the line pointers to the values tracked while reading the comment
+
+		l.line = lLine
+		l.lastnl = lLastnl
+
+	}
+
+	return lexToken
+}

+ 271 - 0
lang/ecal/parser/lexer_test.go

@@ -0,0 +1,271 @@
+/*
+ * Public Domain Software
+ *
+ * I (Matthias Ladkau) am the author of the source code in this file.
+ * I have placed the source code in this file in the public domain.
+ *
+ * For further information see: http://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+package parser
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestNextItem(t *testing.T) {
+
+	l := &lexer{"Test", "1234", 0, 0, 0, 0, 0, make(chan LexToken)}
+
+	if r := l.next(1); r != '1' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(0); r != '1' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(0); r != '2' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(1); r != '3' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(2); r != '4' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(0); r != '3' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(0); r != '4' {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+
+	if r := l.next(0); r != RuneEOF {
+		t.Errorf("Unexpected token: %q", r)
+		return
+	}
+}
+
+func TestBasicTokenLexing(t *testing.T) {
+
+	// Test empty string parsing
+
+	if res := fmt.Sprint(LexToList("mytest", "    \t   ")); res != "[EOF]" {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+
+	// Test arithmetics
+
+	input := `name := a + 1 and (ver+x!=1) * 5 > name2`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`["name" := "a" + v:"1" <AND> ( "ver" + "x" != v:"1" ) * v:"5" > "name2" EOF]` {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+
+	input = `test := not a * 1.3 or (12 / aa) * 5 DIV 3 % 1 > true`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`["test" := <NOT> "a" * v:"1.3" <OR> ( v:"12" / "aa" ) * v:"5" "div" v:"3" % v:"1" > <TRUE> EOF]` {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+
+	input = `-1.234560e+02+5+2.123 // 1`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[- v:"1.234560e+02" + v:"5" + v:"2.123" // v:"1" EOF]` {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+
+	// Test invalid identifier
+
+	input = `5test`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[v:"5" "test" EOF]` {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+
+	input = `@test`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[Error: Cannot parse identifier '@test'. Identifiers may only contain [a-zA-Z] and [a-zA-Z0-9] from the second character (Line 1, Pos 1) EOF]` {
+		t.Error("Unexpected lexer result:\n  ", res)
+		return
+	}
+}
+
+func TestAssignmentLexing(t *testing.T) {
+
+	input := `name := a + 1`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`["name" := "a" + v:"1" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name := a.a + a.b`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`["name" := "a" . "a" + "a" . "b" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name:=a[1] + b["d"] + c[a]`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`["name" := "a" [ v:"1" ] + "b" [ "d" ] + "c" [ "a" ] EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+}
+
+func TestBlockLexing(t *testing.T) {
+
+	input := `
+if a == 1 {
+    print("xxx")
+} elif b > 2 {
+    print("yyy")
+} else {
+    print("zzz")
+}
+`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[<IF> "a" == v:"1" { "print" ( "xxx" ) } <ELIF> "b" > v:"2" { "print" ( "yyy" ) } <ELSE> { "print" ( "zzz" ) } EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `
+for a, b in enum(blist) {
+    do(a)
+}
+`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[<FOR> "a" , "b" <IN> "enum" ( "blist" ) { "do" ( "a" ) } EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `
+for true {
+	x := "1"
+	break; continue
+}
+`
+	if res := LexToList("mytest", input); fmt.Sprint(res) !=
+		`[<FOR> <TRUE> { "x" := "1" <BREAK> ; <CONTINUE> } EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+}
+
+func TestStringLexing(t *testing.T) {
+
+	// Test unclosed quotes
+
+	input := `name "test  bla`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" Error: Unexpected end while reading string value (unclosed quotes) (Line 1, Pos 6) EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name "test"  'bla'`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" "test" "bla" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name "te
+	st"  'bla'`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" Error: invalid syntax while parsing string (Line 1, Pos 6)]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name r"te
+	st"  'bla'`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" "te\n\tst" "bla" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	// Parsing with escape sequences
+
+	input = `"test\n\ttest"  '\nfoo\u0028bar' "test{foo}.5w3f"`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["test\n\ttest" "\nfoo(bar" "test{foo}.5w3f" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+}
+
+func TestCommentLexing(t *testing.T) {
+
+	input := `name /* foo
+		bar
+	x*/ 'b/* - */la' /*test*/`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" c:' foo
+		bar
+	x' "b/* - */la" c:'test' EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `name /* foo
+		bar`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["name" Error: Unexpected end while reading comment (Line 1, Pos 8) EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `foo
+   1+ 2 # Some comment
+bar`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `["foo" v:"1" + v:"2" c:' Some comment' "bar" EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+	input = `1+ 2 # Some comment`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `[v:"1" + v:"2" c:' Some comment' EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+
+}
+
+func TestSinkLexing(t *testing.T) {
+
+	input := `sink "mysink"
+r"
+A comment describing the sink.
+"
+kindmatch [ foo.bar.* ],
+scopematch [ "data.read", "data.write" ],
+statematch { a : 1, b : NULL },
+priority 0,
+suppresses [ "myothersink" ]
+{
+  a := 1
+}`
+	if res := LexToList("mytest", input); fmt.Sprint(res) != `[<SINK> "mysink" "\nA comment"... <KINDMATCH> `+
+		`[ "foo" . "bar" . * ] , <SCOPEMATCH> [ "data.read" , "data.write" ] , <STATEMATCH> `+
+		`{ "a" : v:"1" , "b" : <NULL> } , <PRIORITY> v:"0" , <SUPPRESSES> [ "myothersink" ] `+
+		`{ "a" := v:"1" } EOF]` {
+		t.Error("Unexpected lexer result:", res)
+		return
+	}
+}