lexer.go

/*
 * ECAL
 *
 * Copyright 2020 Matthias Ladkau. All rights reserved.
 *
 * This Source Code Form is subject to the terms of the MIT
 * License. If a copy of the MIT License was not distributed with this
 * file, You can obtain one at https://opensource.org/licenses/MIT.
 */

package parser

import (
    "bytes"
    "encoding/json"
    "fmt"
    "regexp"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"
)

var namePattern = regexp.MustCompile("^[A-Za-z][A-Za-z0-9]*$")
var numberPattern = regexp.MustCompile("^[0-9].*$")

/*
LexToken represents a token which is returned by the lexer.
*/
type LexToken struct {
    ID           LexTokenID // Token kind
    Pos          int        // Starting position (in bytes)
    Val          string     // Token value
    Identifier   bool       // Flag if the value is an identifier (not quoted and not a number)
    AllowEscapes bool       // Flag if the value interpreted escape characters
    Lline        int        // Line in the input this token appears
    Lpos         int        // Position in the input line this token appears
}

/*
NewLexTokenInstance creates a new LexToken object instance from given LexToken values.
*/
func NewLexTokenInstance(t LexToken) *LexToken {
    return &LexToken{
        t.ID,
        t.Pos,
        t.Val,
        t.Identifier,
        t.AllowEscapes,
        t.Lline,
        t.Lpos,
    }
}

/*
Equals checks if this LexToken equals another LexToken. Also returns a message
describing any difference that was found.
*/
func (t LexToken) Equals(other LexToken, ignorePosition bool) (bool, string) {
    var res = true
    var msg = ""

    if t.ID != other.ID {
        res = false
        msg += fmt.Sprintf("ID is different %v vs %v\n", t.ID, other.ID)
    }

    if !ignorePosition && t.Pos != other.Pos {
        res = false
        msg += fmt.Sprintf("Pos is different %v vs %v\n", t.Pos, other.Pos)
    }

    if t.Val != other.Val {
        res = false
        msg += fmt.Sprintf("Val is different %v vs %v\n", t.Val, other.Val)
    }

    if t.Identifier != other.Identifier {
        res = false
        msg += fmt.Sprintf("Identifier is different %v vs %v\n", t.Identifier, other.Identifier)
    }

    if !ignorePosition && t.Lline != other.Lline {
        res = false
        msg += fmt.Sprintf("Lline is different %v vs %v\n", t.Lline, other.Lline)
    }

    if !ignorePosition && t.Lpos != other.Lpos {
        res = false
        msg += fmt.Sprintf("Lpos is different %v vs %v\n", t.Lpos, other.Lpos)
    }

    if msg != "" {
        var buf bytes.Buffer
        out, _ := json.MarshalIndent(t, "", " ")
        buf.WriteString(string(out))
        buf.WriteString("\nvs\n")
        out, _ = json.MarshalIndent(other, "", " ")
        buf.WriteString(string(out))
        msg = fmt.Sprintf("%v%v", msg, buf.String())
    }

    return res, msg
}
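/*
Illustrative sketch of using Equals (hypothetical tokens; assumes the
LexTokenID constants defined elsewhere in this package):

    t1 := LexToList("test", "foo")[0]
    t2 := LexToList("test", "   foo")[0]
    ok, msg := t1.Equals(t2, true) // ok is true: same ID and value, position ignored
    _, _ = ok, msg                 // differing fields would be listed in msg,
                                   // followed by both tokens rendered as JSON
*/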
/*
PosString returns the position of this token in the original input as a string.
*/
func (t LexToken) PosString() string {
    return fmt.Sprintf("Line %v, Pos %v", t.Lline, t.Lpos)
}

/*
String returns a string representation of a token.
*/
func (t LexToken) String() string {
    prefix := ""

    if !t.Identifier {
        prefix = "v:" // Value is not an identifier
    }

    switch {
    case t.ID == TokenEOF:
        return "EOF"

    case t.ID == TokenError:
        return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())

    case t.ID == TokenPRECOMMENT:
        return fmt.Sprintf("/* %s */", t.Val)

    case t.ID == TokenPOSTCOMMENT:
        return fmt.Sprintf("# %s", t.Val)

    case t.ID > TOKENodeSYMBOLS && t.ID < TOKENodeKEYWORDS:
        return fmt.Sprintf("%s", strings.ToUpper(t.Val))

    case t.ID > TOKENodeKEYWORDS:
        return fmt.Sprintf("<%s>", strings.ToUpper(t.Val))

    case len(t.Val) > 20:
        // Special case for very long values
        return fmt.Sprintf("%s%.10q...", prefix, t.Val)
    }

    return fmt.Sprintf("%s%q", prefix, t.Val)
}

// Meta data interface

/*
Type returns the meta data type.
*/
func (t LexToken) Type() string {
    if t.ID == TokenPRECOMMENT {
        return MetaDataPreComment
    } else if t.ID == TokenPOSTCOMMENT {
        return MetaDataPostComment
    }
    return MetaDataGeneral
}

/*
Value returns the meta data value.
*/
func (t LexToken) Value() string {
    return t.Val
}

/*
KeywordMap is a map of keywords - these require spaces between them.
*/
var KeywordMap = map[string]LexTokenID{

    // Import statement
    "import": TokenIMPORT,
    "as":     TokenAS,

    // Sink definition
    "sink":       TokenSINK,
    "kindmatch":  TokenKINDMATCH,
    "scopematch": TokenSCOPEMATCH,
    "statematch": TokenSTATEMATCH,
    "priority":   TokenPRIORITY,
    "suppresses": TokenSUPPRESSES,

    // Function definition
    "func":   TokenFUNC,
    "return": TokenRETURN,

    // Boolean operators
    "and": TokenAND,
    "or":  TokenOR,
    "not": TokenNOT,

    // String operators
    "like":      TokenLIKE,
    "hasprefix": TokenHASPREFIX,
    "hassuffix": TokenHASSUFFIX,

    // List operators
    "in":    TokenIN,
    "notin": TokenNOTIN,

    // Constant terminals
    "false": TokenFALSE,
    "true":  TokenTRUE,
    "null":  TokenNULL,

    // Conditional statements
    "if":   TokenIF,
    "elif": TokenELIF,
    "else": TokenELSE,

    // Loop statements
    "for":      TokenFOR,
    "break":    TokenBREAK,
    "continue": TokenCONTINUE,

    // Try block
    "try":     TokenTRY,
    "except":  TokenEXCEPT,
    "finally": TokenFINALLY,
}

/*
SymbolMap is a map of special symbols which will always be unique - these will
separate unquoted strings. Symbols can be at most 2 characters long.
*/
var SymbolMap = map[string]LexTokenID{

    // Condition operators
    ">=": TokenGEQ,
    "<=": TokenLEQ,
    "!=": TokenNEQ,
    "==": TokenEQ,
    ">":  TokenGT,
    "<":  TokenLT,

    // Grouping symbols
    "(": TokenLPAREN,
    ")": TokenRPAREN,
    "[": TokenLBRACK,
    "]": TokenRBRACK,
    "{": TokenLBRACE,
    "}": TokenRBRACE,

    // Separators
    ".": TokenDOT,
    ",": TokenCOMMA,
    ";": TokenSEMICOLON,

    // Grouping
    ":": TokenCOLON,
    "=": TokenEQUAL,

    // Arithmetic operators
    "+":  TokenPLUS,
    "-":  TokenMINUS,
    "*":  TokenTIMES,
    "/":  TokenDIV,
    "//": TokenDIVINT,
    "%":  TokenMODINT,

    // Assignment statement
    ":=": TokenASSIGN,
}
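// Note on matching: when a text block starts with a symbol, lexTextBlock below
// tries the two-character lookup before the one-character one, so for example
// "//" lexes as a single TokenDIVINT rather than two TokenDIV tokens, and ":="
// as TokenASSIGN rather than TokenCOLON followed by TokenEQUAL.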
// Lexer
// =====

/*
RuneEOF is a special rune which represents the end of the input.
*/
const RuneEOF = -1

/*
lexFunc is a function which represents the current state of the lexer and
returns the next state.
*/
type lexFunc func(*lexer) lexFunc

/*
lexer data structure.
*/
type lexer struct {
    name   string        // Name to identify the input
    input  string        // Input string of the lexer
    pos    int           // Current rune pointer
    line   int           // Current line pointer
    lastnl int           // Last newline position
    width  int           // Width of last rune
    start  int           // Start position of the currently read token
    tokens chan LexToken // Channel for lexer output
}

/*
Lex lexes a given input. Returns a channel which contains tokens.
*/
func Lex(name string, input string) chan LexToken {
    l := &lexer{name, input, 0, 0, 0, 0, 0, make(chan LexToken)}
    go l.run()
    return l.tokens
}

/*
LexToList lexes a given input. Returns a list of tokens.
*/
func LexToList(name string, input string) []LexToken {
    var tokens []LexToken
    for t := range Lex(name, input) {
        tokens = append(tokens, t)
    }
    return tokens
}
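/*
Example usage (an illustrative sketch; the exact token output depends on the
LexTokenID constants defined elsewhere in this package):

    // Prints one line per token, roughly: "a", :=, v:"1", +, v:"2", EOF
    for _, t := range LexToList("mysource", "a := 1 + 2") {
        fmt.Println(t.PosString(), t.String())
    }

Lex runs the lexer in its own goroutine, so tokens can be consumed from the
channel while lexing is still in progress; LexToList simply drains that
channel into a slice.
*/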
/*
Main loop of the lexer.
*/
func (l *lexer) run() {

    if skipWhiteSpace(l) {
        for state := lexToken; state != nil; {
            state = state(l)

            if !skipWhiteSpace(l) {
                break
            }
        }
    }

    close(l.tokens)
}

/*
next returns the next rune in the input and advances the current rune pointer
if peek is 0. If peek is >0 then the nth character is returned without advancing
the rune pointer.
*/
func (l *lexer) next(peek int) rune {

    // Check if we reached the end
    if int(l.pos) >= len(l.input) {
        return RuneEOF
    }

    // Decode the next rune
    pos := l.pos
    if peek > 0 {
        pos += peek - 1
    }

    r, w := utf8.DecodeRuneInString(l.input[pos:])

    if peek == 0 {
        l.width = w
        l.pos += l.width
    }

    return r
}

/*
backup sets the pointer one rune back. Can only be called once per next call.
*/
func (l *lexer) backup(width int) {
    if width == 0 {
        width = l.width
    }
    l.pos -= width
}
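/*
Sketch of the next/backup contract on a hypothetical input "ab":

    r1 := l.next(1) // peek=1: returns 'a' without moving the pointer
    r2 := l.next(2) // peek=2: returns 'b' (peeking offsets by bytes, so it
                    // assumes the preceding runes are single-byte)
    r3 := l.next(0) // consume: returns 'a' and advances the pointer
    l.backup(0)     // steps back by the width of the last consumed rune
    _, _, _ = r1, r2, r3
*/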
/*
startNew starts a new token.
*/
func (l *lexer) startNew() {
    l.start = l.pos
}

/*
emitToken passes a token back to the client.
*/
func (l *lexer) emitToken(t LexTokenID) {
    if t == TokenEOF {
        l.emitTokenAndValue(t, "", false, false)
        return
    }

    if l.tokens != nil {
        l.tokens <- LexToken{t, l.start, l.input[l.start:l.pos], false, false,
            l.line + 1, l.start - l.lastnl + 1}
    }
}

/*
emitTokenAndValue passes a token with a given value back to the client.
*/
func (l *lexer) emitTokenAndValue(t LexTokenID, val string, identifier bool, allowEscapes bool) {
    if l.tokens != nil {
        l.tokens <- LexToken{t, l.start, val, identifier, allowEscapes, l.line + 1, l.start - l.lastnl + 1}
    }
}

/*
emitError passes an error token back to the client.
*/
func (l *lexer) emitError(msg string) {
    if l.tokens != nil {
        l.tokens <- LexToken{TokenError, l.start, msg, false, false, l.line + 1, l.start - l.lastnl + 1}
    }
}

// Helper functions
// ================

/*
skipWhiteSpace skips any number of whitespace characters. Returns false if the
lexer reaches EOF while skipping whitespace.
*/
func skipWhiteSpace(l *lexer) bool {
    r := l.next(0)

    for unicode.IsSpace(r) || unicode.IsControl(r) || r == RuneEOF {
        if r == '\n' {
            l.line++
            l.lastnl = l.pos
        }

        r = l.next(0)

        if r == RuneEOF {
            l.emitToken(TokenEOF)
            return false
        }
    }

    l.backup(0)
    return true
}

/*
lexTextBlock lexes a block of text without whitespace. Optionally interprets
all one or two character tokens.
*/
func lexTextBlock(l *lexer, interpretToken bool) {

    r := l.next(0)

    if interpretToken {

        // Check if we start with a known symbol
        nr := l.next(1)
        if _, ok := SymbolMap[strings.ToLower(string(r)+string(nr))]; ok {
            l.next(0)
            return
        }

        if _, ok := SymbolMap[strings.ToLower(string(r))]; ok {
            return
        }
    }

    for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {

        if interpretToken {

            // Check if we find a token in the block
            if _, ok := SymbolMap[strings.ToLower(string(r))]; ok {
                l.backup(0)
                return
            }

            nr := l.next(1)
            if _, ok := SymbolMap[strings.ToLower(string(r)+string(nr))]; ok {
                l.backup(0)
                return
            }
        }

        r = l.next(0)
    }

    if r != RuneEOF {
        l.backup(0)
    }
}

/*
lexNumberBlock lexes a block potentially containing a number.
*/
func lexNumberBlock(l *lexer) {

    r := l.next(0)

    for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {

        if !unicode.IsNumber(r) && r != '.' {
            if r == 'e' {

                l1 := l.next(1)
                l2 := l.next(2)
                if l1 != '+' || !unicode.IsNumber(l2) {
                    break
                }

                l.next(0)
                l.next(0)
            } else {
                break
            }
        }
        r = l.next(0)
    }

    if r != RuneEOF {
        l.backup(0)
    }
}
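/*
Illustrative examples of what lexNumberBlock consumes (whether the block is
really a number is decided later via ParseFloat in lexToken):

    "42"      -> consumes "42"
    "1.5e+10" -> consumes "1.5e+10" ('e' only passes when '+' and a digit follow)
    "1e-5"    -> consumes just "1"; "e", "-" and "5" are then lexed as separate tokens
*/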
// State functions
// ===============

/*
lexToken is the main entry function for the lexer.
*/
func lexToken(l *lexer) lexFunc {

    // Check if we got a quoted value or a comment
    n1 := l.next(1)
    n2 := l.next(2)

    // Parse comments
    if (n1 == '/' && n2 == '*') || n1 == '#' {
        return lexComment
    }

    // Parse strings
    if (n1 == '"' || n1 == '\'') || (n1 == 'r' && (n2 == '"' || n2 == '\'')) {
        return lexValue
    }

    // Lex a block of text and emit any found tokens
    l.startNew()

    // First try to parse a number
    lexNumberBlock(l)
    identifierCandidate := l.input[l.start:l.pos]
    keywordCandidate := strings.ToLower(identifierCandidate)

    // Check for number
    if numberPattern.MatchString(keywordCandidate) {
        _, err := strconv.ParseFloat(keywordCandidate, 64)

        if err == nil {
            l.emitTokenAndValue(TokenNUMBER, keywordCandidate, false, false)
            return lexToken
        }
    }

    if len(keywordCandidate) > 0 {
        l.backup(l.pos - l.start)
    }

    lexTextBlock(l, true)
    identifierCandidate = l.input[l.start:l.pos]
    keywordCandidate = strings.ToLower(identifierCandidate)

    // Check for keyword
    token, ok := KeywordMap[keywordCandidate]

    if !ok {
        // Check for symbol
        token, ok = SymbolMap[keywordCandidate]
    }

    if ok {
        // A known token was found
        l.emitToken(token)
    } else {
        if !namePattern.MatchString(keywordCandidate) {
            l.emitError(fmt.Sprintf("Cannot parse identifier '%v'. Identifiers must start with [a-zA-Z] and may only contain [a-zA-Z0-9] from the second character", keywordCandidate))
            return nil
        }

        // An identifier was found
        l.emitTokenAndValue(TokenIDENTIFIER, identifierCandidate, true, false)
    }

    return lexToken
}

/*
lexValue lexes a string value.

Values can be declared in different ways:

' ... ' or " ... "
Characters are parsed between quotes (escape sequences are interpreted)

r' ... ' or r" ... "
Characters are parsed plain between quotes
*/
func lexValue(l *lexer) lexFunc {
    var endToken rune

    l.startNew()
    allowEscapes := false

    r := l.next(0)

    // Check if we have a raw quoted string
    if q := l.next(1); r == 'r' && (q == '"' || q == '\'') {
        endToken = q
        l.next(0)
    } else {
        allowEscapes = true
        endToken = r
    }

    r = l.next(0)
    rprev := ' '
    lLine := l.line
    lLastnl := l.lastnl

    for (!allowEscapes && r != endToken) ||
        (allowEscapes && (r != endToken || rprev == '\\')) {

        if r == '\n' {
            lLine++
            lLastnl = l.pos
        }
        rprev = r
        r = l.next(0)

        if r == RuneEOF {
            l.emitError("Unexpected end while reading string value (unclosed quotes)")
            return nil
        }
    }

    if allowEscapes {
        val := l.input[l.start+1 : l.pos-1]

        // Interpret escape sequences right away
        if endToken == '\'' {
            // Escape double quotes in a single quoted string
            val = strings.Replace(val, "\"", "\\\"", -1)
        }

        s, err := strconv.Unquote("\"" + val + "\"")
        if err != nil {
            l.emitError(err.Error() + " while parsing string")
            return nil
        }

        l.emitTokenAndValue(TokenSTRING, s, true, true)
    } else {
        l.emitTokenAndValue(TokenSTRING, l.input[l.start+2:l.pos-1], true, false)
    }

    // Set newline
    l.line = lLine
    l.lastnl = lLastnl

    return lexToken
}
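/*
Illustrative string forms (a sketch of the cases handled above):

    "a\nb"     -> TokenSTRING "a<newline>b" (escape sequences interpreted via Unquote)
    r"a\nb"    -> TokenSTRING with the four literal characters a \ n b (raw, no escapes)
    'say "hi"' -> inner double quotes are escaped before Unquote so they survive as-is
*/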
/*
lexComment lexes comments.
*/
func lexComment(l *lexer) lexFunc {

    // Consume initial /*
    r := l.next(0)

    if r == '#' {

        l.startNew()

        for r != '\n' && r != RuneEOF {
            r = l.next(0)
        }

        l.emitTokenAndValue(TokenPOSTCOMMENT, l.input[l.start:l.pos], false, false)

        if r == RuneEOF {
            return nil
        }

        l.line++

    } else {

        l.next(0)

        lLine := l.line
        lLastnl := l.lastnl

        l.startNew()
        r = l.next(0)

        // Read until the closing */ is found - the comment body may contain
        // lone '*' and '/' characters, so only the two together terminate it
        for r != '*' || l.next(1) != '/' {

            if r == '\n' {
                lLine++
                lLastnl = l.pos
            }

            r = l.next(0)

            if r == RuneEOF {
                l.emitError("Unexpected end while reading comment")
                return nil
            }
        }

        l.emitTokenAndValue(TokenPRECOMMENT, l.input[l.start:l.pos-1], false, false)

        // Consume final /
        l.next(0)

        // Set newline
        l.line = lLine
        l.lastnl = lLastnl
    }

    return lexToken
}
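// Illustrative comment forms (sketch): a block comment such as "/* note */" is
// emitted as TokenPRECOMMENT with value " note " (the delimiters are stripped),
// while a line comment such as "# note" is emitted as TokenPOSTCOMMENT with
// everything after the '#' up to and including the newline.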