lexer.go

/*
 * EliasDB
 *
 * Copyright 2016 Matthias Ladkau. All rights reserved.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

package parser

import (
	"fmt"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"

	"devt.de/krotik/common/stringutil"
)

/*
LexToken represents a token which is returned by the lexer.
*/
type LexToken struct {
	ID    LexTokenID // Token kind
	Pos   int        // Starting position (in runes)
	Val   string     // Token value
	Lline int        // Line in the input where this token appears
	Lpos  int        // Position in the input line where this token appears
}

/*
PosString returns the position of this token in the original input as a string.
*/
func (t LexToken) PosString() string {
	return fmt.Sprintf("Line %v, Pos %v", t.Lline, t.Lpos)
}

/*
String returns a string representation of a token.
*/
func (t LexToken) String() string {
	switch {
	case t.ID == TokenEOF:
		return "EOF"
	case t.ID == TokenError:
		return fmt.Sprintf("Error: %s (%s)", t.Val, t.PosString())
	case t.ID > TOKENodeSYMBOLS && t.ID < TOKENodeKEYWORDS:
		return strings.ToUpper(t.Val)
	case t.ID > TOKENodeKEYWORDS:
		return fmt.Sprintf("<%s>", strings.ToUpper(t.Val))
	case len(t.Val) > 10:
		// Special case for very long values
		return fmt.Sprintf("%.10q...", t.Val)
	}
	return fmt.Sprintf("%q", t.Val)
}

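// Illustrative String() outputs (the relative ordering of the token ID
// ranges is assumed from the switch above, the IDs themselves are defined
// elsewhere): a keyword token such as "get" prints as <GET>, a symbol
// token such as ">=" prints as >=, a short value prints quoted as in
// "name", and a value longer than ten characters is truncated, e.g.
// "verylongva"...
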
/*
Map of keywords - these require spaces between them
*/
var keywordMap = map[string]LexTokenID{
	"get":           TokenGET,
	"lookup":        TokenLOOKUP,
	"from":          TokenFROM,
	"group":         TokenGROUP,
	"with":          TokenWITH,
	"filtering":     TokenFILTERING,
	"ordering":      TokenORDERING,
	"nulltraversal": TokenNULLTRAVERSAL,
	"where":         TokenWHERE,
	"traverse":      TokenTRAVERSE,
	"end":           TokenEND,
	"primary":       TokenPRIMARY,
	"show":          TokenSHOW,
	"as":            TokenAS,
	"format":        TokenFORMAT,
	"and":           TokenAND,
	"or":            TokenOR,
	"like":          TokenLIKE,
	"in":            TokenIN,
	"contains":      TokenCONTAINS,
	"beginswith":    TokenBEGINSWITH,
	"endswith":      TokenENDSWITH,
	"containsnot":   TokenCONTAINSNOT,
	"not":           TokenNOT,
	"notin":         TokenNOTIN,
	"false":         TokenFALSE,
	"true":          TokenTRUE,
	"unique":        TokenUNIQUE,
	"uniquecount":   TokenUNIQUECOUNT,
	"null":          TokenNULL,
	"isnotnull":     TokenISNOTNULL,
	"ascending":     TokenASCENDING,
	"descending":    TokenDESCENDING,
}

/*
Special symbols which will always be unique - these will separate unquoted strings
*/
var symbolMap = map[string]LexTokenID{
	"@":  TokenAT,
	">=": TokenGEQ,
	"<=": TokenLEQ,
	"!=": TokenNEQ,
	"=":  TokenEQ,
	">":  TokenGT,
	"<":  TokenLT,
	"(":  TokenLPAREN,
	")":  TokenRPAREN,
	"[":  TokenLBRACK,
	"]":  TokenRBRACK,
	",":  TokenCOMMA,
	"+":  TokenPLUS,
	"-":  TokenMINUS,
	"*":  TokenTIMES,
	"/":  TokenDIV,
	"//": TokenDIVINT,
	"%":  TokenMODINT,
}

// Lexer
// =====

/*
RuneEOF is a special rune which represents the end of the input
*/
const RuneEOF = -1

/*
Function which represents the current state of the lexer and returns the next state
*/
type lexFunc func(*lexer) lexFunc

/*
Lexer data structure
*/
type lexer struct {
	name   string        // Name to identify the input
	input  string        // Input string of the lexer
	pos    int           // Current rune pointer
	line   int           // Current line pointer
	lastnl int           // Last newline position
	width  int           // Width of last rune
	start  int           // Start position of the current read token
	scope  LexTokenID    // Current scope
	tokens chan LexToken // Channel for lexer output
}

/*
FirstWord returns the first word of a given input.
*/
func FirstWord(input string) string {
	var word string
	l := &lexer{"", input, 0, 0, 0, 0, 0, -1, nil}
	if skipWhiteSpace(l) {
		l.startNew()
		lexTextBlock(l, false)
		word = input[l.start:l.pos]
	}
	return word
}

/*
Lex lexes a given input. Returns a channel which contains tokens.
*/
func Lex(name string, input string) chan LexToken {
	l := &lexer{name, input, 0, 0, 0, 0, 0, -1, make(chan LexToken)}
	go l.run()
	return l.tokens
}

/*
LexToList lexes a given input. Returns a list of tokens.
*/
func LexToList(name string, input string) []LexToken {
	var tokens []LexToken
	for t := range Lex(name, input) {
		tokens = append(tokens, t)
	}
	return tokens
}

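// Usage sketch (illustrative query string, not taken from this file):
//
//	for _, t := range LexToList("myquery", "get person where name = 'John'") {
//		fmt.Println(t)
//	}
//
// On success the returned token list ends with a TokenEOF token.
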
/*
Main loop of the lexer.
*/
func (l *lexer) run() {
	if skipWhiteSpace(l) {
		for state := lexToken; state != nil; {
			state = state(l)
			if !skipWhiteSpace(l) {
				break
			}
		}
	}
	close(l.tokens)
}

/*
next returns the next rune in the input and advances the current rune pointer
if the peek flag is not set. If the peek flag is set then the rune pointer
is not advanced.
*/
func (l *lexer) next(peek bool) rune {
	// Check if we reached the end
	if l.pos >= len(l.input) {
		return RuneEOF
	}
	// Decode the next rune
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	if !peek {
		l.width = w
		l.pos += l.width
	}
	return r
}

/*
backup sets the pointer one rune back. Can only be called once per next call.
*/
func (l *lexer) backup() {
	if l.width == -1 {
		panic("Can only backup once per next call")
	}
	l.pos -= l.width
	l.width = -1
}

/*
startNew starts a new token.
*/
func (l *lexer) startNew() {
	l.start = l.pos
}

/*
emitToken passes a token back to the client.
*/
func (l *lexer) emitToken(t LexTokenID) {
	if t == TokenEOF {
		l.emitTokenAndValue(t, "")
		return
	}
	if l.tokens != nil {
		l.tokens <- LexToken{t, l.start, l.input[l.start:l.pos],
			l.line + 1, l.start - l.lastnl + 1}
	}
}

/*
emitTokenAndValue passes a token with a given value back to the client.
*/
func (l *lexer) emitTokenAndValue(t LexTokenID, val string) {
	if l.tokens != nil {
		l.tokens <- LexToken{t, l.start, val, l.line + 1, l.start - l.lastnl + 1}
	}
}

/*
emitError passes an error token back to the client.
*/
func (l *lexer) emitError(msg string) {
	if l.tokens != nil {
		l.tokens <- LexToken{TokenError, l.start, msg, l.line + 1, l.start - l.lastnl + 1}
	}
}

// State functions
// ===============

/*
lexToken is the main entry function for the lexer.
*/
func lexToken(l *lexer) lexFunc {
	// Check if we got a quoted value or a comment
	n1 := l.next(false)
	n2 := l.next(true)
	l.backup()
	if n1 == '#' {
		return skipRestOfLine
	}
	if (n1 == '"' || n1 == '\'') || (n1 == 'r' && (n2 == '"' || n2 == '\'')) {
		return lexValue
	}
	// Lex a block of text and emit any found tokens
	l.startNew()
	lexTextBlock(l, true)
	// Try to lookup the keyword or an unquoted value
	keywordCandidate := strings.ToLower(l.input[l.start:l.pos])
	token, ok := keywordMap[keywordCandidate]
	if !ok {
		token, ok = symbolMap[keywordCandidate]
	}
	if ok {
		// Special start token was found
		l.emitToken(token)
		switch token {
		case TokenGET, TokenLOOKUP:
			l.scope = token
			return lexNodeKind
		}
	} else {
		// An unknown token was found - it must be an unquoted value
		// emit and continue
		l.emitToken(TokenVALUE)
	}
	return lexToken
}

/*
skipRestOfLine skips all characters until the next newline character.
*/
func skipRestOfLine(l *lexer) lexFunc {
	r := l.next(false)
	for r != '\n' && r != RuneEOF {
		r = l.next(false)
	}
	if r == RuneEOF {
		return nil
	}
	return lexToken
}

/*
lexNodeKind lexes a node kind string.
*/
func lexNodeKind(l *lexer) lexFunc {
	l.startNew()
	lexTextBlock(l, false)
	nodeKindCandidate := strings.ToLower(l.input[l.start:l.pos])
	if !stringutil.IsAlphaNumeric(nodeKindCandidate) {
		l.emitError(fmt.Sprintf("Invalid node kind '%v' - can only contain [a-zA-Z0-9_]",
			nodeKindCandidate))
		return nil
	}
	l.emitToken(TokenNODEKIND)
	if l.scope == TokenGET {
		return lexToken
	}
	// In a lookup scope more values follow
	return lexValue
}

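// Illustrative example: in the query get person, the word person is read
// by lexNodeKind and emitted as a TokenNODEKIND token. In a get scope the
// lexer then returns to lexToken; in a lookup scope it expects key values
// next and continues with lexValue.
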
/*
lexValue lexes a value which can describe names, values, regexes, etc.

Values can be declared in different ways:

' ... ' or " ... "
Characters are parsed between quotes (escape sequences are interpreted)

r' ... ' or r" ... "
Characters are parsed plain between quotes
*/
func lexValue(l *lexer) lexFunc {
	l.startNew()
	allowEscapes := false
	endToken := ' '
	r := l.next(false)
	// Check if we have a raw quoted string
	if q := l.next(true); r == 'r' && (q == '"' || q == '\'') {
		endToken = q
		l.next(false)
	} else if r == '"' || r == '\'' {
		allowEscapes = true
		endToken = r
	} else {
		l.emitError("Value expected")
		return nil
	}
	r = l.next(false)
	rprev := ' '
	lLine := l.line
	lLastnl := l.lastnl
	for (!allowEscapes && r != endToken) ||
		(allowEscapes && (r != endToken || rprev == '\\')) {
		if r == '\n' {
			lLine++
			lLastnl = l.pos
		}
		rprev = r
		r = l.next(false)
		if r == RuneEOF {
			l.emitError("Unexpected end while reading value")
			return nil
		}
	}
	if allowEscapes {
		val := l.input[l.start+1 : l.pos-1]
		// Interpret escape sequences right away
		if endToken == '\'' {
			// Escape double quotes in a single quoted string
			val = strings.Replace(val, "\"", "\\\"", -1)
		}
		s, err := strconv.Unquote("\"" + val + "\"")
		if err != nil {
			l.emitError(err.Error() + " while parsing escape sequences")
			return nil
		}
		l.emitTokenAndValue(TokenVALUE, s)
	} else {
		// Skip the leading 'r' and the surrounding quote characters
		l.emitTokenAndValue(TokenVALUE, l.input[l.start+2:l.pos-1])
	}
	// Set newline
	l.line = lLine
	l.lastnl = lLastnl
	return lexToken
}

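// Illustrative examples of the two value forms: the quoted input
// "line1\nline2" is unquoted via strconv.Unquote and yields a value
// containing a real newline, while the raw input r"line1\nline2" yields
// the literal characters line1\nline2 (no escape processing).
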
// Helper functions
// ================

/*
skipWhiteSpace skips any number of whitespace characters. Returns false if the lexer
reaches EOF while skipping whitespace.
*/
func skipWhiteSpace(l *lexer) bool {
	r := l.next(false)
	for unicode.IsSpace(r) || unicode.IsControl(r) || r == RuneEOF {
		if r == '\n' {
			l.line++
			l.lastnl = l.pos
		}
		r = l.next(false)
		if r == RuneEOF {
			l.emitToken(TokenEOF)
			return false
		}
	}
	l.backup()
	return true
}

/*
lexTextBlock lexes a block of text without whitespace. Optionally interprets
all one or two character symbol tokens.
*/
func lexTextBlock(l *lexer, interpretToken bool) {
	r := l.next(false)
	if interpretToken {
		// Check if we start with a known symbol
		nr := l.next(true)
		if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {
			l.next(false)
			return
		}
		if _, ok := symbolMap[strings.ToLower(string(r))]; ok {
			return
		}
	}
	for !unicode.IsSpace(r) && !unicode.IsControl(r) && r != RuneEOF {
		if interpretToken {
			// Check if we find a token in the block
			if _, ok := symbolMap[strings.ToLower(string(r))]; ok {
				l.backup()
				return
			}
			nr := l.next(true)
			if _, ok := symbolMap[strings.ToLower(string(r)+string(nr))]; ok {
				l.backup()
				return
			}
		}
		r = l.next(false)
	}
	if r != RuneEOF {
		l.backup()
	}
}

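// Note on symbol splitting (illustrative): because lexTextBlock stops at
// known symbols and prefers two-rune symbols at the start of a block, an
// input such as a>=b is lexed as the three tokens VALUE(a), >= and
// VALUE(b) rather than as one unquoted value.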