lex.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. // Copyright 2013-2022 Frank Schroeder. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //
  5. // Parts of the lexer are from the template/text/parser package
  6. // For these parts the following applies:
  7. //
  8. // Copyright 2011 The Go Authors. All rights reserved.
  9. // Use of this source code is governed by a BSD-style
  10. // license that can be found in the LICENSE file of the go 1.2
  11. // distribution.
  12. package properties
  13. import (
  14. "fmt"
  15. "strconv"
  16. "strings"
  17. "unicode/utf8"
  18. )
  19. // item represents a token or text string returned from the scanner.
  20. type item struct {
  21. typ itemType // The type of this item.
  22. pos int // The starting position, in bytes, of this item in the input string.
  23. val string // The value of this item.
  24. }
  25. func (i item) String() string {
  26. switch {
  27. case i.typ == itemEOF:
  28. return "EOF"
  29. case i.typ == itemError:
  30. return i.val
  31. case len(i.val) > 10:
  32. return fmt.Sprintf("%.10q...", i.val)
  33. }
  34. return fmt.Sprintf("%q", i.val)
  35. }
  36. // itemType identifies the type of lex items.
  37. type itemType int
  38. const (
  39. itemError itemType = iota // error occurred; value is text of error
  40. itemEOF
  41. itemKey // a key
  42. itemValue // a value
  43. itemComment // a comment
  44. )
  45. // defines a constant for EOF
  46. const eof = -1
  47. // permitted whitespace characters space, FF and TAB
  48. const whitespace = " \f\t"
  49. // stateFn represents the state of the scanner as a function that returns the next state.
  50. type stateFn func(*lexer) stateFn
  51. // lexer holds the state of the scanner.
  52. type lexer struct {
  53. input string // the string being scanned
  54. state stateFn // the next lexing function to enter
  55. pos int // current position in the input
  56. start int // start position of this item
  57. width int // width of last rune read from input
  58. lastPos int // position of most recent item returned by nextItem
  59. runes []rune // scanned runes for this item
  60. items chan item // channel of scanned items
  61. }
  62. // next returns the next rune in the input.
  63. func (l *lexer) next() rune {
  64. if l.pos >= len(l.input) {
  65. l.width = 0
  66. return eof
  67. }
  68. r, w := utf8.DecodeRuneInString(l.input[l.pos:])
  69. l.width = w
  70. l.pos += l.width
  71. return r
  72. }
  73. // peek returns but does not consume the next rune in the input.
  74. func (l *lexer) peek() rune {
  75. r := l.next()
  76. l.backup()
  77. return r
  78. }
  79. // backup steps back one rune. Can only be called once per call of next.
  80. func (l *lexer) backup() {
  81. l.pos -= l.width
  82. }
  83. // emit passes an item back to the client.
  84. func (l *lexer) emit(t itemType) {
  85. i := item{t, l.start, string(l.runes)}
  86. l.items <- i
  87. l.start = l.pos
  88. l.runes = l.runes[:0]
  89. }
  90. // ignore skips over the pending input before this point.
  91. func (l *lexer) ignore() {
  92. l.start = l.pos
  93. }
  94. // appends the rune to the current value
  95. func (l *lexer) appendRune(r rune) {
  96. l.runes = append(l.runes, r)
  97. }
  98. // accept consumes the next rune if it's from the valid set.
  99. func (l *lexer) accept(valid string) bool {
  100. if strings.ContainsRune(valid, l.next()) {
  101. return true
  102. }
  103. l.backup()
  104. return false
  105. }
  106. // acceptRun consumes a run of runes from the valid set.
  107. func (l *lexer) acceptRun(valid string) {
  108. for strings.ContainsRune(valid, l.next()) {
  109. }
  110. l.backup()
  111. }
  112. // lineNumber reports which line we're on, based on the position of
  113. // the previous item returned by nextItem. Doing it this way
  114. // means we don't have to worry about peek double counting.
  115. func (l *lexer) lineNumber() int {
  116. return 1 + strings.Count(l.input[:l.lastPos], "\n")
  117. }
  118. // errorf returns an error token and terminates the scan by passing
  119. // back a nil pointer that will be the next state, terminating l.nextItem.
  120. func (l *lexer) errorf(format string, args ...interface{}) stateFn {
  121. l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
  122. return nil
  123. }
  124. // nextItem returns the next item from the input.
  125. func (l *lexer) nextItem() item {
  126. i := <-l.items
  127. l.lastPos = i.pos
  128. return i
  129. }
  130. // lex creates a new scanner for the input string.
  131. func lex(input string) *lexer {
  132. l := &lexer{
  133. input: input,
  134. items: make(chan item),
  135. runes: make([]rune, 0, 32),
  136. }
  137. go l.run()
  138. return l
  139. }
  140. // run runs the state machine for the lexer.
  141. func (l *lexer) run() {
  142. for l.state = lexBeforeKey(l); l.state != nil; {
  143. l.state = l.state(l)
  144. }
  145. }
  146. // state functions
  147. // lexBeforeKey scans until a key begins.
  148. func lexBeforeKey(l *lexer) stateFn {
  149. switch r := l.next(); {
  150. case isEOF(r):
  151. l.emit(itemEOF)
  152. return nil
  153. case isEOL(r):
  154. l.ignore()
  155. return lexBeforeKey
  156. case isComment(r):
  157. return lexComment
  158. case isWhitespace(r):
  159. l.ignore()
  160. return lexBeforeKey
  161. default:
  162. l.backup()
  163. return lexKey
  164. }
  165. }
  166. // lexComment scans a comment line. The comment character has already been scanned.
  167. func lexComment(l *lexer) stateFn {
  168. l.acceptRun(whitespace)
  169. l.ignore()
  170. for {
  171. switch r := l.next(); {
  172. case isEOF(r):
  173. l.ignore()
  174. l.emit(itemEOF)
  175. return nil
  176. case isEOL(r):
  177. l.emit(itemComment)
  178. return lexBeforeKey
  179. default:
  180. l.appendRune(r)
  181. }
  182. }
  183. }
  184. // lexKey scans the key up to a delimiter
  185. func lexKey(l *lexer) stateFn {
  186. var r rune
  187. Loop:
  188. for {
  189. switch r = l.next(); {
  190. case isEscape(r):
  191. err := l.scanEscapeSequence()
  192. if err != nil {
  193. return l.errorf(err.Error())
  194. }
  195. case isEndOfKey(r):
  196. l.backup()
  197. break Loop
  198. case isEOF(r):
  199. break Loop
  200. default:
  201. l.appendRune(r)
  202. }
  203. }
  204. if len(l.runes) > 0 {
  205. l.emit(itemKey)
  206. }
  207. if isEOF(r) {
  208. l.emit(itemEOF)
  209. return nil
  210. }
  211. return lexBeforeValue
  212. }
  213. // lexBeforeValue scans the delimiter between key and value.
  214. // Leading and trailing whitespace is ignored.
  215. // We expect to be just after the key.
  216. func lexBeforeValue(l *lexer) stateFn {
  217. l.acceptRun(whitespace)
  218. l.accept(":=")
  219. l.acceptRun(whitespace)
  220. l.ignore()
  221. return lexValue
  222. }
  223. // lexValue scans text until the end of the line. We expect to be just after the delimiter.
  224. func lexValue(l *lexer) stateFn {
  225. for {
  226. switch r := l.next(); {
  227. case isEscape(r):
  228. if isEOL(l.peek()) {
  229. l.next()
  230. l.acceptRun(whitespace)
  231. } else {
  232. err := l.scanEscapeSequence()
  233. if err != nil {
  234. return l.errorf(err.Error())
  235. }
  236. }
  237. case isEOL(r):
  238. l.emit(itemValue)
  239. l.ignore()
  240. return lexBeforeKey
  241. case isEOF(r):
  242. l.emit(itemValue)
  243. l.emit(itemEOF)
  244. return nil
  245. default:
  246. l.appendRune(r)
  247. }
  248. }
  249. }
  250. // scanEscapeSequence scans either one of the escaped characters
  251. // or a unicode literal. We expect to be after the escape character.
  252. func (l *lexer) scanEscapeSequence() error {
  253. switch r := l.next(); {
  254. case isEscapedCharacter(r):
  255. l.appendRune(decodeEscapedCharacter(r))
  256. return nil
  257. case atUnicodeLiteral(r):
  258. return l.scanUnicodeLiteral()
  259. case isEOF(r):
  260. return fmt.Errorf("premature EOF")
  261. // silently drop the escape character and append the rune as is
  262. default:
  263. l.appendRune(r)
  264. return nil
  265. }
  266. }
  267. // scans a unicode literal in the form \uXXXX. We expect to be after the \u.
  268. func (l *lexer) scanUnicodeLiteral() error {
  269. // scan the digits
  270. d := make([]rune, 4)
  271. for i := 0; i < 4; i++ {
  272. d[i] = l.next()
  273. if d[i] == eof || !strings.ContainsRune("0123456789abcdefABCDEF", d[i]) {
  274. return fmt.Errorf("invalid unicode literal")
  275. }
  276. }
  277. // decode the digits into a rune
  278. r, err := strconv.ParseInt(string(d), 16, 0)
  279. if err != nil {
  280. return err
  281. }
  282. l.appendRune(rune(r))
  283. return nil
  284. }
  285. // decodeEscapedCharacter returns the unescaped rune. We expect to be after the escape character.
  286. func decodeEscapedCharacter(r rune) rune {
  287. switch r {
  288. case 'f':
  289. return '\f'
  290. case 'n':
  291. return '\n'
  292. case 'r':
  293. return '\r'
  294. case 't':
  295. return '\t'
  296. default:
  297. return r
  298. }
  299. }
  300. // atUnicodeLiteral reports whether we are at a unicode literal.
  301. // The escape character has already been consumed.
  302. func atUnicodeLiteral(r rune) bool {
  303. return r == 'u'
  304. }
  305. // isComment reports whether we are at the start of a comment.
  306. func isComment(r rune) bool {
  307. return r == '#' || r == '!'
  308. }
  309. // isEndOfKey reports whether the rune terminates the current key.
  310. func isEndOfKey(r rune) bool {
  311. return strings.ContainsRune(" \f\t\r\n:=", r)
  312. }
  313. // isEOF reports whether we are at EOF.
  314. func isEOF(r rune) bool {
  315. return r == eof
  316. }
  317. // isEOL reports whether we are at a new line character.
  318. func isEOL(r rune) bool {
  319. return r == '\n' || r == '\r'
  320. }
  321. // isEscape reports whether the rune is the escape character which
  322. // prefixes unicode literals and other escaped characters.
  323. func isEscape(r rune) bool {
  324. return r == '\\'
  325. }
  326. // isEscapedCharacter reports whether we are at one of the characters that need escaping.
  327. // The escape character has already been consumed.
  328. func isEscapedCharacter(r rune) bool {
  329. return strings.ContainsRune(" :=fnrt", r)
  330. }
  331. // isWhitespace reports whether the rune is a whitespace character.
  332. func isWhitespace(r rune) bool {
  333. return strings.ContainsRune(whitespace, r)
  334. }