scanner.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270
  1. package unstable
  2. import "github.com/pelletier/go-toml/v2/internal/characters"
  3. func scanFollows(b []byte, pattern string) bool {
  4. n := len(pattern)
  5. return len(b) >= n && string(b[:n]) == pattern
  6. }
  7. func scanFollowsMultilineBasicStringDelimiter(b []byte) bool {
  8. return scanFollows(b, `"""`)
  9. }
  10. func scanFollowsMultilineLiteralStringDelimiter(b []byte) bool {
  11. return scanFollows(b, `'''`)
  12. }
  13. func scanFollowsTrue(b []byte) bool {
  14. return scanFollows(b, `true`)
  15. }
  16. func scanFollowsFalse(b []byte) bool {
  17. return scanFollows(b, `false`)
  18. }
  19. func scanFollowsInf(b []byte) bool {
  20. return scanFollows(b, `inf`)
  21. }
  22. func scanFollowsNan(b []byte) bool {
  23. return scanFollows(b, `nan`)
  24. }
  25. func scanUnquotedKey(b []byte) ([]byte, []byte) {
  26. // unquoted-key = 1*( ALPHA / DIGIT / %x2D / %x5F ) ; A-Z / a-z / 0-9 / - / _
  27. for i := 0; i < len(b); i++ {
  28. if !isUnquotedKeyChar(b[i]) {
  29. return b[:i], b[i:]
  30. }
  31. }
  32. return b, b[len(b):]
  33. }
  34. func isUnquotedKeyChar(r byte) bool {
  35. return (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '_'
  36. }
  37. func scanLiteralString(b []byte) ([]byte, []byte, error) {
  38. // literal-string = apostrophe *literal-char apostrophe
  39. // apostrophe = %x27 ; ' apostrophe
  40. // literal-char = %x09 / %x20-26 / %x28-7E / non-ascii
  41. for i := 1; i < len(b); {
  42. switch b[i] {
  43. case '\'':
  44. return b[:i+1], b[i+1:], nil
  45. case '\n', '\r':
  46. return nil, nil, NewParserError(b[i:i+1], "literal strings cannot have new lines")
  47. }
  48. size := characters.Utf8ValidNext(b[i:])
  49. if size == 0 {
  50. return nil, nil, NewParserError(b[i:i+1], "invalid character")
  51. }
  52. i += size
  53. }
  54. return nil, nil, NewParserError(b[len(b):], "unterminated literal string")
  55. }
  56. func scanMultilineLiteralString(b []byte) ([]byte, []byte, error) {
  57. // ml-literal-string = ml-literal-string-delim [ newline ] ml-literal-body
  58. // ml-literal-string-delim
  59. // ml-literal-string-delim = 3apostrophe
  60. // ml-literal-body = *mll-content *( mll-quotes 1*mll-content ) [ mll-quotes ]
  61. //
  62. // mll-content = mll-char / newline
  63. // mll-char = %x09 / %x20-26 / %x28-7E / non-ascii
  64. // mll-quotes = 1*2apostrophe
  65. for i := 3; i < len(b); {
  66. switch b[i] {
  67. case '\'':
  68. if scanFollowsMultilineLiteralStringDelimiter(b[i:]) {
  69. i += 3
  70. // At that point we found 3 apostrophe, and i is the
  71. // index of the byte after the third one. The scanner
  72. // needs to be eager, because there can be an extra 2
  73. // apostrophe that can be accepted at the end of the
  74. // string.
  75. if i >= len(b) || b[i] != '\'' {
  76. return b[:i], b[i:], nil
  77. }
  78. i++
  79. if i >= len(b) || b[i] != '\'' {
  80. return b[:i], b[i:], nil
  81. }
  82. i++
  83. if i < len(b) && b[i] == '\'' {
  84. return nil, nil, NewParserError(b[i-3:i+1], "''' not allowed in multiline literal string")
  85. }
  86. return b[:i], b[i:], nil
  87. }
  88. case '\r':
  89. if len(b) < i+2 {
  90. return nil, nil, NewParserError(b[len(b):], `need a \n after \r`)
  91. }
  92. if b[i+1] != '\n' {
  93. return nil, nil, NewParserError(b[i:i+2], `need a \n after \r`)
  94. }
  95. i += 2 // skip the \n
  96. continue
  97. }
  98. size := characters.Utf8ValidNext(b[i:])
  99. if size == 0 {
  100. return nil, nil, NewParserError(b[i:i+1], "invalid character")
  101. }
  102. i += size
  103. }
  104. return nil, nil, NewParserError(b[len(b):], `multiline literal string not terminated by '''`)
  105. }
  106. func scanWindowsNewline(b []byte) ([]byte, []byte, error) {
  107. const lenCRLF = 2
  108. if len(b) < lenCRLF {
  109. return nil, nil, NewParserError(b, "windows new line expected")
  110. }
  111. if b[1] != '\n' {
  112. return nil, nil, NewParserError(b, `windows new line should be \r\n`)
  113. }
  114. return b[:lenCRLF], b[lenCRLF:], nil
  115. }
  116. func scanWhitespace(b []byte) ([]byte, []byte) {
  117. for i := 0; i < len(b); i++ {
  118. switch b[i] {
  119. case ' ', '\t':
  120. continue
  121. default:
  122. return b[:i], b[i:]
  123. }
  124. }
  125. return b, b[len(b):]
  126. }
  127. func scanComment(b []byte) ([]byte, []byte, error) {
  128. // comment-start-symbol = %x23 ; #
  129. // non-ascii = %x80-D7FF / %xE000-10FFFF
  130. // non-eol = %x09 / %x20-7F / non-ascii
  131. //
  132. // comment = comment-start-symbol *non-eol
  133. for i := 1; i < len(b); {
  134. if b[i] == '\n' {
  135. return b[:i], b[i:], nil
  136. }
  137. if b[i] == '\r' {
  138. if i+1 < len(b) && b[i+1] == '\n' {
  139. return b[:i+1], b[i+1:], nil
  140. }
  141. return nil, nil, NewParserError(b[i:i+1], "invalid character in comment")
  142. }
  143. size := characters.Utf8ValidNext(b[i:])
  144. if size == 0 {
  145. return nil, nil, NewParserError(b[i:i+1], "invalid character in comment")
  146. }
  147. i += size
  148. }
  149. return b, b[len(b):], nil
  150. }
  151. func scanBasicString(b []byte) ([]byte, bool, []byte, error) {
  152. // basic-string = quotation-mark *basic-char quotation-mark
  153. // quotation-mark = %x22 ; "
  154. // basic-char = basic-unescaped / escaped
  155. // basic-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
  156. // escaped = escape escape-seq-char
  157. escaped := false
  158. i := 1
  159. for ; i < len(b); i++ {
  160. switch b[i] {
  161. case '"':
  162. return b[:i+1], escaped, b[i+1:], nil
  163. case '\n', '\r':
  164. return nil, escaped, nil, NewParserError(b[i:i+1], "basic strings cannot have new lines")
  165. case '\\':
  166. if len(b) < i+2 {
  167. return nil, escaped, nil, NewParserError(b[i:i+1], "need a character after \\")
  168. }
  169. escaped = true
  170. i++ // skip the next character
  171. }
  172. }
  173. return nil, escaped, nil, NewParserError(b[len(b):], `basic string not terminated by "`)
  174. }
  175. func scanMultilineBasicString(b []byte) ([]byte, bool, []byte, error) {
  176. // ml-basic-string = ml-basic-string-delim [ newline ] ml-basic-body
  177. // ml-basic-string-delim
  178. // ml-basic-string-delim = 3quotation-mark
  179. // ml-basic-body = *mlb-content *( mlb-quotes 1*mlb-content ) [ mlb-quotes ]
  180. //
  181. // mlb-content = mlb-char / newline / mlb-escaped-nl
  182. // mlb-char = mlb-unescaped / escaped
  183. // mlb-quotes = 1*2quotation-mark
  184. // mlb-unescaped = wschar / %x21 / %x23-5B / %x5D-7E / non-ascii
  185. // mlb-escaped-nl = escape ws newline *( wschar / newline )
  186. escaped := false
  187. i := 3
  188. for ; i < len(b); i++ {
  189. switch b[i] {
  190. case '"':
  191. if scanFollowsMultilineBasicStringDelimiter(b[i:]) {
  192. i += 3
  193. // At that point we found 3 apostrophe, and i is the
  194. // index of the byte after the third one. The scanner
  195. // needs to be eager, because there can be an extra 2
  196. // apostrophe that can be accepted at the end of the
  197. // string.
  198. if i >= len(b) || b[i] != '"' {
  199. return b[:i], escaped, b[i:], nil
  200. }
  201. i++
  202. if i >= len(b) || b[i] != '"' {
  203. return b[:i], escaped, b[i:], nil
  204. }
  205. i++
  206. if i < len(b) && b[i] == '"' {
  207. return nil, escaped, nil, NewParserError(b[i-3:i+1], `""" not allowed in multiline basic string`)
  208. }
  209. return b[:i], escaped, b[i:], nil
  210. }
  211. case '\\':
  212. if len(b) < i+2 {
  213. return nil, escaped, nil, NewParserError(b[len(b):], "need a character after \\")
  214. }
  215. escaped = true
  216. i++ // skip the next character
  217. case '\r':
  218. if len(b) < i+2 {
  219. return nil, escaped, nil, NewParserError(b[len(b):], `need a \n after \r`)
  220. }
  221. if b[i+1] != '\n' {
  222. return nil, escaped, nil, NewParserError(b[i:i+2], `need a \n after \r`)
  223. }
  224. i++ // skip the \n
  225. }
  226. }
  227. return nil, escaped, nil, NewParserError(b[len(b):], `multiline basic string not terminated by """`)
  228. }