terraform-provider-google/vendor/github.com/hashicorp/hil/scanner/scanner.go
Paddy 961c878e0d Switch to using Go modules. (#2679)
Switch to using Go modules.

This migrates our vendor.json to use Go 1.11's modules system, and
replaces the vendor folder with the output of go mod vendor.

The vendored code should remain basically the same; I believe some
tree shaking of packages and support scripts/licenses/READMEs/etc.
happened.

This also fixes Travis and our Makefile to no longer use govendor.
2018-12-20 17:22:22 -08:00

557 lines
13 KiB
Go

package scanner
import (
"unicode"
"unicode/utf8"
"github.com/hashicorp/hil/ast"
)
// Scan starts tokenizing the given input string in a new goroutine and
// returns the channel on which the resulting Tokens are delivered.
//
// The scanner only partitions the string into meaningful parts; it performs
// no transformation of the raw input. Any further interpretation, such as
// converting INTEGER tokens into real ints or resolving escape sequences in
// LITERAL or STRING tokens, is left to the caller.
//
// Strings in the returned tokens are slices of the original string.
//
// startPos should be ast.InitPos unless the caller knows that this
// interpolation string is embedded in a larger file and knows the position
// of its first character within that file.
//
// The channel is unbuffered, so the scanning goroutine blocks on each token
// until the caller receives it; the channel is closed once the final token
// has been sent.
func Scan(s string, startPos ast.Pos) <-chan *Token {
	tokens := make(chan *Token)
	go scan(s, tokens, startPos)
	return tokens
}
// scan tokenizes s, sending every token to ch and closing ch when done.
//
// It alternates between two modes until it reaches EOF or an INVALID token:
// literal/string scanning (via scanLiteral) and scanning of the contents of
// ${ .. } sequences (via scanInterpolationToken). An INVALID token is always
// followed by a synthetic EOF token, so the stream always ends with EOF.
func scan(s string, ch chan<- *Token, pos ast.Pos) {
	// rest is the not-yet-consumed tail of the input; we slice off the
	// front of it as tokens are produced.
	rest := s

	// depth counts how many ${ .. } sequences we are currently inside, which
	// distinguishes outer literals (LITERAL) from quoted strings (STRING).
	depth := 0

	// syntheticEOF emits the EOF that follows an INVALID token, since
	// scanning any further would most likely just produce more garbage.
	// It reads pos at call time, i.e. the position just past the bad token.
	syntheticEOF := func() {
		ch <- &Token{Type: EOF, Content: "", Pos: pos}
	}

Outer:
	for {
		// Literal processing comes first: the start of the input is always
		// outside of any interpolation sequence.
		litStart := pos
		lit, term := scanLiteral(rest, pos, depth > 0)
		if lit != "" {
			kind := LITERAL
			if depth > 0 {
				kind = STRING
			}
			ch <- &Token{Type: kind, Content: lit, Pos: litStart}
			rest = rest[len(lit):]
		}

		ch <- term
		rest = rest[len(term.Content):]
		// Counting bytes is fine for the column here because the terminator
		// tokens are expected to be plain ASCII.
		pos = term.Pos
		pos.Column += len(term.Content)

		switch term.Type {
		case BEGIN:
			depth++
		case CQUOTE:
			// Nothing special to do; interpolation scanning resumes below.
		case EOF:
			// All done.
			break Outer
		case INVALID:
			syntheticEOF()
			break Outer
		default:
			// scanLiteral never produces any other terminator.
			panic("invalid string/literal terminator")
		}

		// Scan the inside of a ${ .. } sequence. We stay here until a
		// closing } or an opening " sends us back to literal processing.
		for inside := true; inside; {
			tok, consumed, next := scanInterpolationToken(rest, pos)
			ch <- tok
			rest = rest[consumed:]
			pos = next

			switch tok.Type {
			case INVALID:
				syntheticEOF()
				break Outer
			case EOF:
				// Input ended inside a sequence; the parser reports this
				// as a syntax error.
				break Outer
			case END:
				// Unbalanced } would drive the depth negative; clamp it at
				// zero and leave the error reporting to the parser.
				if depth > 0 {
					depth--
				}
				inside = false
			case OQUOTE:
				// Start of a nested quoted string.
				inside = false
			}
		}
	}

	close(ch)
}
// scanInterpolationToken returns the token found at the start of the given
// string, the number of bytes consumed from the string, and the source
// position immediately after the token.
//
// Leading whitespace is skipped silently, so the number of bytes consumed
// can exceed the length of the returned token's content. If only whitespace
// (or nothing) remains, an EOF token is returned.
func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) {
	pos := startPos

	// Skip leading whitespace, tracking line and column as we go.
	skipped := 0
	for skipped < len(s) && byteIsSpace(s[skipped]) {
		if s[skipped] == '\n' {
			pos.Line++
			pos.Column = 1
		} else {
			pos.Column++
		}
		skipped++
	}
	s = s[skipped:]

	// Unexpected end of input inside a sequence.
	if s == "" {
		return &Token{Type: EOF, Content: "", Pos: pos}, skipped, pos
	}

	// tok builds a token of type t from the first n bytes of s.
	tok := func(t TokenType, n int) *Token {
		return &Token{Type: t, Content: s[:n], Pos: pos}
	}
	// pair handles operators that are either two characters long (when the
	// second byte matches) or fall back to a single-character token.
	pair := func(second byte, both, single TokenType) *Token {
		if len(s) >= 2 && s[1] == second {
			return tok(both, 2)
		}
		return tok(single, 1)
	}

	first := s[0]
	var token *Token
	switch first {
	case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':':
		// Simple punctuation that stands for itself in the TokenType
		// enumeration.
		token = tok(TokenType(first), 1)
	case '}':
		token = tok(END, 1)
	case '"':
		token = tok(OQUOTE, 1)
	case '!':
		token = pair('=', NOTEQUAL, BANG)
	case '<':
		token = pair('=', LTE, LT)
	case '>':
		token = pair('=', GTE, GT)
	case '=':
		// A single equals sign is not a valid operator.
		token = pair('=', EQUAL, INVALID)
	case '&':
		token = pair('&', AND, INVALID)
	case '|':
		token = pair('|', OR, INVALID)
	default:
		switch {
		case '0' <= first && first <= '9':
			num, numType := scanNumber(s)
			token = &Token{Type: numType, Content: num, Pos: pos}
		case stringStartsWithIdentifier(s):
			ident, runes := scanIdentifier(s)
			kind := IDENTIFIER
			if ident == "true" || ident == "false" {
				kind = BOOL
			}
			token = &Token{Type: kind, Content: ident, Pos: pos}
			// Identifiers may contain multi-byte characters, so advance the
			// column by runes and return directly instead of using the
			// byte-based bookkeeping below.
			pos.Column += runes
			return token, skipped + len(ident), pos
		default:
			// Unrecognized character: consume one whole rune so that the
			// column advances by exactly one.
			_, width := utf8.DecodeRuneInString(s)
			token = tok(INVALID, width)
			pos.Column++
			return token, skipped + width, pos
		}
	}

	// Every token that reaches this point is pure ASCII, so its byte length
	// equals its column width.
	n := len(token.Content)
	pos.Column += n
	return token, skipped + n, pos
}
// scanLiteral returns the (possibly empty) prefix of s that forms a literal,
// together with the token that terminates that literal.
//
// The terminator is BEGIN for "${", CQUOTE for a closing quote (only when
// nested is true), or EOF at the end of the input. When nested is true and
// the input ends before a closing quote, the literal is reported as empty
// and the whole of s is returned as an INVALID token positioned at startPos.
func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) {
	cur := startPos
	end := 0

	for end < len(s) {
		c := s[end]
		hasFollow := end+1 < len(s)

		if c == '$' && hasFollow {
			switch s[end+1] {
			case '{':
				return s[:end], &Token{
					Type:    BEGIN,
					Content: s[end : end+2],
					Pos:     cur,
				}
			case '$':
				// "$$" escapes the special meaning of "$", so both
				// characters belong to the literal.
				cur.Column += 2
				end += 2
				continue
			}
		}

		// Handling that applies only inside quoted strings.
		if nested {
			if c == '"' {
				return s[:end], &Token{
					Type:    CQUOTE,
					Content: s[end : end+1],
					Pos:     cur,
				}
			}

			// The scanner only makes sure an escaped quote (or an escaped
			// backslash preceding a quote) does not end the string. Full
			// escape-sequence handling is left to the parser, which can
			// produce better error messages.
			if c == '\\' && hasFollow {
				if f := s[end+1]; f == '"' || f == '\\' {
					cur.Column += 2
					end += 2
					continue
				}
			}
		}

		if c == '\n' {
			cur.Line++
			cur.Column = 1
			end++
			continue
		}

		// Columns count runes, so consume one whole UTF-8 sequence.
		_, width := utf8.DecodeRuneInString(s[end:])
		cur.Column++
		end += width
	}

	if nested {
		// The input ended in the middle of a quoted string, so what we
		// scanned is not a valid token.
		return "", &Token{
			Type:    INVALID,
			Content: s,
			Pos:     startPos,
		}
	}

	return s[:end], &Token{
		Type:    EOF,
		Content: "",
		Pos:     cur,
	}
}
// scanNumber returns the prefix of s that looks like a number, along with
// the type of number it represents: INTEGER or FLOAT.
//
// Only basic character analysis is done: a number is a run of ASCII digits
// containing at most one period, and the presence of a period makes it a
// FLOAT. A period that is immediately followed by a character that is
// neither a digit nor a period is not part of the number, so "1.foo" yields
// "1" as an INTEGER. A period at the very end of s, or one followed by a
// second period, is kept in the returned prefix ("1." and "1.." both yield
// "1." as a FLOAT). Validating the form and range of the number is the
// parser's responsibility.
//
// If s does not begin with a digit or a period, the returned prefix is empty.
func scanNumber(s string) (string, TokenType) {
	period := -1 // byte offset of the period seen so far, or -1 for none
	byteLen := 0
	numType := INTEGER
	for byteLen < len(s) {
		next := s[byteLen]
		if next != '.' && (next < '0' || next > '9') {
			// If the previous byte was a period then this is not a float
			// after all: it is an integer followed by a period. The
			// period >= 0 guard matters when no period has been seen and
			// byteLen is still zero; without it -1 == 0-1 would match and
			// drive byteLen negative, panicking on the slice below.
			if period >= 0 && period == byteLen-1 {
				byteLen--
				numType = INTEGER
			}
			break
		}
		if next == '.' {
			// A second period ends the number.
			if period >= 0 {
				break
			}
			period = byteLen
			numType = FLOAT
		}
		byteLen++
	}
	return s[:byteLen], numType
}
// scanIdentifier returns the prefix of s that forms a valid identifier,
// along with the length of that prefix in runes.
//
// Identifier characters are letters, numbers and marks as classified by
// Unicode, plus '_', '-', '.' and '*'. A '*' is only accepted directly after
// a '.', and if it is then followed by another identifier character that is
// not a '.', both the '*' and the '.' before it are excluded from the result.
//
// Identifiers may contain UTF-8 encoded non-Latin letters, in which case the
// returned rune length is smaller than the byte length of the returned
// string. The rune length always describes exactly the returned prefix.
func scanIdentifier(s string) (string, int) {
	byteLen := 0
	runeLen := 0
	for byteLen < len(s) {
		nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
		if !(nextRune == '_' ||
			nextRune == '-' ||
			nextRune == '.' ||
			nextRune == '*' ||
			unicode.IsNumber(nextRune) ||
			unicode.IsLetter(nextRune) ||
			unicode.IsMark(nextRune)) {
			break
		}

		// A star must directly follow a period to be part of the same
		// identifier. A star at the very start has no predecessor, so it
		// can never qualify (and must not index s[-1]).
		if nextRune == '*' && (byteLen == 0 || s[byteLen-1] != '.') {
			break
		}

		// If the previous character was a star then the current one must be
		// a period. Otherwise back out of the star, and of the period that
		// introduced it, rewinding the rune count along with the byte count
		// so that both describe the same prefix. Both characters are ASCII,
		// so each accounts for exactly one byte and one rune.
		if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
			byteLen--
			runeLen--
			if byteLen > 0 && s[byteLen-1] == '.' {
				byteLen--
				runeLen--
			}
			break
		}

		byteLen += size
		runeLen++
	}
	return s[:byteLen], runeLen
}
// byteIsSpace reports whether b is one of the whitespace bytes recognized
// inside interpolation sequences: space, tab, carriage return or newline.
// This is deliberately narrower than Unicode's notion of whitespace.
func byteIsSpace(b byte) bool {
	return b == ' ' || b == '\t' || b == '\r' || b == '\n'
}
// stringStartsWithIdentifier reports whether s begins with a character that
// may legally start an identifier: an underscore or any character that
// Unicode classifies as a letter. An empty string never qualifies.
func stringStartsWithIdentifier(s string) bool {
	if s == "" {
		return false
	}

	switch b := s[0]; {
	case b == '_', 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z':
		// Cheap ASCII cases, no decoding required.
		return true
	case !utf8.RuneStart(b):
		// A stray continuation byte cannot begin a character at all.
		return false
	}

	// The first byte may start a multi-byte sequence, which in turn may
	// encode a Unicode letter.
	r, _ := utf8.DecodeRuneInString(s)
	return unicode.IsLetter(r)
}