// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package text

import (
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"unicode/utf8"

	"google.golang.org/protobuf/internal/errors"
)

// Decoder is a token-based textproto decoder.
type Decoder struct {
	// lastCall is the last method called, either readCall or peekCall.
	// Initial value is readCall.
	lastCall call

	// lastToken contains the last read token.
	lastToken Token

	// lastErr contains the last read error.
	lastErr error

	// openStack is a stack containing the byte characters for MessageOpen and
	// ListOpen kinds. The top of stack represents the message or the list that
	// the current token is nested in. An empty stack means the current token is
	// at the top level message. The characters '{' and '<' both represent the
	// MessageOpen kind.
	openStack []byte

	// orig is used in reporting line and column.
	orig []byte
	// in contains the unconsumed input.
	in []byte
}

// NewDecoder returns a Decoder to read the given []byte.
func NewDecoder(b []byte) *Decoder {
	return &Decoder{orig: b, in: b}
}

// ErrUnexpectedEOF means that EOF was encountered in the middle of the input.
var ErrUnexpectedEOF = errors.New("%v", io.ErrUnexpectedEOF)

// call specifies which Decoder method was invoked.
type call uint8

const (
	readCall call = iota
	peekCall
)

// Peek looks ahead and returns the next token and error without advancing a read.
func (d *Decoder) Peek() (Token, error) {
	defer func() { d.lastCall = peekCall }()
	if d.lastCall == readCall {
		d.lastToken, d.lastErr = d.Read()
	}
	return d.lastToken, d.lastErr
}

// Read returns the next token.
// It will return an error if there is no valid token.
func (d *Decoder) Read() (Token, error) {
	defer func() { d.lastCall = readCall }()
	if d.lastCall == peekCall {
		return d.lastToken, d.lastErr
	}

	tok, err := d.parseNext(d.lastToken.kind)
	if err != nil {
		return Token{}, err
	}

	switch tok.kind {
	case comma, semicolon:
		tok, err = d.parseNext(tok.kind)
		if err != nil {
			return Token{}, err
		}
	}
	d.lastToken = tok
	return tok, nil
}

const (
	mismatchedFmt = "mismatched close character %q"
	unexpectedFmt = "unexpected character %q"
)

// parseNext parses the next Token based on the given last kind.
func (d *Decoder) parseNext(lastKind Kind) (Token, error) {
	// Trim leading spaces.
	d.consume(0)
	isEOF := false
	if len(d.in) == 0 {
		isEOF = true
	}

	switch lastKind {
	case EOF:
		return d.consumeToken(EOF, 0, 0), nil

	case bof:
		// Start of top level message. Next token can be EOF or Name.
		if isEOF {
			return d.consumeToken(EOF, 0, 0), nil
		}
		return d.parseFieldName()

	case Name:
		// Next token can be MessageOpen, ListOpen or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		case '[':
			d.pushOpenStack(ch)
			return d.consumeToken(ListOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case Scalar:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch d.in[0] {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
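			// For example, in `m { foo: 1; bar: 2 }` the scalar 1 is followed by a
			// separator and another field name, while the scalar 2 is followed by
			// the closing '}'.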
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case ']':
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case MessageOpen:
		// Next token can be MessageClose or Name.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		_, closeCh := d.currentOpenKind()
		switch ch := d.in[0]; ch {
		case closeCh:
			d.popOpenStack()
			return d.consumeToken(MessageClose, 1, 0), nil
		case otherCloseChar[closeCh]:
			return Token{}, d.newSyntaxError(mismatchedFmt, ch)
		default:
			return d.parseFieldName()
		}

	case MessageClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			// Next token can be ListClose or comma.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(ListClose, 1, 0), nil
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			default:
				return Token{}, d.newSyntaxError(unexpectedFmt, ch)
			}
		}

	case ListOpen:
		// Next token can be ListClose, MessageStart or Scalar.
		if isEOF {
			return Token{}, ErrUnexpectedEOF
		}
		switch ch := d.in[0]; ch {
		case ']':
			d.popOpenStack()
			return d.consumeToken(ListClose, 1, 0), nil
		case '{', '<':
			d.pushOpenStack(ch)
			return d.consumeToken(MessageOpen, 1, 0), nil
		default:
			return d.parseScalar()
		}

	case ListClose:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
			// Next token can be EOF, comma, semicolon or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			switch ch := d.in[0]; ch {
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		case MessageOpen:
			// Next token can be MessageClose, comma, semicolon or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			case ',':
				return d.consumeToken(comma, 1, 0), nil
			case ';':
				return d.consumeToken(semicolon, 1, 0), nil
			default:
				return d.parseFieldName()
			}

		default:
			// It is not possible to have this case. Let it panic below.
		}

	case comma, semicolon:
		openKind, closeCh := d.currentOpenKind()
		switch openKind {
		case bof:
			// Top level message.
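			// After a top-level separator, e.g. the semicolon in `foo: 1; bar: 2`,
			// only another field or the end of input may follow.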
			// Next token can be EOF or Name.
			if isEOF {
				return d.consumeToken(EOF, 0, 0), nil
			}
			return d.parseFieldName()

		case MessageOpen:
			// Next token can be MessageClose or Name.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case closeCh:
				d.popOpenStack()
				return d.consumeToken(MessageClose, 1, 0), nil
			case otherCloseChar[closeCh]:
				return Token{}, d.newSyntaxError(mismatchedFmt, ch)
			default:
				return d.parseFieldName()
			}

		case ListOpen:
			if lastKind == semicolon {
				// It is not possible to have this case as logic here
				// should not have produced a semicolon Token when inside a
				// list. Let it panic below.
				break
			}
			// Next token can be MessageOpen or Scalar.
			if isEOF {
				return Token{}, ErrUnexpectedEOF
			}
			switch ch := d.in[0]; ch {
			case '{', '<':
				d.pushOpenStack(ch)
				return d.consumeToken(MessageOpen, 1, 0), nil
			default:
				return d.parseScalar()
			}
		}
	}

	line, column := d.Position(len(d.orig) - len(d.in))
	panic(fmt.Sprintf("Decoder.parseNext: bug at handling line %d:%d with lastKind=%v", line, column, lastKind))
}

var otherCloseChar = map[byte]byte{
	'}': '>',
	'>': '}',
}

// currentOpenKind indicates whether the current position is inside a message,
// list or top-level message by returning MessageOpen, ListOpen or bof
// respectively. If the returned kind is either MessageOpen or ListOpen, it
// also returns the corresponding closing character.
func (d *Decoder) currentOpenKind() (Kind, byte) {
	if len(d.openStack) == 0 {
		return bof, 0
	}
	openCh := d.openStack[len(d.openStack)-1]
	switch openCh {
	case '{':
		return MessageOpen, '}'
	case '<':
		return MessageOpen, '>'
	case '[':
		return ListOpen, ']'
	}
	panic(fmt.Sprintf("Decoder: openStack contains invalid byte %s", string(openCh)))
}

func (d *Decoder) pushOpenStack(ch byte) {
	d.openStack = append(d.openStack, ch)
}

func (d *Decoder) popOpenStack() {
	d.openStack = d.openStack[:len(d.openStack)-1]
}

// parseFieldName parses the field name and separator.
func (d *Decoder) parseFieldName() (tok Token, err error) {
	defer func() {
		if err == nil && d.tryConsumeChar(':') {
			tok.attrs |= hasSeparator
		}
	}()

	// Extension or Any type URL.
	if d.in[0] == '[' {
		return d.parseTypeName()
	}

	// Identifier.
	if size := parseIdent(d.in, false); size > 0 {
		return d.consumeToken(Name, size, uint8(IdentName)), nil
	}

	// Field number. Identify if the input is a valid number that is not
	// negative and is a decimal integer within 32-bit range.
	if num := parseNumber(d.in); num.size > 0 {
		if !num.neg && num.kind == numDec {
			if _, err := strconv.ParseInt(string(d.in[:num.size]), 10, 32); err == nil {
				return d.consumeToken(Name, num.size, uint8(FieldNumber)), nil
			}
		}
		return Token{}, d.newSyntaxError("invalid field number: %s", d.in[:num.size])
	}

	return Token{}, d.newSyntaxError("invalid field name: %s", errRegexp.Find(d.in))
}

// parseTypeName parses an Any type URL or extension field name. The name is
// enclosed in [ and ] characters. The C++ parser does not handle many legal URL
// strings. This implementation is more liberal and allows for the pattern
// `^[-_a-zA-Z0-9]+([./][-_a-zA-Z0-9]+)*`. Whitespace and comments are allowed
// in between [ ], '.', '/' and the sub names.
func (d *Decoder) parseTypeName() (Token, error) {
	startPos := len(d.orig) - len(d.in)
	// Use alias s to advance first in order to use d.in for error handling.
	// Caller already checks for [ as first character.
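	// For example, `[my.pkg.ext_field]` names an extension field and
	// `[type.googleapis.com/my.pkg.SomeMessage]` is an Any type URL; both forms
	// are returned as a single Name token carrying the TypeName attribute.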
	s := consume(d.in[1:], 0)
	if len(s) == 0 {
		return Token{}, ErrUnexpectedEOF
	}

	var name []byte
	for len(s) > 0 && isTypeNameChar(s[0]) {
		name = append(name, s[0])
		s = s[1:]
	}
	s = consume(s, 0)

	var closed bool
	for len(s) > 0 && !closed {
		switch {
		case s[0] == ']':
			s = s[1:]
			closed = true

		case s[0] == '/', s[0] == '.':
			if len(name) > 0 && (name[len(name)-1] == '/' || name[len(name)-1] == '.') {
				return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
					d.orig[startPos:len(d.orig)-len(s)+1])
			}
			name = append(name, s[0])
			s = s[1:]
			s = consume(s, 0)
			for len(s) > 0 && isTypeNameChar(s[0]) {
				name = append(name, s[0])
				s = s[1:]
			}
			s = consume(s, 0)

		default:
			return Token{}, d.newSyntaxError(
				"invalid type URL/extension field name: %s", d.orig[startPos:len(d.orig)-len(s)+1])
		}
	}

	if !closed {
		return Token{}, ErrUnexpectedEOF
	}

	// First character cannot be '.'. Last character cannot be '.' or '/'.
	size := len(name)
	if size == 0 || name[0] == '.' || name[size-1] == '.' || name[size-1] == '/' {
		return Token{}, d.newSyntaxError("invalid type URL/extension field name: %s",
			d.orig[startPos:len(d.orig)-len(s)])
	}

	d.in = s
	endPos := len(d.orig) - len(d.in)
	d.consume(0)

	return Token{
		kind:  Name,
		attrs: uint8(TypeName),
		pos:   startPos,
		raw:   d.orig[startPos:endPos],
		str:   string(name),
	}, nil
}

func isTypeNameChar(b byte) bool {
	return (b == '-' || b == '_' ||
		('0' <= b && b <= '9') ||
		('a' <= b && b <= 'z') ||
		('A' <= b && b <= 'Z'))
}

func isWhiteSpace(b byte) bool {
	switch b {
	case ' ', '\n', '\r', '\t':
		return true
	default:
		return false
	}
}

// parseIdent parses an unquoted proto identifier and returns size.
// If allowNeg is true, it allows '-' to be the first character in the
// identifier. This is used when parsing literal values like -infinity, etc.
// Regular expression matches an identifier: `^[_a-zA-Z][_a-zA-Z0-9]*`
func parseIdent(input []byte, allowNeg bool) int {
	var size int

	s := input
	if len(s) == 0 {
		return 0
	}

	if allowNeg && s[0] == '-' {
		s = s[1:]
		size++
		if len(s) == 0 {
			return 0
		}
	}

	switch {
	case s[0] == '_', 'a' <= s[0] && s[0] <= 'z', 'A' <= s[0] && s[0] <= 'Z':
		s = s[1:]
		size++
	default:
		return 0
	}

	for len(s) > 0 && (s[0] == '_' ||
		'a' <= s[0] && s[0] <= 'z' ||
		'A' <= s[0] && s[0] <= 'Z' ||
		'0' <= s[0] && s[0] <= '9') {
		s = s[1:]
		size++
	}

	if len(s) > 0 && !isDelim(s[0]) {
		return 0
	}

	return size
}

// parseScalar parses for a string, literal or number value.
func (d *Decoder) parseScalar() (Token, error) {
	if d.in[0] == '"' || d.in[0] == '\'' {
		return d.parseStringValue()
	}

	if tok, ok := d.parseLiteralValue(); ok {
		return tok, nil
	}

	if tok, ok := d.parseNumberValue(); ok {
		return tok, nil
	}

	return Token{}, d.newSyntaxError("invalid scalar value: %s", errRegexp.Find(d.in))
}

// parseLiteralValue parses a literal value. A literal value is used for
// bools, special floats and enums. This function simply identifies that the
// field value is a literal.
func (d *Decoder) parseLiteralValue() (Token, bool) {
	size := parseIdent(d.in, true)
	if size == 0 {
		return Token{}, false
	}
	return d.consumeToken(Scalar, size, literalValue), true
}

// consumeToken constructs a Token for given Kind from d.in and consumes given
// size-length from it.
func (d *Decoder) consumeToken(kind Kind, size int, attrs uint8) Token {
	// Important to compute raw and pos before consuming.
	tok := Token{
		kind:  kind,
		attrs: attrs,
		pos:   len(d.orig) - len(d.in),
		raw:   d.in[:size],
	}
	d.consume(size)
	return tok
}

// newSyntaxError returns a syntax error with line and column information for
// current position.
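// The reported position is derived from the unconsumed input, so it points at
// the character or token that triggered the error.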
func (d *Decoder) newSyntaxError(f string, x ...interface{}) error {
	e := errors.New(f, x...)
	line, column := d.Position(len(d.orig) - len(d.in))
	return errors.New("syntax error (line %d:%d): %v", line, column, e)
}

// Position returns the line and column number of the given index in the
// original input. It will panic if index is out of range.
func (d *Decoder) Position(idx int) (line int, column int) {
	b := d.orig[:idx]
	line = bytes.Count(b, []byte("\n")) + 1
	if i := bytes.LastIndexByte(b, '\n'); i >= 0 {
		b = b[i+1:]
	}
	column = utf8.RuneCount(b) + 1 // ignore multi-rune characters
	return line, column
}

func (d *Decoder) tryConsumeChar(c byte) bool {
	if len(d.in) > 0 && d.in[0] == c {
		d.consume(1)
		return true
	}
	return false
}

// consume consumes n bytes of input and any subsequent whitespace or comments.
func (d *Decoder) consume(n int) {
	d.in = consume(d.in, n)
}

// consume consumes n bytes of input and any subsequent whitespace or comments.
func consume(b []byte, n int) []byte {
	b = b[n:]
	for len(b) > 0 {
		switch b[0] {
		case ' ', '\n', '\r', '\t':
			b = b[1:]
		case '#':
			if i := bytes.IndexByte(b, '\n'); i >= 0 {
				b = b[i+len("\n"):]
			} else {
				b = nil
			}
		default:
			return b
		}
	}
	return b
}

// errRegexp matches any sequence that looks like a non-delimiter (for error reporting).
var errRegexp = regexp.MustCompile(`^([-+._a-zA-Z0-9\/]+|.)`)

// isDelim returns true if the given byte is a delimiter character.
func isDelim(c byte) bool {
	return !(c == '-' || c == '+' || c == '.' || c == '_' ||
		('a' <= c && c <= 'z') ||
		('A' <= c && c <= 'Z') ||
		('0' <= c && c <= '9'))
}
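// A minimal usage sketch, for illustration only: the textproto input below is
// hypothetical, and it assumes the Token accessors Kind and RawString provided
// by this package's Token type.
//
//	d := NewDecoder([]byte(`name: "foo" nested { id: 1 }`))
//	for {
//		tok, err := d.Read()
//		if err != nil {
//			// Handle a syntax error or ErrUnexpectedEOF.
//			break
//		}
//		if tok.Kind() == EOF {
//			break
//		}
//		_ = tok.RawString() // raw text of the token, e.g. `name` or `"foo"`
//	}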