route/vendor/gopkg.in/xmlpath.v2/parser.go

397 lines
8.4 KiB
Go

package xmlpath
import (
"golang.org/x/net/html"
"encoding/xml"
"io"
"strings"
)
// Node is an item in an xml tree that was compiled to
// be processed via xml paths. A node may represent:
//
// - An element in the xml document (<body>)
// - An attribute of an element in the xml document (href="...")
// - A comment in the xml document (<!--...-->)
// - A processing instruction in the xml document (<?...?>)
// - Some text within the xml document
//
type Node struct {
kind nodeKind
name xml.Name
attr string
text []byte
nodes []Node
pos int
end int
up *Node
down []*Node
}
type nodeKind int
const (
anyNode nodeKind = iota
startNode
endNode
attrNode
textNode
commentNode
procInstNode
)
// String returns the string value of node.
//
// The string value of a node is:
//
// - For element nodes, the concatenation of all text nodes within the element.
// - For text nodes, the text itself.
// - For attribute nodes, the attribute value.
// - For comment nodes, the text within the comment delimiters.
// - For processing instruction nodes, the content of the instruction.
//
func (node *Node) String() string {
if node.kind == attrNode {
return node.attr
}
return string(node.Bytes())
}
// Bytes returns the string value of node as a byte slice.
// See Node.String for a description of what the string value of a node is.
func (node *Node) Bytes() []byte {
if node.kind == attrNode {
return []byte(node.attr)
}
if node.kind != startNode {
return node.text
}
size := 0
for i := node.pos; i < node.end; i++ {
if node.nodes[i].kind == textNode {
size += len(node.nodes[i].text)
}
}
text := make([]byte, 0, size)
for i := node.pos; i < node.end; i++ {
if node.nodes[i].kind == textNode {
text = append(text, node.nodes[i].text...)
}
}
return text
}
// equals returns whether the string value of node is equal to s,
// without allocating memory.
func (node *Node) equals(s string) bool {
if node.kind == attrNode {
return s == node.attr
}
if node.kind != startNode {
if len(s) != len(node.text) {
return false
}
for i := range s {
if s[i] != node.text[i] {
return false
}
}
return true
}
si := 0
for i := node.pos; i < node.end; i++ {
if node.nodes[i].kind == textNode {
for _, c := range node.nodes[i].text {
if si > len(s) {
return false
}
if s[si] != c {
return false
}
si++
}
}
}
return si == len(s)
}
// contains returns whether the string value of node contains s,
// without allocating memory.
func (node *Node) contains(s string) (ok bool) {
if len(s) == 0 {
return true
}
if node.kind == attrNode {
return strings.Contains(node.attr, s)
}
s0 := s[0]
for i := node.pos; i < node.end; i++ {
if node.nodes[i].kind == textNode {
text := node.nodes[i].text
NextTry:
for ci, c := range text {
if c != s0 {
continue
}
si := 1
for ci++; ci < len(text) && si < len(s); ci++ {
if s[si] != text[ci] {
continue NextTry
}
si++
}
if si == len(s) {
return true
}
for j := i + 1; j < node.end; j++ {
if node.nodes[j].kind == textNode {
for _, c := range node.nodes[j].text {
if s[si] != c {
continue NextTry
}
si++
if si == len(s) {
return true
}
}
}
}
}
}
}
return false
}
// Parse reads an xml document from r, parses it, and returns its root node.
func Parse(r io.Reader) (*Node, error) {
return ParseDecoder(xml.NewDecoder(r))
}
// ParseDecoder parses the xml document being decoded by d and returns
// its root node.
func ParseDecoder(d *xml.Decoder) (*Node, error) {
var nodes []Node
var text []byte
// The root node.
nodes = append(nodes, Node{kind: startNode})
for {
t, err := d.Token()
if err == io.EOF {
break
}
if err != nil {
return nil, err
}
switch t := t.(type) {
case xml.EndElement:
nodes = append(nodes, Node{
kind: endNode,
})
case xml.StartElement:
nodes = append(nodes, Node{
kind: startNode,
name: t.Name,
})
for _, attr := range t.Attr {
nodes = append(nodes, Node{
kind: attrNode,
name: attr.Name,
attr: attr.Value,
})
}
case xml.CharData:
texti := len(text)
text = append(text, t...)
nodes = append(nodes, Node{
kind: textNode,
text: text[texti : texti+len(t)],
})
case xml.Comment:
texti := len(text)
text = append(text, t...)
nodes = append(nodes, Node{
kind: commentNode,
text: text[texti : texti+len(t)],
})
case xml.ProcInst:
texti := len(text)
text = append(text, t.Inst...)
nodes = append(nodes, Node{
kind: procInstNode,
name: xml.Name{Local: t.Target},
text: text[texti : texti+len(t.Inst)],
})
}
}
// Close the root node.
nodes = append(nodes, Node{kind: endNode})
stack := make([]*Node, 0, len(nodes))
downs := make([]*Node, len(nodes))
downCount := 0
for pos := range nodes {
switch nodes[pos].kind {
case startNode, attrNode, textNode, commentNode, procInstNode:
node := &nodes[pos]
node.nodes = nodes
node.pos = pos
if len(stack) > 0 {
node.up = stack[len(stack)-1]
}
if node.kind == startNode {
stack = append(stack, node)
} else {
node.end = pos + 1
}
case endNode:
node := stack[len(stack)-1]
node.end = pos
stack = stack[:len(stack)-1]
// Compute downs. Doing that here is what enables the
// use of a slice of a contiguous pre-allocated block.
node.down = downs[downCount:downCount]
for i := node.pos + 1; i < node.end; i++ {
if nodes[i].up == node {
switch nodes[i].kind {
case startNode, textNode, commentNode, procInstNode:
node.down = append(node.down, &nodes[i])
downCount++
}
}
}
if len(stack) == 0 {
return node, nil
}
}
}
return nil, io.EOF
}
// ParseHTML reads an HTML document from r, parses it using a proper HTML
// parser, and returns its root node.
//
// The document will be processed as a properly structured HTML document,
// emulating the behavior of a browser when processing it. This includes
// putting the content inside proper <html> and <body> tags, if the
// provided text misses them.
func ParseHTML(r io.Reader) (*Node, error) {
ns, err := html.ParseFragment(r, nil)
if err != nil {
return nil, err
}
var nodes []Node
var text []byte
n := ns[0]
// The root node.
nodes = append(nodes, Node{kind: startNode})
for n != nil {
switch n.Type {
case html.DocumentNode:
case html.ElementNode:
nodes = append(nodes, Node{
kind: startNode,
name: xml.Name{Local: n.Data, Space: n.Namespace},
})
for _, attr := range n.Attr {
nodes = append(nodes, Node{
kind: attrNode,
name: xml.Name{Local: attr.Key, Space: attr.Namespace},
attr: attr.Val,
})
}
case html.TextNode:
texti := len(text)
text = append(text, n.Data...)
nodes = append(nodes, Node{
kind: textNode,
text: text[texti : texti+len(n.Data)],
})
case html.CommentNode:
texti := len(text)
text = append(text, n.Data...)
nodes = append(nodes, Node{
kind: commentNode,
text: text[texti : texti+len(n.Data)],
})
}
if n.FirstChild != nil {
n = n.FirstChild
continue
}
for n != nil {
if n.Type == html.ElementNode {
nodes = append(nodes, Node{kind: endNode})
}
if n.NextSibling != nil {
n = n.NextSibling
break
}
n = n.Parent
}
}
// Close the root node.
nodes = append(nodes, Node{kind: endNode})
stack := make([]*Node, 0, len(nodes))
downs := make([]*Node, len(nodes))
downCount := 0
for pos := range nodes {
switch nodes[pos].kind {
case startNode, attrNode, textNode, commentNode, procInstNode:
node := &nodes[pos]
node.nodes = nodes
node.pos = pos
if len(stack) > 0 {
node.up = stack[len(stack)-1]
}
if node.kind == startNode {
stack = append(stack, node)
} else {
node.end = pos + 1
}
case endNode:
node := stack[len(stack)-1]
node.end = pos
stack = stack[:len(stack)-1]
// Compute downs. Doing that here is what enables the
// use of a slice of a contiguous pre-allocated block.
node.down = downs[downCount:downCount]
for i := node.pos + 1; i < node.end; i++ {
if nodes[i].up == node {
switch nodes[i].kind {
case startNode, textNode, commentNode, procInstNode:
node.down = append(node.down, &nodes[i])
downCount++
}
}
}
if len(stack) == 0 {
return node, nil
}
}
}
return nil, io.EOF
}