Parsing HTML at the command line
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

334 lines
7.5 KiB

package main
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"github.com/fatih/color"
colorable "github.com/mattn/go-colorable"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// init points the color package's output at the stdout writer provided by
// go-colorable, so every colored print in this file goes through it.
func init() {
	stdout := colorable.NewColorableStdout()
	color.Output = stdout
}
// Displayer is the common interface of all output modes. Display is handed
// the slice of HTML nodes that should be written out.
type Displayer interface {
	Display(nodes []*html.Node)
}
// attrDisplayerRe matches a complete `attr{NAME}` display command and captures
// NAME. It is compiled once at package scope instead of on every call.
var attrDisplayerRe = regexp.MustCompile(`^attr\{([a-zA-Z\-]+)\}$`)

// ParseDisplayer interprets cmd as a display function and, on success, stores
// the matching Displayer in pupDisplayer.
//
// Recognized forms are `text{}`, `json{}` and `attr{NAME}`, where NAME is made
// of ASCII letters and dashes. The whole of cmd has to be the display
// function: an `attr{...}` fragment embedded in a longer string is rejected,
// just as `text{}` and `json{}` only match exactly. For any other input
// pupDisplayer is left untouched and an error is returned.
func ParseDisplayer(cmd string) error {
	switch {
	case cmd == "text{}":
		pupDisplayer = TextDisplayer{}
	case cmd == "json{}":
		pupDisplayer = JSONDisplayer{}
	default:
		match := attrDisplayerRe.FindStringSubmatch(cmd)
		if match == nil {
			return fmt.Errorf("Unknown displayer")
		}
		// match[1] is the captured attribute name.
		pupDisplayer = AttrDisplayer{Attr: match[1]}
	}
	return nil
}
// isVoidElement reports whether n is a void element, i.e. a tag such as
// <meta> or <br> that is written without an end tag.
// See http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
func isVoidElement(n *html.Node) bool {
	voidAtoms := [...]atom.Atom{
		atom.Area, atom.Base, atom.Br, atom.Col,
		atom.Command, atom.Embed, atom.Hr, atom.Img,
		atom.Input, atom.Keygen, atom.Link, atom.Meta,
		atom.Param, atom.Source, atom.Track, atom.Wbr,
	}
	for _, a := range voidAtoms {
		if n.DataAtom == a {
			return true
		}
	}
	return false
}
// Colors used by TreeDisplayer when colored output is enabled.
var (
	tagColor     = color.New(color.FgCyan)    // element names
	tokenColor   = color.New(color.FgCyan)    // markup punctuation: <, </, >, =
	attrKeyColor = color.New(color.FgMagenta) // attribute names
	quoteColor   = color.New(color.FgBlue)    // quoted attribute values
	commentColor = color.New(color.FgYellow)  // comments
)
// TreeDisplayer prints nodes as an indented HTML tree.
type TreeDisplayer struct{}

// Display prints every node in nodes, each one starting at indent level 0.
func (t TreeDisplayer) Display(nodes []*html.Node) {
	for i := range nodes {
		t.printNode(nodes[i], 0)
	}
}
// printPre writes n and its descendants without adding any indentation or
// line breaks. The <pre> tag indicates that the text within it should always
// be formatted as is. See https://github.com/ericchiang/pup/issues/33
//
// Text and attribute values are HTML-escaped when pupEscapeHTML is set, except
// for text whose parent is a <script> element. Comments are emitted inline:
// no newline is appended after "-->", because inside preformatted content an
// added newline would change what is displayed.
func (t TreeDisplayer) printPre(n *html.Node) {
	// printKids recurses into every child of n, in document order.
	printKids := func() {
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			t.printPre(c)
		}
	}

	switch n.Type {
	case html.TextNode:
		s := n.Data
		// don't escape javascript
		if pupEscapeHTML && (n.Parent == nil || n.Parent.DataAtom != atom.Script) {
			s = html.EscapeString(s)
		}
		fmt.Print(s)
		printKids()
	case html.ElementNode:
		fmt.Printf("<%s", n.Data)
		for _, a := range n.Attr {
			val := a.Val
			if pupEscapeHTML {
				val = html.EscapeString(val)
			}
			fmt.Printf(` %s="%s"`, a.Key, val)
		}
		fmt.Print(">")
		// Void elements have neither content nor an end tag.
		if !isVoidElement(n) {
			printKids()
			fmt.Printf("</%s>", n.Data)
		}
	case html.CommentNode:
		data := n.Data
		if pupEscapeHTML {
			data = html.EscapeString(data)
		}
		fmt.Printf("<!--%s-->", data)
		printKids()
	case html.DoctypeNode, html.DocumentNode:
		printKids()
	}
}
// printNode prints n and all of its children, indented to the given level.
// How deep the children are followed is decided by printChildren.
//
// Text nodes are trimmed and skipped when empty. Elements are printed as an
// opening tag line, their children one level deeper, and a closing tag line;
// void elements get only the opening tag. A <pre> element is handed to
// printPre when pupPreformatted is set and color output is off. Comments are
// printed on their own line. Doctype and document nodes print only their
// children. With pupPrintColor set, the pieces of markup are written through
// the package color printers.
func (t TreeDisplayer) printNode(n *html.Node, level int) {
	switch n.Type {
	case html.DoctypeNode, html.DocumentNode:
		t.printChildren(n, level)

	case html.CommentNode:
		t.printIndent(level)
		body := n.Data
		if pupEscapeHTML {
			body = html.EscapeString(body)
		}
		if !pupPrintColor {
			fmt.Printf("<!--%s-->\n", body)
		} else {
			commentColor.Printf("<!--%s-->\n", body)
		}
		t.printChildren(n, level)

	case html.TextNode:
		text := n.Data
		// Text directly inside <script> is JavaScript and is never escaped.
		inScript := n.Parent != nil && n.Parent.DataAtom == atom.Script
		if pupEscapeHTML && !inScript {
			text = html.EscapeString(text)
		}
		if text = strings.TrimSpace(text); text == "" {
			return
		}
		t.printIndent(level)
		fmt.Println(text)

	case html.ElementNode:
		t.printIndent(level)

		// TODO: allow pre with color
		if n.DataAtom == atom.Pre && pupPreformatted && !pupPrintColor {
			t.printPre(n)
			fmt.Println()
			return
		}

		// Opening tag: "<name", then the attributes, then ">".
		if !pupPrintColor {
			fmt.Printf("<%s", n.Data)
		} else {
			tokenColor.Print("<")
			tagColor.Printf("%s", n.Data)
		}
		for i := range n.Attr {
			key, value := n.Attr[i].Key, n.Attr[i].Val
			if pupEscapeHTML {
				value = html.EscapeString(value)
			}
			if !pupPrintColor {
				fmt.Printf(` %s="%s"`, key, value)
				continue
			}
			fmt.Print(" ")
			attrKeyColor.Printf("%s", key)
			tokenColor.Print("=")
			quoteColor.Printf(`"%s"`, value)
		}
		if !pupPrintColor {
			fmt.Println(">")
		} else {
			tokenColor.Println(">")
		}

		// Void elements have no children and no closing tag.
		if isVoidElement(n) {
			return
		}

		t.printChildren(n, level+1)
		t.printIndent(level)
		if !pupPrintColor {
			fmt.Printf("</%s>\n", n.Data)
		} else {
			tokenColor.Print("</")
			tagColor.Printf("%s", n.Data)
			tokenColor.Println(">")
		}
	}
}
// printChildren prints every child of n at the given indent level. When
// pupMaxPrintLevel is non-negative and level has reached it, the children are
// replaced by a single indented "..." line.
func (t TreeDisplayer) printChildren(n *html.Node, level int) {
	if pupMaxPrintLevel > -1 && level >= pupMaxPrintLevel {
		t.printIndent(level)
		fmt.Println("...")
		return
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		t.printNode(c, level)
	}
}
// printIndent writes pupIndentString level times; a level of zero or less
// writes nothing.
func (t TreeDisplayer) printIndent(level int) {
	for i := 0; i < level; i++ {
		fmt.Print(pupIndentString)
	}
}
// TextDisplayer prints the text of nodes.
type TextDisplayer struct{}

// Display walks each node depth-first. Every text node it meets is printed on
// its own line, before that node's children are visited. When pupEscapeHTML
// is set the text is HTML-escaped, unless its parent is a <script> element.
func (t TextDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		if n.Type == html.TextNode {
			text := n.Data
			// don't escape javascript
			inScript := n.Parent != nil && n.Parent.DataAtom == atom.Script
			if pupEscapeHTML && !inScript {
				text = html.EscapeString(text)
			}
			fmt.Println(text)
		}

		// Gather the direct children and recurse into them as a batch.
		var kids []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			kids = append(kids, c)
		}
		t.Display(kids)
	}
}
// AttrDisplayer prints the value of a single attribute of each node.
type AttrDisplayer struct {
	// Attr is the attribute key whose values are printed.
	Attr string
}

// Display prints, one per line, the value of every attribute on the given
// nodes whose key equals a.Attr. Only the nodes themselves are inspected, not
// their descendants. Values are HTML-escaped when pupEscapeHTML is set.
func (a AttrDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		for i := range n.Attr {
			if n.Attr[i].Key != a.Attr {
				continue
			}
			value := n.Attr[i].Val
			if pupEscapeHTML {
				value = html.EscapeString(value)
			}
			fmt.Println(value)
		}
	}
}
// JSONDisplayer prints nodes as a JSON list.
type JSONDisplayer struct{}
// jsonify converts node and its subtree into a map suitable for
// encoding/json.
//
// The map holds one entry per attribute (key -> value), plus:
//   - "tag": the element name,
//   - "text": the trimmed, non-empty text children joined by single spaces,
//   - "comment": the trimmed comment children joined by single spaces,
//   - "children": a list with the jsonified element children, present only
//     when there is at least one.
//
// These entries share the map with the attributes: "tag" and "children"
// overwrite an attribute of the same name, while text and comments are
// appended to an existing "text" or "comment" value.
//
// Attribute values, text and comments are HTML-escaped when pupEscapeHTML is
// set, except for text directly inside a <script> element.
func jsonify(node *html.Node) map[string]interface{} {
	vals := map[string]interface{}{}
	for _, attr := range node.Attr {
		if pupEscapeHTML {
			vals[attr.Key] = html.EscapeString(attr.Val)
		} else {
			vals[attr.Key] = attr.Val
		}
	}

	// DataAtom is zero for tag names that are not known atoms (for example
	// custom elements), and its String() is then empty. Fall back to the raw
	// tag name so such elements do not get an empty "tag".
	tag := node.DataAtom.String()
	if node.DataAtom == 0 && node.Type == html.ElementNode {
		tag = node.Data
	}
	vals["tag"] = tag

	children := []interface{}{}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.ElementNode:
			children = append(children, jsonify(child))
		case html.TextNode:
			text := strings.TrimSpace(child.Data)
			if text != "" {
				// don't escape javascript
				if pupEscapeHTML && node.DataAtom != atom.Script {
					text = html.EscapeString(text)
				}
				// if there is already text we'll append it
				if currText, ok := vals["text"]; ok {
					text = fmt.Sprintf("%s %s", currText, text)
				}
				vals["text"] = text
			}
		case html.CommentNode:
			comment := strings.TrimSpace(child.Data)
			if pupEscapeHTML {
				comment = html.EscapeString(comment)
			}
			if currComment, ok := vals["comment"]; ok {
				comment = fmt.Sprintf("%s %s", currComment, comment)
			}
			vals["comment"] = comment
		}
	}
	if len(children) > 0 {
		vals["children"] = children
	}
	return vals
}
// Display converts every node with jsonify and prints the resulting list as
// indented JSON followed by a newline, using pupIndentString as the indent
// unit. It panics if the list cannot be marshalled.
func (j JSONDisplayer) Display(nodes []*html.Node) {
	jsonNodes := make([]map[string]interface{}, 0, len(nodes))
	for i := range nodes {
		jsonNodes = append(jsonNodes, jsonify(nodes[i]))
	}

	out, err := json.MarshalIndent(&jsonNodes, "", pupIndentString)
	if err != nil {
		panic("Could not jsonify nodes")
	}
	fmt.Printf("%s\n", out)
}
// NumDisplayer prints how many nodes it was given.
type NumDisplayer struct{}

// Display prints the length of nodes on a line by itself.
func (d NumDisplayer) Display(nodes []*html.Node) {
	count := len(nodes)
	fmt.Println(count)
}