Parsing HTML at the command line
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

183 lines
4.3 KiB

package main
import (
"fmt"
"io"
"os"
"strconv"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
var (
pupIn io.ReadCloser = os.Stdin
pupCharset string = ""
pupMaxPrintLevel int = -1
pupPreformatted bool = false
pupPrintColor bool = false
pupEscapeHTML bool = true
pupIndentString string = " "
pupDisplayer Displayer = TreeDisplayer{}
)
// Parse the html while handling the charset
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
var err error
if cs == "" {
// attempt to guess the charset of the HTML document
r, err = charset.NewReader(r, "")
if err != nil {
return nil, err
}
} else {
// let the user specify the charset
e, name := charset.Lookup(cs)
if name == "" {
return nil, fmt.Errorf("'%s' is not a valid charset", cs)
}
r = transform.NewReader(r, e.NewDecoder())
}
return html.Parse(r)
}
func PrintHelp(w io.Writer, exitCode int) {
helpString := `Usage
pup [flags] [selectors] [optional display function]
Version
%s
Flags
-c --color print result with color
-f --file file to read from
-h --help display this help
-i --indent number of spaces to use for indent or character
-n --number print number of elements selected
-l --limit restrict number of levels printed
-p --plain don't escape html
--pre preserve preformatted text
--charset specify the charset for pup to use
--version display version
`
fmt.Fprintf(w, helpString, VERSION)
os.Exit(exitCode)
}
func ParseArgs() ([]string, error) {
cmds, err := ProcessFlags(os.Args[1:])
if err != nil {
return []string{}, err
}
return ParseCommands(strings.Join(cmds, " "))
}
// Process command arguments and return all non-flags.
func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
var i int
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Option '%s' requires an argument", cmds[i])
}
}()
nonFlagCmds = make([]string, len(cmds))
n := 0
for i = 0; i < len(cmds); i++ {
cmd := cmds[i]
switch cmd {
case "-c", "--color":
pupPrintColor = true
case "-p", "--plain":
pupEscapeHTML = false
case "--pre":
pupPreformatted = true
case "-f", "--file":
filename := cmds[i+1]
pupIn, err = os.Open(filename)
if err != nil {
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
os.Exit(2)
}
i++
case "-h", "--help":
PrintHelp(os.Stdout, 0)
case "-i", "--indent":
indentLevel, err := strconv.Atoi(cmds[i+1])
if err == nil {
pupIndentString = strings.Repeat(" ", indentLevel)
} else {
pupIndentString = cmds[i+1]
}
i++
case "-l", "--limit":
pupMaxPrintLevel, err = strconv.Atoi(cmds[i+1])
if err != nil {
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
}
i++
case "--charset":
pupCharset = cmds[i+1]
i++
case "--version":
fmt.Println(VERSION)
os.Exit(0)
case "-n", "--number":
pupDisplayer = NumDisplayer{}
default:
if cmd[0] == '-' {
return []string{}, fmt.Errorf("Unrecognized flag '%s'", cmd)
}
nonFlagCmds[n] = cmds[i]
n++
}
}
return nonFlagCmds[:n], nil
}
// Split a string with awareness for quoted text and commas
func ParseCommands(cmdString string) ([]string, error) {
cmds := []string{}
last, next, max := 0, 0, len(cmdString)
for {
// if we're at the end of the string, return
if next == max {
if next > last {
cmds = append(cmds, cmdString[last:next])
}
return cmds, nil
}
// evaluate a rune
c := cmdString[next]
switch c {
case ' ':
if next > last {
cmds = append(cmds, cmdString[last:next])
}
last = next + 1
case ',':
if next > last {
cmds = append(cmds, cmdString[last:next])
}
cmds = append(cmds, ",")
last = next + 1
case '\'', '"':
// for quotes, consume runes until the quote has ended
quoteChar := c
for {
next++
if next == max {
return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar)
}
if cmdString[next] == '\\' {
next++
if next == max {
return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar)
}
} else if cmdString[next] == quoteChar {
break
}
}
}
next++
}
}