1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-15 10:11:16 +00:00

cleaned up code and add comments

This commit is contained in:
ericchiang 2014-09-01 14:50:10 -04:00
parent 957fc30cc1
commit 825f458e22
4 changed files with 46 additions and 31 deletions

View File

@ -13,8 +13,7 @@ fast and flexible way of exploring HTML from the terminal.
## Examples
Download a webpage with `wget`. _Please exercise restraint when using any
automated request tool._
Download a webpage with `wget`.
```bash
$ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html

27
main.go
View File

@ -6,12 +6,11 @@ import (
"github.com/ericchiang/pup/selector"
"io"
"os"
"regexp"
"strconv"
"strings"
)
const VERSION = "0.1.0"
const VERSION string = "0.1.0"
var (
// Flags
@ -20,20 +19,17 @@ var (
maxPrintLevel int = -1
printNumber bool = false
printColor bool = false
// Helpers
whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`)
preWhitespace *regexp.Regexp = regexp.MustCompile(`^\s+`)
postWhitespace *regexp.Regexp = regexp.MustCompile(`\s+$`)
)
// Fatal reports a formatted message on stderr, terminated by a newline,
// and then ends the process with exit status 1.
//
// format and args follow the fmt.Printf conventions.
func Fatal(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	// Fprintln appends the trailing newline that ends the message.
	fmt.Fprintln(os.Stderr, msg)
	os.Exit(1)
}
func printHelp() {
// Print help to stderr and quit
func PrintHelp() {
helpString := `Usage
pup [list of css selectors]
@ -54,7 +50,8 @@ Flags
Fatal(helpString, VERSION)
}
func processFlags(cmds []string) []string {
// Process command arguments and return all non-flags.
func ProcessFlags(cmds []string) []string {
var i int
var err error
defer func() {
@ -77,7 +74,7 @@ func processFlags(cmds []string) []string {
}
i++
case "-h", "--help":
printHelp()
PrintHelp()
os.Exit(1)
case "-i", "--indent":
indentLevel, err := strconv.Atoi(cmds[i+1])
@ -109,8 +106,9 @@ func processFlags(cmds []string) []string {
return nonFlagCmds[:n]
}
// pup
func main() {
cmds := processFlags(os.Args[1:])
cmds := ProcessFlags(os.Args[1:])
root, err := html.Parse(inputStream)
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
@ -121,9 +119,9 @@ func main() {
PrintNode(root, 0)
os.Exit(0)
}
selectors := make([]selector.Selector, len(cmds))
selectors := make([]*selector.Selector, len(cmds))
for i, cmd := range cmds {
selectors[i], err = selector.ParseSelector(cmd)
selectors[i], err = selector.NewSelector(cmd)
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
os.Exit(2)
@ -134,7 +132,8 @@ func main() {
for _, selector := range selectors {
selected = []*html.Node{}
for _, node := range currNodes {
selected = append(selected, selector.FindAllChildren(node)...)
selected = append(selected,
selector.FindAllChildren(node)...)
}
currNodes = selected
}

View File

@ -5,13 +5,20 @@ import (
"code.google.com/p/go.net/html/atom"
"fmt"
"github.com/fatih/color"
"regexp"
)
var (
	// Colors, each built with the color package.
	tagColor     = color.New(color.FgYellow).Add(color.Bold) // bold yellow
	tokenColor   = color.New(color.FgCyan).Add(color.Bold)   // bold cyan
	attrKeyColor = color.New(color.FgRed)                    // red
	quoteColor   = color.New(color.FgBlue)                   // blue

	// Regexp helpers, compiled once when the package is initialized.
	whitespaceRegexp = regexp.MustCompile(`^\s*$`) // empty or whitespace-only text
	preWhitespace    = regexp.MustCompile(`^\s+`)  // leading run of whitespace
	postWhitespace   = regexp.MustCompile(`\s+$`)  // trailing run of whitespace
)
func printIndent(level int) {
@ -48,6 +55,7 @@ func printChildren(n *html.Node, level int) {
}
}
// Print a node and all of its children to `maxlevel`.
func PrintNode(n *html.Node, level int) {
switch n.Type {
case html.TextNode:

View File

@ -7,20 +7,22 @@ import (
"strings"
)
// Selector holds the compiled patterns that make up one CSS selector.
//
// Field order matters: the struct is built with a positional literal
// elsewhere in this file, so fields must not be reordered.
type Selector struct {
	// Class, ID and Name are nil when the selector does not constrain them.
	// NOTE(review): presumably matched against an element's class, id and
	// tag name respectively; the matching code is not visible here.
	Class *regexp.Regexp
	ID    *regexp.Regexp
	Name  *regexp.Regexp
	// Attrs maps a string key to the pattern associated with it.
	Attrs map[string]*regexp.Regexp
}
type SelectorType string
type selectorField string
const (
Class SelectorType = "class"
ID SelectorType = "id"
Name SelectorType = "name"
Class selectorField = "class"
ID selectorField = "id"
Name selectorField = "name"
)
func setTypeValue(s *Selector, a SelectorType, v string) error {
// Set a field of this selector.
func (s *Selector) setFieldValue(a selectorField, v string) error {
if v == "" {
return nil
}
@ -41,7 +43,8 @@ func setTypeValue(s *Selector, a SelectorType, v string) error {
return nil
}
func ParseSelector(s string) (Selector, error) {
// Convert a string to a selector.
func NewSelector(s string) (*Selector, error) {
attrs := map[string]*regexp.Regexp{}
selector := &Selector{nil, nil, nil, attrs}
nextAttr := Name
@ -49,26 +52,30 @@ func ParseSelector(s string) (Selector, error) {
for i, c := range s {
switch c {
case '.':
err := setTypeValue(selector, nextAttr, s[start:i])
err := selector.setFieldValue(nextAttr, s[start:i])
if err != nil {
return *selector, err
return selector, err
}
nextAttr = Class
start = i + 1
case '#':
err := setTypeValue(selector, nextAttr, s[start:i])
err := selector.setFieldValue(nextAttr, s[start:i])
if err != nil {
return *selector, err
return selector, err
}
nextAttr = ID
start = i + 1
}
}
setTypeValue(selector, nextAttr, s[start:])
return *selector, nil
err := selector.setFieldValue(nextAttr, s[start:])
if err != nil {
return selector, err
}
return selector, nil
}
func (sel Selector) FindAllChildren(node *html.Node) []*html.Node {
// Find all nodes which match a selector.
func (sel *Selector) FindAllChildren(node *html.Node) []*html.Node {
selected := []*html.Node{}
child := node.FirstChild
for child != nil {
@ -79,7 +86,8 @@ func (sel Selector) FindAllChildren(node *html.Node) []*html.Node {
return selected
}
func (sel Selector) FindAll(node *html.Node) []*html.Node {
// Find all nodes which match a selector. May return itself.
func (sel *Selector) FindAll(node *html.Node) []*html.Node {
selected := []*html.Node{}
if sel.Match(node) {
return []*html.Node{node}
@ -93,7 +101,8 @@ func (sel Selector) FindAll(node *html.Node) []*html.Node {
return selected
}
func (sel Selector) Match(node *html.Node) bool {
// Does this selector match a given node?
func (sel *Selector) Match(node *html.Node) bool {
if node.Type != html.ElementNode {
return false
}