From 1c07121c07e86c8ec1a06025e74722a577c5a1c7 Mon Sep 17 00:00:00 2001 From: ericchiang Date: Mon, 1 Sep 2014 23:53:12 -0400 Subject: [PATCH] text function added --- README.md | 47 ++++++++++++++++++++++++++++++++++++++-------- funcs/display.go | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ main.go | 28 +++++++++++++++++++++------ printing.go | 4 ++-- 4 files changed, 112 insertions(+), 16 deletions(-) create mode 100644 funcs/display.go diff --git a/README.md b/README.md index 00c6e02..e1cb584 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Download a webpage with `wget`. $ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html ``` -###Clean and indent +####Clean and indent By default `pup` will fill in missing tags and properly indent the page. @@ -42,7 +42,7 @@ $ cat robots.html | pup --color # cleaned, indented, and colorful HTML ``` -###Filter by tag +####Filter by tag ```bash $ pup < robots.html title @@ -50,7 +50,7 @@ $ pup < robots.html title ``` -###Filter by id +####Filter by id ```bash $ pup < robots.html span#See_also @@ -58,7 +58,7 @@ $ pup < robots.html span#See_also ``` -###Chain selectors together +####Chain selectors together The following two commands are equivalent. (NOTE: pipes do not work with the `--color` flag) @@ -86,14 +86,14 @@ Both produce the ouput: ``` -###How many nodes are selected by a filter? +####How many nodes are selected by a filter? ```bash $ pup < robots.html a -n 283 ``` -###Limit print level +####Limit print level ```bash $ pup < robots.html table -l 2 @@ -134,6 +134,36 @@ You can mix and match selectors as you wish. cat index.html | pup element#id[attribute=value] ``` +## Functions + +Non-HTML selectors which affect the output type are implemented as functions +which can be provided as a final argument. + +As of now, `text{}` is the only implemented function. + +#### `text{}` + +Print all text from selected nodes and children in depth-first order. 
+ +```bash +$ cat robots.html | pup .mw-headline text{} +History +About the standard +Disadvantages +Alternatives +Examples +Nonstandard extensions +Crawl-delay directive +Allow directive +Sitemap +Host +Universal "*" match +Meta tags and headers +See also +References +External links +``` + ## Flags ```bash @@ -148,5 +178,6 @@ cat index.html | pup element#id[attribute=value] ## TODO: -* Print attribute value rather than html ({href}) -* Print result as JSON (--json) +* Print attribute function `attr{attr1, attr2}` +* Print as json function `json{}` +* Switch `-n` from a flag to a function diff --git a/funcs/display.go b/funcs/display.go new file mode 100644 index 0000000..053b4b6 --- /dev/null +++ b/funcs/display.go @@ -0,0 +1,49 @@ +package funcs + +import ( + "code.google.com/p/go.net/html" + "fmt" + "regexp" +) + +type Displayer interface { + Display(nodes []*html.Node) +} + +type TextDisplayer struct { +} + +func (t TextDisplayer) Display(nodes []*html.Node) { + for _, node := range nodes { + if node.Type == html.TextNode { + fmt.Println(node.Data) + } + children := []*html.Node{} + child := node.FirstChild + for child != nil { + children = append(children, child) + child = child.NextSibling + } + t.Display(children) + } +} + +var ( + // Display function helpers + displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`) + textFuncMatcher = regexp.MustCompile(`^text\{\}$`) + attrFuncMatcher = regexp.MustCompile(`^attr\{[^\}]*\}$`) +) + +func NewDisplayFunc(text string) (Displayer, error) { + if !displayMatcher.MatchString(text) { + return nil, fmt.Errorf("Not a display function") + } + switch { + case textFuncMatcher.MatchString(text): + return TextDisplayer{}, nil + case attrFuncMatcher.MatchString(text): + return nil, fmt.Errorf("attr") + } + return nil, fmt.Errorf("Not a display function") +} diff --git a/main.go b/main.go index f5f08a7..7a04e41 100644 --- a/main.go +++ b/main.go @@ -3,6 +3,7 @@ package main import ( "code.google.com/p/go.net/html" 
"fmt" + "github.com/ericchiang/pup/funcs" "github.com/ericchiang/pup/selector" "io" "os" @@ -14,11 +15,13 @@ const VERSION string = "0.1.0" var ( // Flags - inputStream io.ReadCloser = os.Stdin - indentString string = " " - maxPrintLevel int = -1 - printNumber bool = false - printColor bool = false + attributes []string = []string{} + inputStream io.ReadCloser = os.Stdin + indentString string = " " + maxPrintLevel int = -1 + printNumber bool = false + printColor bool = false + displayer funcs.Displayer = nil ) // Print to stderr and exit @@ -64,6 +67,9 @@ func ProcessFlags(cmds []string) []string { for i = 0; i < len(cmds); i++ { cmd := cmds[i] switch cmd { + case "-a", "--attr": + attributes = append(attributes, cmds[i+1]) + i++ case "-c", "--color": printColor = true case "-f", "--file": @@ -121,6 +127,14 @@ func main() { } selectors := make([]*selector.Selector, len(cmds)) for i, cmd := range cmds { + if i+1 == len(cmds) { + d, err := funcs.NewDisplayFunc(cmd) + if err == nil { + displayer = d + selectors = selectors[0 : len(cmds)-1] + break + } + } selectors[i], err = selector.NewSelector(cmd) if err != nil { Fatal("Selector parse error: %s", err) @@ -136,7 +150,9 @@ func main() { } currNodes = selected } - if printNumber { + if displayer != nil { + displayer.Display(currNodes) + } else if printNumber { fmt.Println(len(currNodes)) } else { for _, s := range currNodes { diff --git a/printing.go b/printing.go index 0d4cb70..37a9dee 100644 --- a/printing.go +++ b/printing.go @@ -17,8 +17,8 @@ var ( // Regexp helpers whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`) - preWhitespace *regexp.Regexp = regexp.MustCompile(`^\s+`) - postWhitespace *regexp.Regexp = regexp.MustCompile(`\s+$`) + preWhitespace = regexp.MustCompile(`^\s+`) + postWhitespace = regexp.MustCompile(`\s+$`) ) func printIndent(level int) {