text function added

pull/7/head
ericchiang 10 years ago
parent 6915c6abb9
commit 1c07121c07

@ -31,7 +31,7 @@ Download a webpage with `wget`.
$ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html $ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html
``` ```
###Clean and indent ####Clean and indent
By default `pup` will fill in missing tags and properly indent the page. By default `pup` will fill in missing tags and properly indent the page.
@ -42,7 +42,7 @@ $ cat robots.html | pup --color
# cleaned, indented, and colorful HTML # cleaned, indented, and colorful HTML
``` ```
###Filter by tag ####Filter by tag
```bash ```bash
$ pup < robots.html title $ pup < robots.html title
<title> <title>
@ -50,7 +50,7 @@ $ pup < robots.html title
</title> </title>
``` ```
###Filter by id ####Filter by id
```bash ```bash
$ pup < robots.html span#See_also $ pup < robots.html span#See_also
<span class="mw-headline" id="See_also"> <span class="mw-headline" id="See_also">
@ -58,7 +58,7 @@ $ pup < robots.html span#See_also
</span> </span>
``` ```
###Chain selectors together ####Chain selectors together
The following two commands are equivalent. (NOTE: pipes do not work with the The following two commands are equivalent. (NOTE: pipes do not work with the
`--color` flag) `--color` flag)
@ -86,14 +86,14 @@ Both produce the output:
</a> </a>
``` ```
###How many nodes are selected by a filter? ####How many nodes are selected by a filter?
```bash ```bash
$ pup < robots.html a -n $ pup < robots.html a -n
283 283
``` ```
###Limit print level ####Limit print level
```bash ```bash
$ pup < robots.html table -l 2 $ pup < robots.html table -l 2
@ -134,6 +134,36 @@ You can mix and match selectors as you wish.
cat index.html | pup element#id[attribute=value] cat index.html | pup element#id[attribute=value]
``` ```
## Functions
Non-HTML selectors which affect the output type are implemented as functions
which can be provided as a final argument.
As of now, `text{}` is the only implemented function.
#### `text{}`
Print all text from selected nodes and their children in depth-first order.
```bash
$ cat robots.html | pup .mw-headline text{}
History
About the standard
Disadvantages
Alternatives
Examples
Nonstandard extensions
Crawl-delay directive
Allow directive
Sitemap
Host
Universal "*" match
Meta tags and headers
See also
References
External links
```
## Flags ## Flags
```bash ```bash
@ -148,5 +178,6 @@ cat index.html | pup element#id[attribute=value]
## TODO: ## TODO:
* Print attribute value rather than html ({href}) * Print attribute function `attr{attr1, attr2}`
* Print result as JSON (--json) * Print as json function `json{}`
* Switch `-n` from a flag to a function

@ -0,0 +1,49 @@
package funcs
import (
"code.google.com/p/go.net/html"
"fmt"
"regexp"
)
// Displayer is the interface implemented by display functions. When the final
// command-line argument parses as a display function, the resulting Displayer
// is handed the selected nodes instead of the default printing path.
type Displayer interface {
// Display renders the given nodes; the output format is chosen by the
// implementation.
Display(nodes []*html.Node)
}
// TextDisplayer is the Displayer behind the `text{}` display function. It
// carries no state, so the zero value is ready to use.
type TextDisplayer struct{}
// Display prints the data of every text node found in nodes and in their
// descendants, one per line on standard output. Each node is visited before
// its children, and children are visited in sibling order.
func (t TextDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		if n.Type == html.TextNode {
			fmt.Println(n.Data)
		}

		// Gather the direct children so the recursive call can walk them
		// exactly like the top-level slice.
		var kids []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			kids = append(kids, c)
		}
		t.Display(kids)
	}
}
var (
// Display function helpers
displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`)
textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
attrFuncMatcher = regexp.MustCompile(`^attr\{[^\}]*\}$`)
)
// NewDisplayFunc parses text as a display function such as `text{}` and
// returns the Displayer that implements it.
//
// A non-nil error means text cannot be used as a display function: it does not
// end in a `{...}` group, it names a function that is recognised but not yet
// implemented (`attr{...}`), or it names an unknown function. The returned
// Displayer is nil whenever the error is non-nil.
func NewDisplayFunc(text string) (Displayer, error) {
	// Shape check first: anything that does not end in a brace group is
	// rejected before the per-function patterns are consulted.
	if !displayMatcher.MatchString(text) {
		return nil, fmt.Errorf("not a display function: %q", text)
	}
	switch {
	case textFuncMatcher.MatchString(text):
		return TextDisplayer{}, nil
	case attrFuncMatcher.MatchString(text):
		// Recognised so it can be reported precisely, but nothing implements
		// it yet.
		return nil, fmt.Errorf("display function attr{} is not implemented: %q", text)
	default:
		return nil, fmt.Errorf("unknown display function: %q", text)
	}
}

@ -3,6 +3,7 @@ package main
import ( import (
"code.google.com/p/go.net/html" "code.google.com/p/go.net/html"
"fmt" "fmt"
"github.com/ericchiang/pup/funcs"
"github.com/ericchiang/pup/selector" "github.com/ericchiang/pup/selector"
"io" "io"
"os" "os"
@ -14,11 +15,13 @@ const VERSION string = "0.1.0"
var ( var (
// Flags // Flags
inputStream io.ReadCloser = os.Stdin attributes []string = []string{}
indentString string = " " inputStream io.ReadCloser = os.Stdin
maxPrintLevel int = -1 indentString string = " "
printNumber bool = false maxPrintLevel int = -1
printColor bool = false printNumber bool = false
printColor bool = false
displayer funcs.Displayer = nil
) )
// Print to stderr and exit // Print to stderr and exit
@ -64,6 +67,9 @@ func ProcessFlags(cmds []string) []string {
for i = 0; i < len(cmds); i++ { for i = 0; i < len(cmds); i++ {
cmd := cmds[i] cmd := cmds[i]
switch cmd { switch cmd {
case "-a", "--attr":
attributes = append(attributes, cmds[i+1])
i++
case "-c", "--color": case "-c", "--color":
printColor = true printColor = true
case "-f", "--file": case "-f", "--file":
@ -121,6 +127,14 @@ func main() {
} }
selectors := make([]*selector.Selector, len(cmds)) selectors := make([]*selector.Selector, len(cmds))
for i, cmd := range cmds { for i, cmd := range cmds {
if i+1 == len(cmds) {
d, err := funcs.NewDisplayFunc(cmd)
if err == nil {
displayer = d
selectors = selectors[0 : len(cmds)-1]
break
}
}
selectors[i], err = selector.NewSelector(cmd) selectors[i], err = selector.NewSelector(cmd)
if err != nil { if err != nil {
Fatal("Selector parse error: %s", err) Fatal("Selector parse error: %s", err)
@ -136,7 +150,9 @@ func main() {
} }
currNodes = selected currNodes = selected
} }
if printNumber { if displayer != nil {
displayer.Display(currNodes)
} else if printNumber {
fmt.Println(len(currNodes)) fmt.Println(len(currNodes))
} else { } else {
for _, s := range currNodes { for _, s := range currNodes {

@ -17,8 +17,8 @@ var (
// Regexp helpers // Regexp helpers
whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`) whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`)
preWhitespace *regexp.Regexp = regexp.MustCompile(`^\s+`) preWhitespace = regexp.MustCompile(`^\s+`)
postWhitespace *regexp.Regexp = regexp.MustCompile(`\s+$`) postWhitespace = regexp.MustCompile(`\s+$`)
) )
func printIndent(level int) { func printIndent(level int) {

Loading…
Cancel
Save