mirror of
https://github.com/ericchiang/pup
synced 2024-11-24 08:58:08 +00:00
text function added
This commit is contained in:
parent
6915c6abb9
commit
1c07121c07
47
README.md
47
README.md
@ -31,7 +31,7 @@ Download a webpage with `wget`.
|
||||
$ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html
|
||||
```
|
||||
|
||||
###Clean and indent
|
||||
####Clean and indent
|
||||
|
||||
By default `pup` will fill in missing tags and properly indent the page.
|
||||
|
||||
@ -42,7 +42,7 @@ $ cat robots.html | pup --color
|
||||
# cleaned, indented, and colorful HTML
|
||||
```
|
||||
|
||||
###Filter by tag
|
||||
####Filter by tag
|
||||
```bash
|
||||
$ pup < robots.html title
|
||||
<title>
|
||||
@ -50,7 +50,7 @@ $ pup < robots.html title
|
||||
</title>
|
||||
```
|
||||
|
||||
###Filter by id
|
||||
####Filter by id
|
||||
```bash
|
||||
$ pup < robots.html span#See_also
|
||||
<span class="mw-headline" id="See_also">
|
||||
@ -58,7 +58,7 @@ $ pup < robots.html span#See_also
|
||||
</span>
|
||||
```
|
||||
|
||||
###Chain selectors together
|
||||
####Chain selectors together
|
||||
|
||||
The following two commands are equivalent. (NOTE: pipes do not work with the
|
||||
`--color` flag)
|
||||
@ -86,14 +86,14 @@ Both produce the output:
|
||||
</a>
|
||||
```
|
||||
|
||||
###How many nodes are selected by a filter?
|
||||
####How many nodes are selected by a filter?
|
||||
|
||||
```bash
|
||||
$ pup < robots.html a -n
|
||||
283
|
||||
```
|
||||
|
||||
###Limit print level
|
||||
####Limit print level
|
||||
|
||||
```bash
|
||||
$ pup < robots.html table -l 2
|
||||
@ -134,6 +134,36 @@ You can mix and match selectors as you wish.
|
||||
cat index.html | pup element#id[attribute=value]
|
||||
```
|
||||
|
||||
## Functions
|
||||
|
||||
Non-HTML selectors which affect the output type are implemented as functions
|
||||
which can be provided as a final argument.
|
||||
|
||||
As of now, `text{}` is the only implemented function.
|
||||
|
||||
#### `text{}`
|
||||
|
||||
Print all text from selected nodes and their children in depth-first order.
|
||||
|
||||
```bash
|
||||
$ cat robots.html | pup .mw-headline text{}
|
||||
History
|
||||
About the standard
|
||||
Disadvantages
|
||||
Alternatives
|
||||
Examples
|
||||
Nonstandard extensions
|
||||
Crawl-delay directive
|
||||
Allow directive
|
||||
Sitemap
|
||||
Host
|
||||
Universal "*" match
|
||||
Meta tags and headers
|
||||
See also
|
||||
References
|
||||
External links
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
```bash
|
||||
@ -148,5 +178,6 @@ cat index.html | pup element#id[attribute=value]
|
||||
|
||||
## TODO:
|
||||
|
||||
* Print attribute value rather than html ({href})
|
||||
* Print result as JSON (--json)
|
||||
* Print attribute function `attr{attr1, attr2}`
|
||||
* Print as json function `json{}`
|
||||
* Switch `-n` from a flag to a function
|
||||
|
49
funcs/display.go
Normal file
49
funcs/display.go
Normal file
@ -0,0 +1,49 @@
|
||||
package funcs
|
||||
|
||||
import (
|
||||
"code.google.com/p/go.net/html"
|
||||
"fmt"
|
||||
"regexp"
|
||||
)
|
||||
|
||||
type Displayer interface {
|
||||
Display(nodes []*html.Node)
|
||||
}
|
||||
|
||||
type TextDisplayer struct {
|
||||
}
|
||||
|
||||
func (t TextDisplayer) Display(nodes []*html.Node) {
|
||||
for _, node := range nodes {
|
||||
if node.Type == html.TextNode {
|
||||
fmt.Println(node.Data)
|
||||
}
|
||||
children := []*html.Node{}
|
||||
child := node.FirstChild
|
||||
for child != nil {
|
||||
children = append(children, child)
|
||||
child = child.NextSibling
|
||||
}
|
||||
t.Display(children)
|
||||
}
|
||||
}
|
||||
|
||||
// Display function helpers.
var (
	// displayMatcher matches any argument that ends in a `{...}` group.
	displayMatcher = regexp.MustCompile(`\{[^\}]*\}$`)

	// textFuncMatcher matches exactly `text{}`.
	textFuncMatcher = regexp.MustCompile(`^text\{\}$`)

	// attrFuncMatcher matches `attr{...}` with any brace-free argument text.
	attrFuncMatcher = regexp.MustCompile(`^attr\{[^\}]*\}$`)
)
|
||||
|
||||
func NewDisplayFunc(text string) (Displayer, error) {
|
||||
if !displayMatcher.MatchString(text) {
|
||||
return nil, fmt.Errorf("Not a display function")
|
||||
}
|
||||
switch {
|
||||
case textFuncMatcher.MatchString(text):
|
||||
return TextDisplayer{}, nil
|
||||
case attrFuncMatcher.MatchString(text):
|
||||
return nil, fmt.Errorf("attr")
|
||||
}
|
||||
return nil, fmt.Errorf("Not a display function")
|
||||
}
|
18
main.go
18
main.go
@ -3,6 +3,7 @@ package main
|
||||
import (
|
||||
"code.google.com/p/go.net/html"
|
||||
"fmt"
|
||||
"github.com/ericchiang/pup/funcs"
|
||||
"github.com/ericchiang/pup/selector"
|
||||
"io"
|
||||
"os"
|
||||
@ -14,11 +15,13 @@ const VERSION string = "0.1.0"
|
||||
|
||||
var (
|
||||
// Flags
|
||||
attributes []string = []string{}
|
||||
inputStream io.ReadCloser = os.Stdin
|
||||
indentString string = " "
|
||||
maxPrintLevel int = -1
|
||||
printNumber bool = false
|
||||
printColor bool = false
|
||||
displayer funcs.Displayer = nil
|
||||
)
|
||||
|
||||
// Print to stderr and exit
|
||||
@ -64,6 +67,9 @@ func ProcessFlags(cmds []string) []string {
|
||||
for i = 0; i < len(cmds); i++ {
|
||||
cmd := cmds[i]
|
||||
switch cmd {
|
||||
case "-a", "--attr":
|
||||
attributes = append(attributes, cmds[i+1])
|
||||
i++
|
||||
case "-c", "--color":
|
||||
printColor = true
|
||||
case "-f", "--file":
|
||||
@ -121,6 +127,14 @@ func main() {
|
||||
}
|
||||
selectors := make([]*selector.Selector, len(cmds))
|
||||
for i, cmd := range cmds {
|
||||
if i+1 == len(cmds) {
|
||||
d, err := funcs.NewDisplayFunc(cmd)
|
||||
if err == nil {
|
||||
displayer = d
|
||||
selectors = selectors[0 : len(cmds)-1]
|
||||
break
|
||||
}
|
||||
}
|
||||
selectors[i], err = selector.NewSelector(cmd)
|
||||
if err != nil {
|
||||
Fatal("Selector parse error: %s", err)
|
||||
@ -136,7 +150,9 @@ func main() {
|
||||
}
|
||||
currNodes = selected
|
||||
}
|
||||
if printNumber {
|
||||
if displayer != nil {
|
||||
displayer.Display(currNodes)
|
||||
} else if printNumber {
|
||||
fmt.Println(len(currNodes))
|
||||
} else {
|
||||
for _, s := range currNodes {
|
||||
|
@ -17,8 +17,8 @@ var (
|
||||
|
||||
// Regexp helpers
|
||||
whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`)
|
||||
preWhitespace *regexp.Regexp = regexp.MustCompile(`^\s+`)
|
||||
postWhitespace *regexp.Regexp = regexp.MustCompile(`\s+$`)
|
||||
preWhitespace = regexp.MustCompile(`^\s+`)
|
||||
postWhitespace = regexp.MustCompile(`\s+$`)
|
||||
)
|
||||
|
||||
func printIndent(level int) {
|
||||
|
Loading…
Reference in New Issue
Block a user