mirror of
https://github.com/ericchiang/pup
synced 2024-11-24 08:58:08 +00:00
text function added
This commit is contained in:
parent
6915c6abb9
commit
1c07121c07
47
README.md
47
README.md
@ -31,7 +31,7 @@ Download a webpage with `wget`.
|
|||||||
$ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html
|
$ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html
|
||||||
```
|
```
|
||||||
|
|
||||||
###Clean and indent
|
####Clean and indent
|
||||||
|
|
||||||
By default `pup` will fill in missing tags and properly indent the page.
|
By default `pup` will fill in missing tags and properly indent the page.
|
||||||
|
|
||||||
@ -42,7 +42,7 @@ $ cat robots.html | pup --color
|
|||||||
# cleaned, indented, and colorful HTML
|
# cleaned, indented, and colorful HTML
|
||||||
```
|
```
|
||||||
|
|
||||||
###Filter by tag
|
####Filter by tag
|
||||||
```bash
|
```bash
|
||||||
$ pup < robots.html title
|
$ pup < robots.html title
|
||||||
<title>
|
<title>
|
||||||
@ -50,7 +50,7 @@ $ pup < robots.html title
|
|||||||
</title>
|
</title>
|
||||||
```
|
```
|
||||||
|
|
||||||
###Filter by id
|
####Filter by id
|
||||||
```bash
|
```bash
|
||||||
$ pup < robots.html span#See_also
|
$ pup < robots.html span#See_also
|
||||||
<span class="mw-headline" id="See_also">
|
<span class="mw-headline" id="See_also">
|
||||||
@ -58,7 +58,7 @@ $ pup < robots.html span#See_also
|
|||||||
</span>
|
</span>
|
||||||
```
|
```
|
||||||
|
|
||||||
###Chain selectors together
|
####Chain selectors together
|
||||||
|
|
||||||
The following two commands are equivalent. (NOTE: pipes do not work with the
|
The following two commands are equivalent. (NOTE: pipes do not work with the
|
||||||
`--color` flag)
|
`--color` flag)
|
||||||
@ -86,14 +86,14 @@ Both produce the output:
|
|||||||
</a>
|
</a>
|
||||||
```
|
```
|
||||||
|
|
||||||
###How many nodes are selected by a filter?
|
####How many nodes are selected by a filter?
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ pup < robots.html a -n
|
$ pup < robots.html a -n
|
||||||
283
|
283
|
||||||
```
|
```
|
||||||
|
|
||||||
###Limit print level
|
####Limit print level
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ pup < robots.html table -l 2
|
$ pup < robots.html table -l 2
|
||||||
@ -134,6 +134,36 @@ You can mix and match selectors as you wish.
|
|||||||
cat index.html | pup element#id[attribute=value]
|
cat index.html | pup element#id[attribute=value]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Functions
|
||||||
|
|
||||||
|
Non-HTML selectors which affect the output type are implemented as functions
|
||||||
|
which can be provided as a final argument.
|
||||||
|
|
||||||
|
As of now, `text{}` is the only implemented function.
|
||||||
|
|
||||||
|
#### `text{}`
|
||||||
|
|
||||||
|
Print all text from selected nodes and children in depth-first order.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup .mw-headline text{}
|
||||||
|
History
|
||||||
|
About the standard
|
||||||
|
Disadvantages
|
||||||
|
Alternatives
|
||||||
|
Examples
|
||||||
|
Nonstandard extensions
|
||||||
|
Crawl-delay directive
|
||||||
|
Allow directive
|
||||||
|
Sitemap
|
||||||
|
Host
|
||||||
|
Universal "*" match
|
||||||
|
Meta tags and headers
|
||||||
|
See also
|
||||||
|
References
|
||||||
|
External links
|
||||||
|
```
|
||||||
|
|
||||||
## Flags
|
## Flags
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -148,5 +178,6 @@ cat index.html | pup element#id[attribute=value]
|
|||||||
|
|
||||||
## TODO:
|
## TODO:
|
||||||
|
|
||||||
* Print attribute value rather than html ({href})
|
* Print attribute function `attr{attr1, attr2}`
|
||||||
* Print result as JSON (--json)
|
* Print as json function `json{}`
|
||||||
|
* Switch `-n` from a flag to a function
|
||||||
|
49
funcs/display.go
Normal file
49
funcs/display.go
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
package funcs
|
||||||
|
|
||||||
|
import (
|
||||||
|
"code.google.com/p/go.net/html"
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Displayer interface {
|
||||||
|
Display(nodes []*html.Node)
|
||||||
|
}
|
||||||
|
|
||||||
|
type TextDisplayer struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t TextDisplayer) Display(nodes []*html.Node) {
|
||||||
|
for _, node := range nodes {
|
||||||
|
if node.Type == html.TextNode {
|
||||||
|
fmt.Println(node.Data)
|
||||||
|
}
|
||||||
|
children := []*html.Node{}
|
||||||
|
child := node.FirstChild
|
||||||
|
for child != nil {
|
||||||
|
children = append(children, child)
|
||||||
|
child = child.NextSibling
|
||||||
|
}
|
||||||
|
t.Display(children)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Display function helpers
|
||||||
|
displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`)
|
||||||
|
textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
|
||||||
|
attrFuncMatcher = regexp.MustCompile(`^attr\{[^\}]*\}$`)
|
||||||
|
)
|
||||||
|
|
||||||
|
func NewDisplayFunc(text string) (Displayer, error) {
|
||||||
|
if !displayMatcher.MatchString(text) {
|
||||||
|
return nil, fmt.Errorf("Not a display function")
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case textFuncMatcher.MatchString(text):
|
||||||
|
return TextDisplayer{}, nil
|
||||||
|
case attrFuncMatcher.MatchString(text):
|
||||||
|
return nil, fmt.Errorf("attr")
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("Not a display function")
|
||||||
|
}
|
18
main.go
18
main.go
@ -3,6 +3,7 @@ package main
|
|||||||
import (
|
import (
|
||||||
"code.google.com/p/go.net/html"
|
"code.google.com/p/go.net/html"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"github.com/ericchiang/pup/funcs"
|
||||||
"github.com/ericchiang/pup/selector"
|
"github.com/ericchiang/pup/selector"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
@ -14,11 +15,13 @@ const VERSION string = "0.1.0"
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
// Flags
|
// Flags
|
||||||
|
attributes []string = []string{}
|
||||||
inputStream io.ReadCloser = os.Stdin
|
inputStream io.ReadCloser = os.Stdin
|
||||||
indentString string = " "
|
indentString string = " "
|
||||||
maxPrintLevel int = -1
|
maxPrintLevel int = -1
|
||||||
printNumber bool = false
|
printNumber bool = false
|
||||||
printColor bool = false
|
printColor bool = false
|
||||||
|
displayer funcs.Displayer = nil
|
||||||
)
|
)
|
||||||
|
|
||||||
// Print to stderr and exit
|
// Print to stderr and exit
|
||||||
@ -64,6 +67,9 @@ func ProcessFlags(cmds []string) []string {
|
|||||||
for i = 0; i < len(cmds); i++ {
|
for i = 0; i < len(cmds); i++ {
|
||||||
cmd := cmds[i]
|
cmd := cmds[i]
|
||||||
switch cmd {
|
switch cmd {
|
||||||
|
case "-a", "--attr":
|
||||||
|
attributes = append(attributes, cmds[i+1])
|
||||||
|
i++
|
||||||
case "-c", "--color":
|
case "-c", "--color":
|
||||||
printColor = true
|
printColor = true
|
||||||
case "-f", "--file":
|
case "-f", "--file":
|
||||||
@ -121,6 +127,14 @@ func main() {
|
|||||||
}
|
}
|
||||||
selectors := make([]*selector.Selector, len(cmds))
|
selectors := make([]*selector.Selector, len(cmds))
|
||||||
for i, cmd := range cmds {
|
for i, cmd := range cmds {
|
||||||
|
if i+1 == len(cmds) {
|
||||||
|
d, err := funcs.NewDisplayFunc(cmd)
|
||||||
|
if err == nil {
|
||||||
|
displayer = d
|
||||||
|
selectors = selectors[0 : len(cmds)-1]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
selectors[i], err = selector.NewSelector(cmd)
|
selectors[i], err = selector.NewSelector(cmd)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
Fatal("Selector parse error: %s", err)
|
Fatal("Selector parse error: %s", err)
|
||||||
@ -136,7 +150,9 @@ func main() {
|
|||||||
}
|
}
|
||||||
currNodes = selected
|
currNodes = selected
|
||||||
}
|
}
|
||||||
if printNumber {
|
if displayer != nil {
|
||||||
|
displayer.Display(currNodes)
|
||||||
|
} else if printNumber {
|
||||||
fmt.Println(len(currNodes))
|
fmt.Println(len(currNodes))
|
||||||
} else {
|
} else {
|
||||||
for _, s := range currNodes {
|
for _, s := range currNodes {
|
||||||
|
@ -17,8 +17,8 @@ var (
|
|||||||
|
|
||||||
// Regexp helpers
|
// Regexp helpers
|
||||||
whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`)
|
whitespaceRegexp *regexp.Regexp = regexp.MustCompile(`^\s*$`)
|
||||||
preWhitespace *regexp.Regexp = regexp.MustCompile(`^\s+`)
|
preWhitespace = regexp.MustCompile(`^\s+`)
|
||||||
postWhitespace *regexp.Regexp = regexp.MustCompile(`\s+$`)
|
postWhitespace = regexp.MustCompile(`\s+$`)
|
||||||
)
|
)
|
||||||
|
|
||||||
func printIndent(level int) {
|
func printIndent(level int) {
|
||||||
|
Loading…
Reference in New Issue
Block a user