mirror of
https://github.com/ericchiang/pup
synced 2025-01-28 08:31:26 +00:00
commit
f407082dd5
16
README.md
16
README.md
@ -149,6 +149,22 @@ $ cat robots.html | pup ':parent-of([action="edit"])'
|
|||||||
For a complete list, view the [implemented selectors](#implemented-selectors)
|
For a complete list, view the [implemented selectors](#implemented-selectors)
|
||||||
section.
|
section.
|
||||||
|
|
||||||
|
|
||||||
|
####`+`, `>`, and `,`
|
||||||
|
|
||||||
|
There are intermediate characters which declare special instructions. For
|
||||||
|
instance, a comma `,` allows pup to specify multiple groups of selectors.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup 'title, h1 span[dir="auto"]'
|
||||||
|
<title>
|
||||||
|
Robots exclusion standard - Wikipedia, the free encyclopedia
|
||||||
|
</title>
|
||||||
|
<span dir="auto">
|
||||||
|
Robots exclusion standard
|
||||||
|
</span>
|
||||||
|
```
|
||||||
|
|
||||||
####Chain selectors together
|
####Chain selectors together
|
||||||
|
|
||||||
When combining selectors, the HTML nodes selected by the previous selector will
|
When combining selectors, the HTML nodes selected by the previous selector will
|
||||||
|
45
parse.go
45
parse.go
@ -34,22 +34,23 @@ Flags
|
|||||||
os.Exit(exitCode)
|
os.Exit(exitCode)
|
||||||
}
|
}
|
||||||
|
|
||||||
func ParseArgs() []string {
|
func ParseArgs() ([]string, error) {
|
||||||
cmds := ProcessFlags(os.Args[1:])
|
cmds, err := ProcessFlags(os.Args[1:])
|
||||||
|
if err != nil {
|
||||||
|
return []string{}, err
|
||||||
|
}
|
||||||
return ParseCommands(strings.Join(cmds, " "))
|
return ParseCommands(strings.Join(cmds, " "))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process command arguments and return all non-flags.
|
// Process command arguments and return all non-flags.
|
||||||
func ProcessFlags(cmds []string) []string {
|
func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
|
||||||
var i int
|
var i int
|
||||||
var err error
|
|
||||||
defer func() {
|
defer func() {
|
||||||
if r := recover(); r != nil {
|
if r := recover(); r != nil {
|
||||||
fmt.Fprintf(os.Stderr, "Option '%s' requires an argument", cmds[i])
|
err = fmt.Errorf("Option '%s' requires an argument", cmds[i])
|
||||||
os.Exit(2)
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
nonFlagCmds := make([]string, len(cmds))
|
nonFlagCmds = make([]string, len(cmds))
|
||||||
n := 0
|
n := 0
|
||||||
for i = 0; i < len(cmds); i++ {
|
for i = 0; i < len(cmds); i++ {
|
||||||
cmd := cmds[i]
|
cmd := cmds[i]
|
||||||
@ -77,8 +78,7 @@ func ProcessFlags(cmds []string) []string {
|
|||||||
case "-l", "--limit":
|
case "-l", "--limit":
|
||||||
pupMaxPrintLevel, err = strconv.Atoi(cmds[i+1])
|
pupMaxPrintLevel, err = strconv.Atoi(cmds[i+1])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, "Argument for '%s' must be numeric\n", cmd)
|
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
|
||||||
os.Exit(2)
|
|
||||||
}
|
}
|
||||||
i++
|
i++
|
||||||
case "--version":
|
case "--version":
|
||||||
@ -86,18 +86,17 @@ func ProcessFlags(cmds []string) []string {
|
|||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
default:
|
default:
|
||||||
if cmd[0] == '-' {
|
if cmd[0] == '-' {
|
||||||
fmt.Fprintf(os.Stderr, "Unrecognized flag '%s'", cmd)
|
return []string{}, fmt.Errorf("Unrecognized flag '%s'", cmd)
|
||||||
os.Exit(2)
|
|
||||||
}
|
}
|
||||||
nonFlagCmds[n] = cmds[i]
|
nonFlagCmds[n] = cmds[i]
|
||||||
n++
|
n++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nonFlagCmds[:n]
|
return nonFlagCmds[:n], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split a string with awareness for quoted text
|
// Split a string with awareness for quoted text and commas
|
||||||
func ParseCommands(cmdString string) []string {
|
func ParseCommands(cmdString string) ([]string, error) {
|
||||||
cmds := []string{}
|
cmds := []string{}
|
||||||
last, next, max := 0, 0, len(cmdString)
|
last, next, max := 0, 0, len(cmdString)
|
||||||
for {
|
for {
|
||||||
@ -106,7 +105,7 @@ func ParseCommands(cmdString string) []string {
|
|||||||
if next > last {
|
if next > last {
|
||||||
cmds = append(cmds, cmdString[last:next])
|
cmds = append(cmds, cmdString[last:next])
|
||||||
}
|
}
|
||||||
return cmds
|
return cmds, nil
|
||||||
}
|
}
|
||||||
// evaluate a rune
|
// evaluate a rune
|
||||||
c := cmdString[next]
|
c := cmdString[next]
|
||||||
@ -116,16 +115,26 @@ func ParseCommands(cmdString string) []string {
|
|||||||
cmds = append(cmds, cmdString[last:next])
|
cmds = append(cmds, cmdString[last:next])
|
||||||
}
|
}
|
||||||
last = next + 1
|
last = next + 1
|
||||||
|
case ',':
|
||||||
|
if next > last {
|
||||||
|
cmds = append(cmds, cmdString[last:next])
|
||||||
|
}
|
||||||
|
cmds = append(cmds, ",")
|
||||||
|
last = next + 1
|
||||||
case '\'', '"':
|
case '\'', '"':
|
||||||
// for quotes, consume runes until the quote has ended
|
// for quotes, consume runes until the quote has ended
|
||||||
quoteChar := c
|
quoteChar := c
|
||||||
for {
|
for {
|
||||||
next++
|
next++
|
||||||
if next == max {
|
if next == max {
|
||||||
fmt.Fprintf(os.Stderr, "Unmatched open quote (%c)\n", quoteChar)
|
return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar)
|
||||||
os.Exit(2)
|
|
||||||
}
|
}
|
||||||
if cmdString[next] == quoteChar {
|
if cmdString[next] == '\\' {
|
||||||
|
next++
|
||||||
|
if next == max {
|
||||||
|
return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar)
|
||||||
|
}
|
||||||
|
} else if cmdString[next] == quoteChar {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
85
parse_test.go
Normal file
85
parse_test.go
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
type parseCmdTest struct {
|
||||||
|
input string
|
||||||
|
split []string
|
||||||
|
ok bool
|
||||||
|
}
|
||||||
|
|
||||||
|
var parseCmdTests = []parseCmdTest{
|
||||||
|
parseCmdTest{`w1 w2`, []string{`w1`, `w2`}, true},
|
||||||
|
parseCmdTest{`w1 w2 w3`, []string{`w1`, `w2`, `w3`}, true},
|
||||||
|
parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1"w2 w3"`, []string{`w1"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1'w2 w3'`, []string{`w1'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1"w2 'w3"`, []string{`w1"w2 'w3"`}, true},
|
||||||
|
parseCmdTest{`w1'w2 "w3'`, []string{`w1'w2 "w3'`}, true},
|
||||||
|
parseCmdTest{`"w1 w2" "w3"`, []string{`"w1 w2"`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`'w1 w2' "w3"`, []string{`'w1 w2'`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`'w1 \'w2' "w3"`, []string{`'w1 \'w2'`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`'w1 \'w2 "w3"`, []string{}, false},
|
||||||
|
parseCmdTest{`w1 'w2 w3'"`, []string{}, false},
|
||||||
|
parseCmdTest{`w1 "w2 w3"'`, []string{}, false},
|
||||||
|
parseCmdTest{`w1 ' "w2 w3"`, []string{}, false},
|
||||||
|
parseCmdTest{`w1 " 'w2 w3'`, []string{}, false},
|
||||||
|
parseCmdTest{`w1"w2 w3""`, []string{}, false},
|
||||||
|
parseCmdTest{`w1'w2 w3''`, []string{}, false},
|
||||||
|
parseCmdTest{`w1"w2 'w3""`, []string{}, false},
|
||||||
|
parseCmdTest{`w1'w2 "w3''`, []string{}, false},
|
||||||
|
parseCmdTest{`"w1 w2" "w3"'`, []string{}, false},
|
||||||
|
parseCmdTest{`'w1 w2' "w3"'`, []string{}, false},
|
||||||
|
parseCmdTest{`w1,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1 , "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1 , 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1, "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1, 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1 ,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true},
|
||||||
|
parseCmdTest{`w1 ,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true},
|
||||||
|
parseCmdTest{`w1"w2, w3"`, []string{`w1"w2, w3"`}, true},
|
||||||
|
parseCmdTest{`w1'w2, w3'`, []string{`w1'w2, w3'`}, true},
|
||||||
|
parseCmdTest{`w1"w2, 'w3"`, []string{`w1"w2, 'w3"`}, true},
|
||||||
|
parseCmdTest{`w1'w2, "w3'`, []string{`w1'w2, "w3'`}, true},
|
||||||
|
parseCmdTest{`"w1, w2" "w3"`, []string{`"w1, w2"`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`'w1, w2' "w3"`, []string{`'w1, w2'`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`'w1, \'w2' "w3"`, []string{`'w1, \'w2'`, `"w3"`}, true},
|
||||||
|
parseCmdTest{`h1, .article-teaser, .article-content`, []string{
|
||||||
|
`h1`, `,`, `.article-teaser`, `,`, `.article-content`,
|
||||||
|
}, true},
|
||||||
|
parseCmdTest{`h1 ,.article-teaser ,.article-content`, []string{
|
||||||
|
`h1`, `,`, `.article-teaser`, `,`, `.article-content`,
|
||||||
|
}, true},
|
||||||
|
parseCmdTest{`h1 , .article-teaser , .article-content`, []string{
|
||||||
|
`h1`, `,`, `.article-teaser`, `,`, `.article-content`,
|
||||||
|
}, true},
|
||||||
|
}
|
||||||
|
|
||||||
|
func sliceEq(s1, s2 []string) bool {
|
||||||
|
if len(s1) != len(s2) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
for i := range s1 {
|
||||||
|
if s1[i] != s2[i] {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseCommands(t *testing.T) {
|
||||||
|
for _, test := range parseCmdTests {
|
||||||
|
parsed, err := ParseCommands(test.input)
|
||||||
|
if test.ok != (err == nil) {
|
||||||
|
t.Errorf("`%s`: should have cause error? %v", test.input, !test.ok)
|
||||||
|
} else if !sliceEq(test.split, parsed) {
|
||||||
|
t.Errorf("`%s`: `%s`: `%s`", test.input, test.split, parsed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
23
pup.go
23
pup.go
@ -17,11 +17,15 @@ import (
|
|||||||
// |/ \_( # |"
|
// |/ \_( # |"
|
||||||
// C/ ,--___/
|
// C/ ,--___/
|
||||||
|
|
||||||
var VERSION string = "0.3.4"
|
var VERSION string = "0.3.5"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// process flags and arguments
|
// process flags and arguments
|
||||||
cmds := ParseArgs()
|
cmds, err := ParseArgs()
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
|
||||||
|
os.Exit(2)
|
||||||
|
}
|
||||||
|
|
||||||
// Determine the charset of the input
|
// Determine the charset of the input
|
||||||
cr, err := charset.NewReader(pupIn, "")
|
cr, err := charset.NewReader(pupIn, "")
|
||||||
@ -49,12 +53,14 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
switch cmd {
|
switch cmd {
|
||||||
case "*":
|
case "*": // select all
|
||||||
continue
|
continue
|
||||||
case "+":
|
case "+":
|
||||||
funcGenerator = SelectFromChildren
|
funcGenerator = SelectFromChildren
|
||||||
case ">":
|
case ">":
|
||||||
funcGenerator = SelectNextSibling
|
funcGenerator = SelectNextSibling
|
||||||
|
case ",": // nil will signify a comma
|
||||||
|
selectorFuncs = append(selectorFuncs, nil)
|
||||||
default:
|
default:
|
||||||
selector, err := ParseSelector(cmd)
|
selector, err := ParseSelector(cmd)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -66,9 +72,16 @@ func main() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
selectedNodes := []*html.Node{}
|
||||||
currNodes := []*html.Node{root}
|
currNodes := []*html.Node{root}
|
||||||
for _, selectorFunc := range selectorFuncs {
|
for _, selectorFunc := range selectorFuncs {
|
||||||
currNodes = selectorFunc(currNodes)
|
if selectorFunc == nil { // hit a comma
|
||||||
|
selectedNodes = append(selectedNodes, currNodes...)
|
||||||
|
currNodes = []*html.Node{root}
|
||||||
|
} else {
|
||||||
|
currNodes = selectorFunc(currNodes)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
pupDisplayer.Display(currNodes)
|
selectedNodes = append(selectedNodes, currNodes...)
|
||||||
|
pupDisplayer.Display(selectedNodes)
|
||||||
}
|
}
|
||||||
|
10
pup.rb
10
pup.rb
@ -2,14 +2,14 @@ require 'formula'
|
|||||||
|
|
||||||
class Pup < Formula
|
class Pup < Formula
|
||||||
homepage 'https://github.com/EricChiang/pup'
|
homepage 'https://github.com/EricChiang/pup'
|
||||||
version '0.3.4'
|
version '0.3.5'
|
||||||
|
|
||||||
if Hardware.is_64_bit?
|
if Hardware.is_64_bit?
|
||||||
url 'https://github.com/EricChiang/pup/releases/download/v0.3.4/pup_darwin_amd64.zip'
|
url 'https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_darwin_amd64.zip'
|
||||||
sha1 '5fec62701a49bfd5eaa4b9c980e9c06dcece78c6'
|
sha1 '6991dc9408e02adfa0ed5866eb7e284a94d79a77'
|
||||||
else
|
else
|
||||||
url 'https://github.com/EricChiang/pup/releases/download/v0.3.4/pup_darwin_386.zip'
|
url 'https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_darwin_386.zip'
|
||||||
sha1 '1eb129c662d7e323c9b1e8f8ed3b8e28ce521434'
|
sha1 'ec58d15a39ab821caa5f903035862690bbeb4dfe'
|
||||||
end
|
end
|
||||||
|
|
||||||
def install
|
def install
|
||||||
|
@ -39,3 +39,9 @@ td:empty
|
|||||||
#toc li + span
|
#toc li + span
|
||||||
#toc li > li
|
#toc li > li
|
||||||
li a:not([rel])
|
li a:not([rel])
|
||||||
|
link, a
|
||||||
|
link ,a
|
||||||
|
link , a
|
||||||
|
link , a sup
|
||||||
|
link , a:parent-of(sup)
|
||||||
|
link , a:parent-of(sup) sup
|
||||||
|
@ -39,3 +39,9 @@ dbc580de40eeb8448f0dbe1b98d74cf799a6868b #toc li + a
|
|||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
|
||||||
5d6e3ed3cfe310cde185cbfe1bba6aa7ec2a7f8d #toc li > li
|
5d6e3ed3cfe310cde185cbfe1bba6aa7ec2a7f8d #toc li > li
|
||||||
87eee1189dd5296d6c010a1ad329fc53c6099d72 li a:not([rel])
|
87eee1189dd5296d6c010a1ad329fc53c6099d72 li a:not([rel])
|
||||||
|
055f3c98e9160beb13f72f1009ad66b6252a9bba link, a
|
||||||
|
055f3c98e9160beb13f72f1009ad66b6252a9bba link ,a
|
||||||
|
055f3c98e9160beb13f72f1009ad66b6252a9bba link , a
|
||||||
|
0d1f66765d1632c70f8608947890524e78459362 link , a sup
|
||||||
|
b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup)
|
||||||
|
0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
|
||||||
|
Loading…
Reference in New Issue
Block a user