diff --git a/README.md b/README.md index 5655c50..2561881 100644 --- a/README.md +++ b/README.md @@ -149,6 +149,22 @@ $ cat robots.html | pup ':parent-of([action="edit"])' For a complete list, view the [implemented selectors](#Implemented Selectors) section. + +####`+`, `>`, and `,` + +There are intermediate characters which declare special instructions. For +instance, a comma `,` allows pup to specify multiple groups of selectors. + +```bash +$ cat robots.html | pup 'title, h1 span[dir="auto"]' + + Robots exclusion standard - Wikipedia, the free encyclopedia + + + Robots exclusion standard + +``` + ####Chain selectors together When combining selectors, the HTML nodes selected by the previous selector will diff --git a/parse.go b/parse.go index f61a43f..d9d570a 100644 --- a/parse.go +++ b/parse.go @@ -34,22 +34,23 @@ Flags os.Exit(exitCode) } -func ParseArgs() []string { - cmds := ProcessFlags(os.Args[1:]) +func ParseArgs() ([]string, error) { + cmds, err := ProcessFlags(os.Args[1:]) + if err != nil { + return []string{}, err + } return ParseCommands(strings.Join(cmds, " ")) } // Process command arguments and return all non-flags.
-func ProcessFlags(cmds []string) []string { +func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) { var i int - var err error defer func() { if r := recover(); r != nil { - fmt.Fprintf(os.Stderr, "Option '%s' requires an argument", cmds[i]) - os.Exit(2) + err = fmt.Errorf("Option '%s' requires an argument", cmds[i]) } }() - nonFlagCmds := make([]string, len(cmds)) + nonFlagCmds = make([]string, len(cmds)) n := 0 for i = 0; i < len(cmds); i++ { cmd := cmds[i] @@ -77,8 +78,7 @@ func ProcessFlags(cmds []string) []string { case "-l", "--limit": pupMaxPrintLevel, err = strconv.Atoi(cmds[i+1]) if err != nil { - fmt.Fprintf(os.Stderr, "Argument for '%s' must be numeric\n", cmd) - os.Exit(2) + return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd) } i++ case "--version": @@ -86,18 +86,17 @@ func ProcessFlags(cmds []string) []string { os.Exit(0) default: if cmd[0] == '-' { - fmt.Fprintf(os.Stderr, "Unrecognized flag '%s'", cmd) - os.Exit(2) + return []string{}, fmt.Errorf("Unrecognized flag '%s'", cmd) } nonFlagCmds[n] = cmds[i] n++ } } - return nonFlagCmds[:n] + return nonFlagCmds[:n], nil } -// Split a string with awareness for quoted text -func ParseCommands(cmdString string) []string { +// Split a string with awareness for quoted text and commas +func ParseCommands(cmdString string) ([]string, error) { cmds := []string{} last, next, max := 0, 0, len(cmdString) for { @@ -106,7 +105,7 @@ func ParseCommands(cmdString string) []string { if next > last { cmds = append(cmds, cmdString[last:next]) } - return cmds + return cmds, nil } // evalute a rune c := cmdString[next] @@ -116,16 +115,26 @@ func ParseCommands(cmdString string) []string { cmds = append(cmds, cmdString[last:next]) } last = next + 1 + case ',': + if next > last { + cmds = append(cmds, cmdString[last:next]) + } + cmds = append(cmds, ",") + last = next + 1 case '\'', '"': // for quotes, consume runes until the quote has ended quoteChar := c for { next++ if next == max { - 
fmt.Fprintf(os.Stderr, "Unmatched open quote (%c)\n", quoteChar) - os.Exit(2) + return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar) } - if cmdString[next] == quoteChar { + if cmdString[next] == '\\' { + next++ + if next == max { + return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar) + } + } else if cmdString[next] == quoteChar { break } } diff --git a/parse_test.go b/parse_test.go new file mode 100644 index 0000000..c5dea74 --- /dev/null +++ b/parse_test.go @@ -0,0 +1,85 @@ +package main + +import ( + "testing" +) + +type parseCmdTest struct { + input string + split []string + ok bool +} + +var parseCmdTests = []parseCmdTest{ + parseCmdTest{`w1 w2`, []string{`w1`, `w2`}, true}, + parseCmdTest{`w1 w2 w3`, []string{`w1`, `w2`, `w3`}, true}, + parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true}, + parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true}, + parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true}, + parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true}, + parseCmdTest{`w1"w2 w3"`, []string{`w1"w2 w3"`}, true}, + parseCmdTest{`w1'w2 w3'`, []string{`w1'w2 w3'`}, true}, + parseCmdTest{`w1"w2 'w3"`, []string{`w1"w2 'w3"`}, true}, + parseCmdTest{`w1'w2 "w3'`, []string{`w1'w2 "w3'`}, true}, + parseCmdTest{`"w1 w2" "w3"`, []string{`"w1 w2"`, `"w3"`}, true}, + parseCmdTest{`'w1 w2' "w3"`, []string{`'w1 w2'`, `"w3"`}, true}, + parseCmdTest{`'w1 \'w2' "w3"`, []string{`'w1 \'w2'`, `"w3"`}, true}, + parseCmdTest{`'w1 \'w2 "w3"`, []string{}, false}, + parseCmdTest{`w1 'w2 w3'"`, []string{}, false}, + parseCmdTest{`w1 "w2 w3"'`, []string{}, false}, + parseCmdTest{`w1 ' "w2 w3"`, []string{}, false}, + parseCmdTest{`w1 " 'w2 w3'`, []string{}, false}, + parseCmdTest{`w1"w2 w3""`, []string{}, false}, + parseCmdTest{`w1'w2 w3''`, []string{}, false}, + parseCmdTest{`w1"w2 'w3""`, []string{}, false}, + parseCmdTest{`w1'w2 "w3''`, []string{}, false}, + parseCmdTest{`"w1 w2" "w3"'`, []string{}, false}, + 
parseCmdTest{`'w1 w2' "w3"'`, []string{}, false}, + parseCmdTest{`w1,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, + parseCmdTest{`w1,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, + parseCmdTest{`w1 , "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, + parseCmdTest{`w1 , 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, + parseCmdTest{`w1, "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, + parseCmdTest{`w1, 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, + parseCmdTest{`w1 ,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, + parseCmdTest{`w1 ,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, + parseCmdTest{`w1"w2, w3"`, []string{`w1"w2, w3"`}, true}, + parseCmdTest{`w1'w2, w3'`, []string{`w1'w2, w3'`}, true}, + parseCmdTest{`w1"w2, 'w3"`, []string{`w1"w2, 'w3"`}, true}, + parseCmdTest{`w1'w2, "w3'`, []string{`w1'w2, "w3'`}, true}, + parseCmdTest{`"w1, w2" "w3"`, []string{`"w1, w2"`, `"w3"`}, true}, + parseCmdTest{`'w1, w2' "w3"`, []string{`'w1, w2'`, `"w3"`}, true}, + parseCmdTest{`'w1, \'w2' "w3"`, []string{`'w1, \'w2'`, `"w3"`}, true}, + parseCmdTest{`h1, .article-teaser, .article-content`, []string{ + `h1`, `,`, `.article-teaser`, `,`, `.article-content`, + }, true}, + parseCmdTest{`h1 ,.article-teaser ,.article-content`, []string{ + `h1`, `,`, `.article-teaser`, `,`, `.article-content`, + }, true}, + parseCmdTest{`h1 , .article-teaser , .article-content`, []string{ + `h1`, `,`, `.article-teaser`, `,`, `.article-content`, + }, true}, +} + +func sliceEq(s1, s2 []string) bool { + if len(s1) != len(s2) { + return false + } + for i := range s1 { + if s1[i] != s2[i] { + return false + } + } + return true +} + +func TestParseCommands(t *testing.T) { + for _, test := range parseCmdTests { + parsed, err := ParseCommands(test.input) + if test.ok != (err == nil) { + t.Errorf("`%s`: should have cause error? 
%v", test.input, !test.ok) + } else if !sliceEq(test.split, parsed) { + t.Errorf("`%s`: `%s`: `%s`", test.input, test.split, parsed) + } + } +} diff --git a/pup.go b/pup.go index 1590eb3..43851d0 100644 --- a/pup.go +++ b/pup.go @@ -17,11 +17,15 @@ import ( // |/ \_( # |" // C/ ,--___/ -var VERSION string = "0.3.4" +var VERSION string = "0.3.5" func main() { // process flags and arguments - cmds := ParseArgs() + cmds, err := ParseArgs() + if err != nil { + fmt.Fprintf(os.Stderr, "%s\n", err.Error()) + os.Exit(2) + } // Determine the charset of the input cr, err := charset.NewReader(pupIn, "") @@ -49,12 +53,14 @@ func main() { } } switch cmd { - case "*": + case "*": // select all continue case "+": funcGenerator = SelectFromChildren case ">": funcGenerator = SelectNextSibling + case ",": // nil will signify a comma + selectorFuncs = append(selectorFuncs, nil) default: selector, err := ParseSelector(cmd) if err != nil { @@ -66,9 +72,16 @@ func main() { } } + selectedNodes := []*html.Node{} currNodes := []*html.Node{root} for _, selectorFunc := range selectorFuncs { - currNodes = selectorFunc(currNodes) + if selectorFunc == nil { // hit a comma + selectedNodes = append(selectedNodes, currNodes...) + currNodes = []*html.Node{root} + } else { + currNodes = selectorFunc(currNodes) + } } - pupDisplayer.Display(currNodes) + selectedNodes = append(selectedNodes, currNodes...) + pupDisplayer.Display(selectedNodes) } diff --git a/pup.rb b/pup.rb index 1dc8b5f..3691a32 100644 --- a/pup.rb +++ b/pup.rb @@ -2,14 +2,14 @@ require 'formula' class Pup < Formula homepage 'https://github.com/EricChiang/pup' - version '0.3.4' + version '0.3.5' if Hardware.is_64_bit? 
- url 'https://github.com/EricChiang/pup/releases/download/v0.3.4/pup_darwin_amd64.zip' - sha1 '5fec62701a49bfd5eaa4b9c980e9c06dcece78c6' + url 'https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_darwin_amd64.zip' + sha1 '6991dc9408e02adfa0ed5866eb7e284a94d79a77' else - url 'https://github.com/EricChiang/pup/releases/download/v0.3.4/pup_darwin_386.zip' - sha1 '1eb129c662d7e323c9b1e8f8ed3b8e28ce521434' + url 'https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_darwin_386.zip' + sha1 'ec58d15a39ab821caa5f903035862690bbeb4dfe' end def install diff --git a/tests/cmds.txt b/tests/cmds.txt index b05f947..d2a1e0c 100644 --- a/tests/cmds.txt +++ b/tests/cmds.txt @@ -39,3 +39,9 @@ td:empty #toc li + span #toc li > li li a:not([rel]) +link, a +link ,a +link , a +link , a sup +link , a:parent-of(sup) +link , a:parent-of(sup) sup diff --git a/tests/expected_output.txt b/tests/expected_output.txt index 4f5dec8..b1ab3b9 100644 --- a/tests/expected_output.txt +++ b/tests/expected_output.txt @@ -39,3 +39,9 @@ dbc580de40eeb8448f0dbe1b98d74cf799a6868b #toc li + a da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span 5d6e3ed3cfe310cde185cbfe1bba6aa7ec2a7f8d #toc li > li 87eee1189dd5296d6c010a1ad329fc53c6099d72 li a:not([rel]) +055f3c98e9160beb13f72f1009ad66b6252a9bba link, a +055f3c98e9160beb13f72f1009ad66b6252a9bba link ,a +055f3c98e9160beb13f72f1009ad66b6252a9bba link , a +0d1f66765d1632c70f8608947890524e78459362 link , a sup +b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup) +0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup