mirror of
https://github.com/ericchiang/pup
synced 2024-11-24 08:58:08 +00:00
commit
571adeb841
95
README.md
95
README.md
@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors:
|
|||||||
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
|
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Even better, let's grab the titles too:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}'
|
||||||
|
```
|
||||||
|
|
||||||
## Basic Usage
|
## Basic Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ cat index.html | pup [selectors and flags]
|
$ cat index.html | pup [flags] [selectors] [optional display function]
|
||||||
```
|
```
|
||||||
|
|
||||||
or
|
or
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ pup < index.html [selectors and flags]
|
$ pup < index.html [flags] [selectors] [optional display function]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
@ -185,7 +191,7 @@ You can mix and match selectors as you wish.
|
|||||||
cat index.html | pup element#id[attribute=value]
|
cat index.html | pup element#id[attribute=value]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Functions
|
## Display Functions
|
||||||
|
|
||||||
Non-HTML selectors which affect the output type are implemented as functions
|
Non-HTML selectors which affect the output type are implemented as functions
|
||||||
which can be provided as a final argument.
|
which can be provided as a final argument.
|
||||||
@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head
|
|||||||
//en.wikivoyage.org/wiki/
|
//en.wikivoyage.org/wiki/
|
||||||
```
|
```
|
||||||
|
|
||||||
|
#### `json{}`
|
||||||
|
|
||||||
|
Print HTML as JSON.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup div#p-namespaces a
|
||||||
|
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
|
||||||
|
Article
|
||||||
|
</a>
|
||||||
|
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t">
|
||||||
|
Talk
|
||||||
|
</a>
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup div#p-namespaces a json{}
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"attrs": {
|
||||||
|
"accesskey": "c",
|
||||||
|
"href": "/wiki/Robots_exclusion_standard",
|
||||||
|
"title": "View the content page [c]"
|
||||||
|
},
|
||||||
|
"tag": "a",
|
||||||
|
"text": "Article"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attrs": {
|
||||||
|
"accesskey": "t",
|
||||||
|
"href": "/wiki/Talk:Robots_exclusion_standard",
|
||||||
|
"title": "Discussion about the content page [t]"
|
||||||
|
},
|
||||||
|
"tag": "a",
|
||||||
|
"text": "Talk"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
Use the `-i` / `--indent` flag to control the indent level.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup --indent 4 div#p-namespaces a json{}
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"attrs": {
|
||||||
|
"accesskey": "c",
|
||||||
|
"href": "/wiki/Robots_exclusion_standard",
|
||||||
|
"title": "View the content page [c]"
|
||||||
|
},
|
||||||
|
"tag": "a",
|
||||||
|
"text": "Article"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"attrs": {
|
||||||
|
"accesskey": "t",
|
||||||
|
"href": "/wiki/Talk:Robots_exclusion_standard",
|
||||||
|
"title": "Discussion about the content page [t]"
|
||||||
|
},
|
||||||
|
"tag": "a",
|
||||||
|
"text": "Talk"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
If the selectors only return one element the results will be printed as a JSON
|
||||||
|
object, not a list.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup --indent 4 title json{}
|
||||||
|
{
|
||||||
|
"tag": "title",
|
||||||
|
"text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Because there is no universal standard for converting HTML/XML to JSON, a
|
||||||
|
method has been chosen which hopefully fits. The goal is simply to get the
|
||||||
|
output of pup into a more consumable format.
|
||||||
|
|
||||||
## Flags
|
## Flags
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head
|
|||||||
--version display version
|
--version display version
|
||||||
```
|
```
|
||||||
|
|
||||||
## TODO:
|
## TODO
|
||||||
|
|
||||||
* Print as json function `json{}`
|
Add more tests!
|
||||||
|
130
display.go
Normal file
130
display.go
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"code.google.com/p/go.net/html"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Displayer interface {
|
||||||
|
Display(nodes []*html.Node)
|
||||||
|
}
|
||||||
|
|
||||||
|
type TextDisplayer struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t TextDisplayer) Display(nodes []*html.Node) {
|
||||||
|
for _, node := range nodes {
|
||||||
|
if node.Type == html.TextNode {
|
||||||
|
fmt.Println(node.Data)
|
||||||
|
}
|
||||||
|
children := []*html.Node{}
|
||||||
|
child := node.FirstChild
|
||||||
|
for child != nil {
|
||||||
|
children = append(children, child)
|
||||||
|
child = child.NextSibling
|
||||||
|
}
|
||||||
|
t.Display(children)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type AttrDisplayer struct {
|
||||||
|
Attr string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a AttrDisplayer) Display(nodes []*html.Node) {
|
||||||
|
for _, node := range nodes {
|
||||||
|
attributes := node.Attr
|
||||||
|
for _, attr := range attributes {
|
||||||
|
if attr.Key == a.Attr {
|
||||||
|
val := html.EscapeString(attr.Val)
|
||||||
|
fmt.Printf("%s\n", val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type JSONDisplayer struct {
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns a jsonifiable struct
|
||||||
|
func jsonify(node *html.Node) map[string]interface{} {
|
||||||
|
vals := map[string]interface{}{}
|
||||||
|
if len(node.Attr) > 0 {
|
||||||
|
attrs := map[string]string{}
|
||||||
|
for _, attr := range node.Attr {
|
||||||
|
attrs[attr.Key] = html.EscapeString(attr.Val)
|
||||||
|
}
|
||||||
|
vals["attrs"] = attrs
|
||||||
|
}
|
||||||
|
vals["tag"] = node.DataAtom.String()
|
||||||
|
children := []interface{}{}
|
||||||
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
||||||
|
switch child.Type {
|
||||||
|
case html.ElementNode:
|
||||||
|
children = append(children, jsonify(child))
|
||||||
|
case html.TextNode:
|
||||||
|
text := strings.TrimSpace(child.Data)
|
||||||
|
if text != "" {
|
||||||
|
// if there is already text we'll append it
|
||||||
|
currText, ok := vals["text"]
|
||||||
|
if ok {
|
||||||
|
text = fmt.Sprintf("%s %s", currText, text)
|
||||||
|
}
|
||||||
|
vals["text"] = text
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return vals
|
||||||
|
}
|
||||||
|
|
||||||
|
func (j JSONDisplayer) Display(nodes []*html.Node) {
|
||||||
|
var data []byte
|
||||||
|
var err error
|
||||||
|
switch len(nodes) {
|
||||||
|
case 1:
|
||||||
|
jsonNode := jsonify(nodes[0])
|
||||||
|
data, err = json.MarshalIndent(&jsonNode, "", indentString)
|
||||||
|
default:
|
||||||
|
jsonNodes := []map[string]interface{}{}
|
||||||
|
for _, node := range nodes {
|
||||||
|
jsonNodes = append(jsonNodes, jsonify(node))
|
||||||
|
}
|
||||||
|
data, err = json.MarshalIndent(&jsonNodes, "", indentString)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
panic("Could not jsonify nodes")
|
||||||
|
}
|
||||||
|
fmt.Printf("%s\n", data)
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
// Display function helpers
|
||||||
|
displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`)
|
||||||
|
textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
|
||||||
|
attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)
|
||||||
|
jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`)
|
||||||
|
)
|
||||||
|
|
||||||
|
func NewDisplayFunc(text string) (Displayer, error) {
|
||||||
|
if !displayMatcher.MatchString(text) {
|
||||||
|
return nil, fmt.Errorf("Not a display function")
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case textFuncMatcher.MatchString(text):
|
||||||
|
return TextDisplayer{}, nil
|
||||||
|
case attrFuncMatcher.MatchString(text):
|
||||||
|
matches := attrFuncMatcher.FindStringSubmatch(text)
|
||||||
|
if len(matches) != 2 {
|
||||||
|
return nil, fmt.Errorf("")
|
||||||
|
} else {
|
||||||
|
return AttrDisplayer{matches[1]}, nil
|
||||||
|
}
|
||||||
|
case jsonFuncMatcher.MatchString(text):
|
||||||
|
return JSONDisplayer{}, nil
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("Not a display function")
|
||||||
|
}
|
@ -1,70 +0,0 @@
|
|||||||
package funcs
|
|
||||||
|
|
||||||
import (
|
|
||||||
"code.google.com/p/go.net/html"
|
|
||||||
"fmt"
|
|
||||||
"regexp"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Displayer interface {
|
|
||||||
Display(nodes []*html.Node)
|
|
||||||
}
|
|
||||||
|
|
||||||
type TextDisplayer struct {
|
|
||||||
}
|
|
||||||
|
|
||||||
func (t TextDisplayer) Display(nodes []*html.Node) {
|
|
||||||
for _, node := range nodes {
|
|
||||||
if node.Type == html.TextNode {
|
|
||||||
fmt.Println(node.Data)
|
|
||||||
}
|
|
||||||
children := []*html.Node{}
|
|
||||||
child := node.FirstChild
|
|
||||||
for child != nil {
|
|
||||||
children = append(children, child)
|
|
||||||
child = child.NextSibling
|
|
||||||
}
|
|
||||||
t.Display(children)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
type AttrDisplayer struct {
|
|
||||||
Attr string
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a AttrDisplayer) Display(nodes []*html.Node) {
|
|
||||||
for _, node := range nodes {
|
|
||||||
attributes := node.Attr
|
|
||||||
for _, attr := range attributes {
|
|
||||||
if attr.Key == a.Attr {
|
|
||||||
val := html.EscapeString(attr.Val)
|
|
||||||
fmt.Printf("%s\n", val)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
// Display function helpers
|
|
||||||
displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`)
|
|
||||||
textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
|
|
||||||
attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)
|
|
||||||
)
|
|
||||||
|
|
||||||
func NewDisplayFunc(text string) (Displayer, error) {
|
|
||||||
if !displayMatcher.MatchString(text) {
|
|
||||||
return nil, fmt.Errorf("Not a display function")
|
|
||||||
}
|
|
||||||
switch {
|
|
||||||
case textFuncMatcher.MatchString(text):
|
|
||||||
return TextDisplayer{}, nil
|
|
||||||
case attrFuncMatcher.MatchString(text):
|
|
||||||
matches := attrFuncMatcher.FindStringSubmatch(text)
|
|
||||||
if len(matches) != 2 {
|
|
||||||
return nil, fmt.Errorf("")
|
|
||||||
} else {
|
|
||||||
return AttrDisplayer{matches[1]}, nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil, fmt.Errorf("Not a display function")
|
|
||||||
}
|
|
7
main.go
7
main.go
@ -4,7 +4,6 @@ import (
|
|||||||
"code.google.com/p/go.net/html"
|
"code.google.com/p/go.net/html"
|
||||||
"code.google.com/p/go.net/html/charset"
|
"code.google.com/p/go.net/html/charset"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/ericchiang/pup/funcs"
|
|
||||||
"github.com/ericchiang/pup/selector"
|
"github.com/ericchiang/pup/selector"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
@ -12,7 +11,7 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const VERSION string = "0.3.0"
|
const VERSION string = "0.3.1"
|
||||||
|
|
||||||
var (
|
var (
|
||||||
// Flags
|
// Flags
|
||||||
@ -22,7 +21,7 @@ var (
|
|||||||
maxPrintLevel int = -1
|
maxPrintLevel int = -1
|
||||||
printNumber bool = false
|
printNumber bool = false
|
||||||
printColor bool = false
|
printColor bool = false
|
||||||
displayer funcs.Displayer = nil
|
displayer Displayer = nil
|
||||||
)
|
)
|
||||||
|
|
||||||
// Print to stderr and exit
|
// Print to stderr and exit
|
||||||
@ -177,7 +176,7 @@ func main() {
|
|||||||
// if this is the last element, check for a function like
|
// if this is the last element, check for a function like
|
||||||
// text{} or attr{}
|
// text{} or attr{}
|
||||||
if i+1 == len(cmds) {
|
if i+1 == len(cmds) {
|
||||||
d, err := funcs.NewDisplayFunc(cmd)
|
d, err := NewDisplayFunc(cmd)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
displayer = d
|
displayer = d
|
||||||
selectors = selectors[0 : len(cmds)-1]
|
selectors = selectors[0 : len(cmds)-1]
|
||||||
|
Loading…
Reference in New Issue
Block a user