1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-15 02:00:55 +00:00

json{} displayer added

This commit is contained in:
ericchiang 2014-10-11 12:58:29 -04:00
parent dd9e318ca4
commit dfe4a38973
4 changed files with 223 additions and 79 deletions

View File

@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors:
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
```
Even better, let's grab the titles too:
```bash
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}'
```
## Basic Usage
```bash
$ cat index.html | pup [selectors and flags]
$ cat index.html | pup [flags] [selectors] [optional display function]
```
or
```bash
$ pup < index.html [selectors and flags]
$ pup < index.html [flags] [selectors] [optional display function]
```
## Examples
@ -185,7 +191,7 @@ You can mix and match selectors as you wish.
cat index.html | pup element#id[attribute=value]
```
## Functions
## Display Functions
Non-HTML selectors which affect the output type are implemented as functions
which can be provided as a final argument.
@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head
//en.wikivoyage.org/wiki/
```
#### `json{}`
Print HTML as JSON.
```bash
$ cat robots.html | pup div#p-namespaces a
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
Article
</a>
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t">
Talk
</a>
```
```bash
$ cat robots.html | pup div#p-namespaces a json{}
[
{
"attrs": {
"accesskey": "c",
"href": "/wiki/Robots_exclusion_standard",
"title": "View the content page [c]"
},
"tag": "a",
"text": "Article"
},
{
"attrs": {
"accesskey": "t",
"href": "/wiki/Talk:Robots_exclusion_standard",
"title": "Discussion about the content page [t]"
},
"tag": "a",
"text": "Talk"
}
]
```
Use the `-i` / `--indent` flag to control the indent level.
```bash
$ cat robots.html | pup --indent 4 div#p-namespaces a json{}
[
{
"attrs": {
"accesskey": "c",
"href": "/wiki/Robots_exclusion_standard",
"title": "View the content page [c]"
},
"tag": "a",
"text": "Article"
},
{
"attrs": {
"accesskey": "t",
"href": "/wiki/Talk:Robots_exclusion_standard",
"title": "Discussion about the content page [t]"
},
"tag": "a",
"text": "Talk"
}
]
```
If the selectors only return one element the results will be printed as a JSON
object, not a list.
```bash
$ cat robots.html | pup --indent 4 title json{}
{
"tag": "title",
"text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
}
```
Because there is no universal standard for converting HTML/XML to JSON, a
method has been chosen which hopefully fits. The goal is simply to get the
output of pup into a more consumable format.
## Flags
```bash
@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head
--version display version
```
## TODO:
## TODO
* Print as json function `json{}`
Add more tests!

130
display.go Normal file
View File

@ -0,0 +1,130 @@
package main
import (
"encoding/json"
"fmt"
"regexp"
"strings"
"code.google.com/p/go.net/html"
)
// Displayer renders a set of selected HTML nodes. The implementations in this
// file (TextDisplayer, AttrDisplayer, JSONDisplayer) all print to standard
// output.
type Displayer interface {
// Display outputs the given nodes in the implementation's format.
Display(nodes []*html.Node)
}
// TextDisplayer prints the text found inside the selected nodes.
type TextDisplayer struct{}

// Display walks each node's subtree depth-first and prints the data of every
// text node it meets on its own line.
func (t TextDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		if n.Type == html.TextNode {
			fmt.Println(n.Data)
		}
		// Gather the direct children and recurse so that nested text is
		// printed in document order.
		var kids []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			kids = append(kids, c)
		}
		t.Display(kids)
	}
}
// AttrDisplayer prints the value of a single named attribute.
type AttrDisplayer struct {
	// Attr is the attribute key whose values are printed.
	Attr string
}

// Display prints, one per line, the HTML-escaped value of every attribute on
// the given nodes whose key equals a.Attr. Nodes lacking that attribute
// produce no output.
func (a AttrDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		for _, candidate := range n.Attr {
			if candidate.Key != a.Attr {
				continue
			}
			fmt.Println(html.EscapeString(candidate.Val))
		}
	}
}
// JSONDisplayer prints the selected nodes as JSON. It carries no state.
type JSONDisplayer struct{}
// jsonify converts an HTML node into a map that encoding/json can marshal.
//
// The returned map holds:
//   - "attrs": attribute key to HTML-escaped value; present only when the
//     node has at least one attribute.
//   - "tag": node.DataAtom.String().
//   - "text": the whitespace-trimmed data of the node's direct text children,
//     joined with single spaces; omitted when there is no non-blank text.
//   - "children": the jsonified direct element children in document order;
//     omitted when the node has no element children.
//
// Previously the element children were collected but never stored, so nested
// elements were silently dropped from the output.
func jsonify(node *html.Node) map[string]interface{} {
	vals := map[string]interface{}{}
	if len(node.Attr) > 0 {
		attrs := make(map[string]string, len(node.Attr))
		for _, attr := range node.Attr {
			attrs[attr.Key] = html.EscapeString(attr.Val)
		}
		vals["attrs"] = attrs
	}
	vals["tag"] = node.DataAtom.String()

	var (
		children []interface{}
		texts    []string
	)
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		switch child.Type {
		case html.ElementNode:
			children = append(children, jsonify(child))
		case html.TextNode:
			// Blank (whitespace-only) text nodes carry no information.
			if text := strings.TrimSpace(child.Data); text != "" {
				texts = append(texts, text)
			}
		}
	}
	if len(texts) > 0 {
		vals["text"] = strings.Join(texts, " ")
	}
	if len(children) > 0 {
		vals["children"] = children
	}
	return vals
}
// Display marshals the nodes to indented JSON (using indentString as the
// per-level indent) and prints the result followed by a newline.
//
// Exactly one node is printed as a single JSON object; any other count,
// including zero, is printed as a JSON list. It panics if marshalling fails.
func (j JSONDisplayer) Display(nodes []*html.Node) {
	var payload interface{}
	if len(nodes) == 1 {
		payload = jsonify(nodes[0])
	} else {
		// Start from a non-nil slice so that zero nodes encode as [] and
		// not null.
		list := []map[string]interface{}{}
		for _, n := range nodes {
			list = append(list, jsonify(n))
		}
		payload = list
	}

	out, err := json.MarshalIndent(payload, "", indentString)
	if err != nil {
		panic("Could not jsonify nodes")
	}
	fmt.Printf("%s\n", out)
}
// Display function helpers.
var (
	// displayMatcher matches any argument that ends in a {...} group.
	displayMatcher = regexp.MustCompile(`\{[^\}]*\}$`)
	// textFuncMatcher matches exactly "text{}".
	textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
	// attrFuncMatcher matches "attr{...}" and captures the text between the
	// braces.
	attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)
	// jsonFuncMatcher matches "json{...}" and captures the text between the
	// braces.
	jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`)
)
// NewDisplayFunc interprets text as a display function and returns the
// matching Displayer:
//
//	text{}      -> TextDisplayer
//	attr{name}  -> AttrDisplayer for attribute "name"
//	json{...}   -> JSONDisplayer (anything between the braces is ignored)
//
// A non-nil error is returned when text is not a recognised display function.
func NewDisplayFunc(text string) (Displayer, error) {
	if !displayMatcher.MatchString(text) {
		return nil, fmt.Errorf("Not a display function")
	}
	if textFuncMatcher.MatchString(text) {
		return TextDisplayer{}, nil
	}
	// A non-nil result always holds the full match plus the single capture
	// group, so index 1 is safe. This replaces the former unreachable length
	// check, which returned an error with an empty message.
	if matches := attrFuncMatcher.FindStringSubmatch(text); matches != nil {
		return AttrDisplayer{Attr: matches[1]}, nil
	}
	if jsonFuncMatcher.MatchString(text) {
		return JSONDisplayer{}, nil
	}
	return nil, fmt.Errorf("Not a display function")
}

View File

@ -1,70 +0,0 @@
package funcs
import (
"code.google.com/p/go.net/html"
"fmt"
"regexp"
)
// Displayer renders a set of selected HTML nodes. The implementations in this
// file (TextDisplayer, AttrDisplayer) print to standard output.
type Displayer interface {
// Display outputs the given nodes in the implementation's format.
Display(nodes []*html.Node)
}
// TextDisplayer prints the text found inside the selected nodes.
type TextDisplayer struct{}

// Display prints the data of every text node in each node's subtree, one per
// line, visiting the tree depth-first.
func (t TextDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		if n.Type == html.TextNode {
			fmt.Println(n.Data)
		}
		// Collect the immediate children before recursing into them.
		var kids []*html.Node
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			kids = append(kids, c)
		}
		t.Display(kids)
	}
}
// AttrDisplayer prints the value of a single named attribute.
type AttrDisplayer struct {
	// Attr is the attribute key whose values are printed.
	Attr string
}

// Display prints the HTML-escaped value of each attribute whose key equals
// a.Attr, one value per line, for every node given.
func (a AttrDisplayer) Display(nodes []*html.Node) {
	for _, n := range nodes {
		for _, candidate := range n.Attr {
			if candidate.Key != a.Attr {
				continue
			}
			fmt.Println(html.EscapeString(candidate.Val))
		}
	}
}
// Display function helpers.
var (
	// displayMatcher matches any argument that ends in a {...} group.
	displayMatcher = regexp.MustCompile(`\{[^\}]*\}$`)
	// textFuncMatcher matches exactly "text{}".
	textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
	// attrFuncMatcher matches "attr{...}" and captures the text between the
	// braces.
	attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)
)
// NewDisplayFunc interprets text as a display function and returns the
// matching Displayer:
//
//	text{}      -> TextDisplayer
//	attr{name}  -> AttrDisplayer for attribute "name"
//
// A non-nil error is returned when text is not a recognised display function.
func NewDisplayFunc(text string) (Displayer, error) {
	if !displayMatcher.MatchString(text) {
		return nil, fmt.Errorf("Not a display function")
	}
	if textFuncMatcher.MatchString(text) {
		return TextDisplayer{}, nil
	}
	// A non-nil result always holds the full match plus the single capture
	// group, so index 1 is safe. This replaces the former unreachable length
	// check, which returned an error with an empty message.
	if matches := attrFuncMatcher.FindStringSubmatch(text); matches != nil {
		return AttrDisplayer{Attr: matches[1]}, nil
	}
	return nil, fmt.Errorf("Not a display function")
}

View File

@ -4,7 +4,6 @@ import (
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/charset"
"fmt"
"github.com/ericchiang/pup/funcs"
"github.com/ericchiang/pup/selector"
"io"
"os"
@ -12,7 +11,7 @@ import (
"strings"
)
const VERSION string = "0.3.0"
const VERSION string = "0.3.1"
var (
// Flags
@ -22,7 +21,7 @@ var (
maxPrintLevel int = -1
printNumber bool = false
printColor bool = false
displayer funcs.Displayer = nil
displayer Displayer = nil
)
// Print to stderr and exit
@ -177,7 +176,7 @@ func main() {
// if this is the last element, check for a function like
// text{} or attr{}
if i+1 == len(cmds) {
d, err := funcs.NewDisplayFunc(cmd)
d, err := NewDisplayFunc(cmd)
if err == nil {
displayer = d
selectors = selectors[0 : len(cmds)-1]