From dfe4a389730ced080712b4f07a57b0ac41f95965 Mon Sep 17 00:00:00 2001 From: ericchiang Date: Sat, 11 Oct 2014 12:58:29 -0400 Subject: [PATCH] json{} displayer added --- README.md | 95 ++++++++++++++++++++++++++++++++-- display.go | 130 +++++++++++++++++++++++++++++++++++++++++++++++ funcs/display.go | 70 ------------------------- main.go | 7 ++- 4 files changed, 223 insertions(+), 79 deletions(-) create mode 100644 display.go delete mode 100644 funcs/display.go diff --git a/README.md b/README.md index f007552..cfdc1c1 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors: $ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}' ``` +Even better, let's grab the titles too: + +```bash +$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}' +``` + ## Basic Usage ```bash -$ cat index.html | pup [selectors and flags] +$ cat index.html | pup [flags] [selectors] [optional display function] ``` or ```bash -$ pup < index.html [selectors and flags] +$ pup < index.html [flags] [selectors] [optional display function] ``` ## Examples @@ -185,7 +191,7 @@ You can mix and match selectors as you wish. cat index.html | pup element#id[attribute=value] ``` -## Functions +## Display Functions Non-HTML selectors which effect the output type are implemented as functions which can be provided as a final argument. @@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head //en.wikivoyage.org/wiki/ ``` +#### `json{}` + +Print HTML as JSON. 
+ +```bash +$ cat robots.html | pup div#p-namespaces a + + Article + + + Talk + +``` + +```bash +$ cat robots.html | pup div#p-namespaces a json{} +[ + { + "attrs": { + "accesskey": "c", + "href": "/wiki/Robots_exclusion_standard", + "title": "View the content page [c]" + }, + "tag": "a", + "text": "Article" + }, + { + "attrs": { + "accesskey": "t", + "href": "/wiki/Talk:Robots_exclusion_standard", + "title": "Discussion about the content page [t]" + }, + "tag": "a", + "text": "Talk" + } +] +``` + +Use the `-i` / `--indent` flag to control the indent level. + +```bash +$ cat robots.html | pup --indent 4 div#p-namespaces a json{} +[ + { + "attrs": { + "accesskey": "c", + "href": "/wiki/Robots_exclusion_standard", + "title": "View the content page [c]" + }, + "tag": "a", + "text": "Article" + }, + { + "attrs": { + "accesskey": "t", + "href": "/wiki/Talk:Robots_exclusion_standard", + "title": "Discussion about the content page [t]" + }, + "tag": "a", + "text": "Talk" + } +] +``` + +If the selectors only return one element the results will be printed as a JSON +object, not a list. + +```bash +$ cat robots.html | pup --indent 4 title json{} +{ + "tag": "title", + "text": "Robots exclusion standard - Wikipedia, the free encyclopedia" +} +``` + +Because there is no universal standard for converting HTML/XML to JSON, a +method has been chosen which hopefully fits. The goal is simply to get the +output of pup into a more consumable format. + ## Flags ```bash @@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head --version display version ``` -## TODO: +## TODO -* Print as json function `json{}` +Add more tests! 
diff --git a/display.go b/display.go new file mode 100644 index 0000000..0e0d80a --- /dev/null +++ b/display.go @@ -0,0 +1,130 @@ +package main + +import ( + "encoding/json" + "fmt" + "regexp" + "strings" + + "code.google.com/p/go.net/html" +) + +type Displayer interface { + Display(nodes []*html.Node) +} + +type TextDisplayer struct { +} + +func (t TextDisplayer) Display(nodes []*html.Node) { + for _, node := range nodes { + if node.Type == html.TextNode { + fmt.Println(node.Data) + } + children := []*html.Node{} + child := node.FirstChild + for child != nil { + children = append(children, child) + child = child.NextSibling + } + t.Display(children) + } +} + +type AttrDisplayer struct { + Attr string +} + +func (a AttrDisplayer) Display(nodes []*html.Node) { + for _, node := range nodes { + attributes := node.Attr + for _, attr := range attributes { + if attr.Key == a.Attr { + val := html.EscapeString(attr.Val) + fmt.Printf("%s\n", val) + } + } + } +} + +type JSONDisplayer struct { +} + +// returns a jsonifiable struct +func jsonify(node *html.Node) map[string]interface{} { + vals := map[string]interface{}{} + if len(node.Attr) > 0 { + attrs := map[string]string{} + for _, attr := range node.Attr { + attrs[attr.Key] = html.EscapeString(attr.Val) + } + vals["attrs"] = attrs + } + vals["tag"] = node.DataAtom.String() + children := []interface{}{} + for child := node.FirstChild; child != nil; child = child.NextSibling { + switch child.Type { + case html.ElementNode: + children = append(children, jsonify(child)) + case html.TextNode: + text := strings.TrimSpace(child.Data) + if text != "" { + // if there is already text we'll append it + currText, ok := vals["text"] + if ok { + text = fmt.Sprintf("%s %s", currText, text) + } + vals["text"] = text + } + } + } + return vals +} + +func (j JSONDisplayer) Display(nodes []*html.Node) { + var data []byte + var err error + switch len(nodes) { + case 1: + jsonNode := jsonify(nodes[0]) + data, err = 
json.MarshalIndent(&jsonNode, "", indentString) + default: + jsonNodes := []map[string]interface{}{} + for _, node := range nodes { + jsonNodes = append(jsonNodes, jsonify(node)) + } + data, err = json.MarshalIndent(&jsonNodes, "", indentString) + } + if err != nil { + panic("Could not jsonify nodes") + } + fmt.Printf("%s\n", data) +} + +var ( + // Display function helpers + displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`) + textFuncMatcher = regexp.MustCompile(`^text\{\}$`) + attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`) + jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`) +) + +func NewDisplayFunc(text string) (Displayer, error) { + if !displayMatcher.MatchString(text) { + return nil, fmt.Errorf("Not a display function") + } + switch { + case textFuncMatcher.MatchString(text): + return TextDisplayer{}, nil + case attrFuncMatcher.MatchString(text): + matches := attrFuncMatcher.FindStringSubmatch(text) + if len(matches) != 2 { + return nil, fmt.Errorf("") + } else { + return AttrDisplayer{matches[1]}, nil + } + case jsonFuncMatcher.MatchString(text): + return JSONDisplayer{}, nil + } + return nil, fmt.Errorf("Not a display function") +} diff --git a/funcs/display.go b/funcs/display.go deleted file mode 100644 index f168425..0000000 --- a/funcs/display.go +++ /dev/null @@ -1,70 +0,0 @@ -package funcs - -import ( - "code.google.com/p/go.net/html" - "fmt" - "regexp" -) - -type Displayer interface { - Display(nodes []*html.Node) -} - -type TextDisplayer struct { -} - -func (t TextDisplayer) Display(nodes []*html.Node) { - for _, node := range nodes { - if node.Type == html.TextNode { - fmt.Println(node.Data) - } - children := []*html.Node{} - child := node.FirstChild - for child != nil { - children = append(children, child) - child = child.NextSibling - } - t.Display(children) - } -} - -type AttrDisplayer struct { - Attr string -} - -func (a AttrDisplayer) Display(nodes []*html.Node) { - for _, node := range nodes { - attributes := 
node.Attr - for _, attr := range attributes { - if attr.Key == a.Attr { - val := html.EscapeString(attr.Val) - fmt.Printf("%s\n", val) - } - } - } -} - -var ( - // Display function helpers - displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`) - textFuncMatcher = regexp.MustCompile(`^text\{\}$`) - attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`) -) - -func NewDisplayFunc(text string) (Displayer, error) { - if !displayMatcher.MatchString(text) { - return nil, fmt.Errorf("Not a display function") - } - switch { - case textFuncMatcher.MatchString(text): - return TextDisplayer{}, nil - case attrFuncMatcher.MatchString(text): - matches := attrFuncMatcher.FindStringSubmatch(text) - if len(matches) != 2 { - return nil, fmt.Errorf("") - } else { - return AttrDisplayer{matches[1]}, nil - } - } - return nil, fmt.Errorf("Not a display function") -} diff --git a/main.go b/main.go index c37365c..f9a4769 100644 --- a/main.go +++ b/main.go @@ -4,7 +4,6 @@ import ( "code.google.com/p/go.net/html" "code.google.com/p/go.net/html/charset" "fmt" - "github.com/ericchiang/pup/funcs" "github.com/ericchiang/pup/selector" "io" "os" @@ -12,7 +11,7 @@ import ( "strings" ) -const VERSION string = "0.3.0" +const VERSION string = "0.3.1" var ( // Flags @@ -22,7 +21,7 @@ var ( maxPrintLevel int = -1 printNumber bool = false printColor bool = false - displayer funcs.Displayer = nil + displayer Displayer = nil ) // Print to stderr and exit @@ -177,7 +176,7 @@ func main() { // if this is the last element, check for a function like // text{} or attr{} if i+1 == len(cmds) { - d, err := funcs.NewDisplayFunc(cmd) + d, err := NewDisplayFunc(cmd) if err == nil { displayer = d selectors = selectors[0 : len(cmds)-1]