pull/111/merge
andremarianiello 2 years ago committed by GitHub
commit dd97440993
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -272,7 +272,7 @@ $ cat robots.html | pup 'div#p-namespaces a'
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c"> <a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
Article Article
</a> </a>
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t"> <a href="/wiki/Talk:Robots_exclusion_standard" rel="discussion" title="Discussion about the content page [t]" accesskey="t">
Talk Talk
</a> </a>
``` ```
@ -282,16 +282,25 @@ $ cat robots.html | pup 'div#p-namespaces a json{}'
[ [
{ {
"accesskey": "c", "accesskey": "c",
"children": [
{
"text": "Article"
}
],
"href": "/wiki/Robots_exclusion_standard", "href": "/wiki/Robots_exclusion_standard",
"tag": "a", "tag": "a",
"text": "Article",
"title": "View the content page [c]" "title": "View the content page [c]"
}, },
{ {
"accesskey": "t", "accesskey": "t",
"children": [
{
"text": "Talk"
}
],
"href": "/wiki/Talk:Robots_exclusion_standard", "href": "/wiki/Talk:Robots_exclusion_standard",
"rel": "discussion",
"tag": "a", "tag": "a",
"text": "Talk",
"title": "Discussion about the content page [t]" "title": "Discussion about the content page [t]"
} }
] ]
@ -304,32 +313,30 @@ $ cat robots.html | pup -i 4 'div#p-namespaces a json{}'
[ [
{ {
"accesskey": "c", "accesskey": "c",
"children": [
{
"text": "Article"
}
],
"href": "/wiki/Robots_exclusion_standard", "href": "/wiki/Robots_exclusion_standard",
"tag": "a", "tag": "a",
"text": "Article",
"title": "View the content page [c]" "title": "View the content page [c]"
}, },
{ {
"accesskey": "t", "accesskey": "t",
"children": [
{
"text": "Talk"
}
],
"href": "/wiki/Talk:Robots_exclusion_standard", "href": "/wiki/Talk:Robots_exclusion_standard",
"rel": "discussion",
"tag": "a", "tag": "a",
"text": "Talk",
"title": "Discussion about the content page [t]" "title": "Discussion about the content page [t]"
} }
] ]
``` ```
If the selectors only return one element, the results will be printed as a JSON
object, not a list.
```bash
$ cat robots.html | pup --indent 4 'title json{}'
{
"tag": "title",
"text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
}
```
Because there is no universal standard for converting HTML/XML to JSON, a
method has been chosen that hopefully fits. The goal is simply to get the
output of pup into a more consumable format.

@ -272,38 +272,32 @@ func jsonify(node *html.Node) map[string]interface{} {
} }
} }
} }
vals["tag"] = node.DataAtom.String() switch node.Type {
children := []interface{}{} case html.ElementNode:
for child := node.FirstChild; child != nil; child = child.NextSibling { vals["tag"] = node.Data
switch child.Type { case html.TextNode:
case html.ElementNode: text := node.Data
children = append(children, jsonify(child)) if text != "" {
case html.TextNode:
text := strings.TrimSpace(child.Data)
if text != "" {
if pupEscapeHTML {
// don't escape javascript
if node.DataAtom != atom.Script {
text = html.EscapeString(text)
}
}
// if there is already text we'll append it
currText, ok := vals["text"]
if ok {
text = fmt.Sprintf("%s %s", currText, text)
}
vals["text"] = text
}
case html.CommentNode:
comment := strings.TrimSpace(child.Data)
if pupEscapeHTML { if pupEscapeHTML {
comment = html.EscapeString(comment) // don't escape javascript
} if node.DataAtom != atom.Script {
currComment, ok := vals["comment"] text = html.EscapeString(text)
if ok { }
comment = fmt.Sprintf("%s %s", currComment, comment)
} }
vals["comment"] = comment vals["text"] = text
}
case html.CommentNode:
comment := strings.TrimSpace(node.Data)
if pupEscapeHTML {
comment = html.EscapeString(comment)
}
vals["comment"] = comment
}
children := []interface{}{}
for child := node.FirstChild; child != nil; child = child.NextSibling {
jChild := jsonify(child)
if len(jChild) > 0 {
children = append(children, jChild)
} }
} }
if len(children) > 0 { if len(children) > 0 {

@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"] 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{} 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{} 199188dc8f1522426a628e41d96264bffb8beb0f json{}
95ef88ded9dab22ee3206cca47b9c3a376274bda text{} 95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
da39a3ee5e6b4b0d3255bfef95601890afd80709 .after da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{} da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{} cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li

Loading…
Cancel
Save