1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-28 16:41:32 +00:00

Preserving sibling relationship of all node types

This commit is contained in:
Andre Marianiello 2019-02-23 15:08:25 -05:00
parent 14e452d641
commit 2bb485903c
3 changed files with 49 additions and 37 deletions

View File

@ -272,7 +272,7 @@ $ cat robots.html | pup 'div#p-namespaces a'
<a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c"> <a href="/wiki/Robots_exclusion_standard" title="View the content page [c]" accesskey="c">
Article Article
</a> </a>
<a href="/wiki/Talk:Robots_exclusion_standard" title="Discussion about the content page [t]" accesskey="t"> <a href="/wiki/Talk:Robots_exclusion_standard" rel="discussion" title="Discussion about the content page [t]" accesskey="t">
Talk Talk
</a> </a>
``` ```
@ -282,16 +282,25 @@ $ cat robots.html | pup 'div#p-namespaces a json{}'
[ [
{ {
"accesskey": "c", "accesskey": "c",
"children": [
{
"text": "Article"
}
],
"href": "/wiki/Robots_exclusion_standard", "href": "/wiki/Robots_exclusion_standard",
"tag": "a", "tag": "a",
"text": "Article",
"title": "View the content page [c]" "title": "View the content page [c]"
}, },
{ {
"accesskey": "t", "accesskey": "t",
"children": [
{
"text": "Talk"
}
],
"href": "/wiki/Talk:Robots_exclusion_standard", "href": "/wiki/Talk:Robots_exclusion_standard",
"rel": "discussion",
"tag": "a", "tag": "a",
"text": "Talk",
"title": "Discussion about the content page [t]" "title": "Discussion about the content page [t]"
} }
] ]
@ -304,16 +313,25 @@ $ cat robots.html | pup -i 4 'div#p-namespaces a json{}'
[ [
{ {
"accesskey": "c", "accesskey": "c",
"children": [
{
"text": "Article"
}
],
"href": "/wiki/Robots_exclusion_standard", "href": "/wiki/Robots_exclusion_standard",
"tag": "a", "tag": "a",
"text": "Article",
"title": "View the content page [c]" "title": "View the content page [c]"
}, },
{ {
"accesskey": "t", "accesskey": "t",
"children": [
{
"text": "Talk"
}
],
"href": "/wiki/Talk:Robots_exclusion_standard", "href": "/wiki/Talk:Robots_exclusion_standard",
"rel": "discussion",
"tag": "a", "tag": "a",
"text": "Talk",
"title": "Discussion about the content page [t]" "title": "Discussion about the content page [t]"
} }
] ]

View File

@ -272,38 +272,32 @@ func jsonify(node *html.Node) map[string]interface{} {
} }
} }
} }
vals["tag"] = node.DataAtom.String() switch node.Type {
case html.ElementNode:
vals["tag"] = node.Data
case html.TextNode:
text := strings.TrimSpace(node.Data)
if text != "" {
if pupEscapeHTML {
// don't escape javascript
if node.DataAtom != atom.Script {
text = html.EscapeString(text)
}
}
vals["text"] = text
}
case html.CommentNode:
comment := strings.TrimSpace(node.Data)
if pupEscapeHTML {
comment = html.EscapeString(comment)
}
vals["comment"] = comment
}
children := []interface{}{} children := []interface{}{}
for child := node.FirstChild; child != nil; child = child.NextSibling { for child := node.FirstChild; child != nil; child = child.NextSibling {
switch child.Type { jChild := jsonify(child)
case html.ElementNode: if len(jChild) > 0 {
children = append(children, jsonify(child)) children = append(children, jChild)
case html.TextNode:
text := strings.TrimSpace(child.Data)
if text != "" {
if pupEscapeHTML {
// don't escape javascript
if node.DataAtom != atom.Script {
text = html.EscapeString(text)
}
}
// if there is already text we'll append it
currText, ok := vals["text"]
if ok {
text = fmt.Sprintf("%s %s", currText, text)
}
vals["text"] = text
}
case html.CommentNode:
comment := strings.TrimSpace(child.Data)
if pupEscapeHTML {
comment = html.EscapeString(comment)
}
currComment, ok := vals["comment"]
if ok {
comment = fmt.Sprintf("%s %s", currComment, comment)
}
vals["comment"] = comment
} }
} }
if len(children) > 0 { if len(children) > 0 {

View File

@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"] 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{} 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{} 199188dc8f1522426a628e41d96264bffb8beb0f json{}
95ef88ded9dab22ee3206cca47b9c3a376274bda text{} 95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
da39a3ee5e6b4b0d3255bfef95601890afd80709 .after da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{} da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{} cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li