mirror of
https://github.com/ericchiang/pup
synced 2025-03-22 01:15:42 +00:00
add :matches selector
This commit is contained in:
parent
5a57cf1113
commit
097e035473
11
README.md
11
README.md
@ -137,6 +137,16 @@ $ cat robots.html | pup ':contains("History")'
|
|||||||
</span>
|
</span>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ cat robots.html | pup ':matches("Histor*")'
|
||||||
|
<span class="toctext">
|
||||||
|
History
|
||||||
|
</span>
|
||||||
|
<span class="mw-headline" id="History">
|
||||||
|
History
|
||||||
|
</span>
|
||||||
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ cat robots.html | pup ':parent-of([action="edit"])'
|
$ cat robots.html | pup ':parent-of([action="edit"])'
|
||||||
<span class="wb-langlinks-edit wb-langlinks-link">
|
<span class="wb-langlinks-edit wb-langlinks-link">
|
||||||
@ -211,6 +221,7 @@ pup ':last-of-type'
|
|||||||
pup ':only-child'
|
pup ':only-child'
|
||||||
pup ':only-of-type'
|
pup ':only-of-type'
|
||||||
pup ':contains("text")'
|
pup ':contains("text")'
|
||||||
|
pup ':matches("pattern")'
|
||||||
pup ':nth-child(n)'
|
pup ':nth-child(n)'
|
||||||
pup ':nth-of-type(n)'
|
pup ':nth-of-type(n)'
|
||||||
pup ':nth-last-child(n)'
|
pup ':nth-last-child(n)'
|
||||||
|
54
selector.go
54
selector.go
@ -371,6 +371,11 @@ func ParsePseudo(selector *CSSSelector, s scanner.Scanner) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
case strings.HasPrefix(cmd, "matches("):
|
||||||
|
selector.Pseudo, err = parseMatchesPseudo(cmd[len("matches("):])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
case strings.HasPrefix(cmd, "nth-child("),
|
case strings.HasPrefix(cmd, "nth-child("),
|
||||||
strings.HasPrefix(cmd, "nth-last-child("),
|
strings.HasPrefix(cmd, "nth-last-child("),
|
||||||
strings.HasPrefix(cmd, "nth-last-of-type("),
|
strings.HasPrefix(cmd, "nth-last-of-type("),
|
||||||
@ -592,6 +597,55 @@ func parseContainsPseudo(cmd string) (PseudoClass, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parse a :matches("") selector
|
||||||
|
// expects the input to be a valid regexp that matches text
|
||||||
|
func parseMatchesPseudo(cmd string) (PseudoClass, error) {
|
||||||
|
var s scanner.Scanner
|
||||||
|
s.Init(strings.NewReader(cmd))
|
||||||
|
switch s.Next() {
|
||||||
|
case '"':
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
|
||||||
|
}
|
||||||
|
pattern := bytes.NewBuffer([]byte{})
|
||||||
|
for {
|
||||||
|
r := s.Next()
|
||||||
|
switch r {
|
||||||
|
case '"':
|
||||||
|
// ')' then EOF must follow '"'
|
||||||
|
if s.Next() != ')' {
|
||||||
|
return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
|
||||||
|
}
|
||||||
|
if s.Next() != scanner.EOF {
|
||||||
|
return nil, fmt.Errorf("'matches(\"\")' must end selector")
|
||||||
|
}
|
||||||
|
p, err := regexp.Compile(pattern.String())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
contains := func(node *html.Node) bool {
|
||||||
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
||||||
|
if c.Type == html.TextNode {
|
||||||
|
if p.MatchString(c.Data) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return contains, nil
|
||||||
|
case '\\':
|
||||||
|
s.Next()
|
||||||
|
case scanner.EOF:
|
||||||
|
return nil, fmt.Errorf("Malformed 'contains(\"\")' selector")
|
||||||
|
default:
|
||||||
|
if _, err := pattern.WriteRune(r); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Parse a :not(selector) selector
|
// Parse a :not(selector) selector
|
||||||
// expects the input to be everything after the open parenthesis
|
// expects the input to be everything after the open parenthesis
|
||||||
// e.g. for `not(div#id)` the argument would be `div#id)`
|
// e.g. for `not(div#id)` the argument would be `div#id)`
|
||||||
|
@ -47,3 +47,5 @@ link , a:parent-of(sup)
|
|||||||
link , a:parent-of(sup) sup
|
link , a:parent-of(sup) sup
|
||||||
li --number
|
li --number
|
||||||
li -n
|
li -n
|
||||||
|
p:contains("Rob")
|
||||||
|
p:matches("Ro*")
|
||||||
|
@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
|
|||||||
66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
|
66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
|
||||||
0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
|
0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
|
||||||
0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
|
0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
|
||||||
ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{}
|
199188dc8f1522426a628e41d96264bffb8beb0f json{}
|
||||||
95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
|
95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
|
||||||
e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
|
e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
|
||||||
@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
|
|||||||
613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
|
613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
|
||||||
97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{}
|
cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li
|
||||||
@ -47,3 +47,5 @@ b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup)
|
|||||||
0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
|
0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number
|
||||||
da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n
|
||||||
|
4c15ca8f190a4412469e487fab6f7ad2479f922f p:contains("Rob")
|
||||||
|
da39a3ee5e6b4b0d3255bfef95601890afd80709 p:matches("Ro*")
|
||||||
|
@ -2,13 +2,13 @@
|
|||||||
|
|
||||||
from __future__ import print_function
|
from __future__ import print_function
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
from subprocess import Popen, PIPE, STDOUT
|
from subprocess import Popen, PIPE
|
||||||
|
|
||||||
data = open("index.html", "r").read()
|
data = open("index.html", "rb").read()
|
||||||
|
|
||||||
for line in open("cmds.txt", "r"):
|
for line in open("cmds.txt", "r"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
p = Popen(['pup', line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
|
p = Popen(["pup", line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
|
||||||
h = sha1()
|
h = sha1()
|
||||||
h.update(p.communicate(input=data)[0])
|
h.update(p.communicate(input=data)[0])
|
||||||
print("%s %s" % (h.hexdigest(), line))
|
print("%s %s" % (h.hexdigest(), line))
|
||||||
|
Loading…
Reference in New Issue
Block a user