From 097e035473a3547693df7234508a7c2f12804f8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?=
Date: Thu, 20 Feb 2025 18:14:49 +0300
Subject: [PATCH] add :matches selector

---
 README.md                 | 11 ++++++++
 selector.go               | 54 +++++++++++++++++++++++++++++++++++++++
 tests/cmds.txt            |  2 ++
 tests/expected_output.txt |  6 +++--
 tests/run.py              |  6 ++---
 5 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index a24ac88..7ab1558 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,16 @@ $ cat robots.html | pup ':contains("History")'
 
 ```
 
+```bash
+$ cat robots.html | pup ':matches("Histor*")'
+
+ History
+
+
+ History
+
+```
+
 ```bash
 $ cat robots.html | pup ':parent-of([action="edit"])'
 
@@ -211,6 +221,7 @@ pup ':last-of-type'
 pup ':only-child'
 pup ':only-of-type'
 pup ':contains("text")'
+pup ':matches("pattern")'
 pup ':nth-child(n)'
 pup ':nth-of-type(n)'
 pup ':nth-last-child(n)'
diff --git a/selector.go b/selector.go
index 6249f77..5b6a05d 100644
--- a/selector.go
+++ b/selector.go
@@ -371,6 +371,11 @@ func ParsePseudo(selector *CSSSelector, s scanner.Scanner) error {
 		if err != nil {
 			return err
 		}
+	case strings.HasPrefix(cmd, "matches("):
+		selector.Pseudo, err = parseMatchesPseudo(cmd[len("matches("):])
+		if err != nil {
+			return err
+		}
 	case strings.HasPrefix(cmd, "nth-child("),
 		strings.HasPrefix(cmd, "nth-last-child("),
 		strings.HasPrefix(cmd, "nth-last-of-type("),
@@ -592,6 +597,55 @@ func parseContainsPseudo(cmd string) (PseudoClass, error) {
 	}
 }
 
+// parseMatchesPseudo parses the argument of a :matches("pattern") selector;
+// cmd is the text after `matches(`. The pattern is a Go regexp tested against
+// each direct text child. `\"` is a literal quote; other escapes are kept.
+func parseMatchesPseudo(cmd string) (PseudoClass, error) {
+	s := new(scanner.Scanner).Init(strings.NewReader(cmd))
+	if s.Next() != '"' {
+		return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+	}
+	var pattern bytes.Buffer
+	for {
+		switch r := s.Next(); r {
+		case '"':
+			// ')' then EOF must follow '"'
+			if s.Next() != ')' {
+				return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+			}
+			if s.Next() != scanner.EOF {
+				return nil, fmt.Errorf("'matches(\"\")' must end selector")
+			}
+			p, err := regexp.Compile(pattern.String())
+			if err != nil {
+				return nil, err
+			}
+			matches := func(node *html.Node) bool {
+				for c := node.FirstChild; c != nil; c = c.NextSibling {
+					if c.Type == html.TextNode && p.MatchString(c.Data) {
+						return true
+					}
+				}
+				return false
+			}
+			return matches, nil
+		case '\\':
+			// Keep the backslash unless it escapes a quote, so regexp escapes
+			// like `\d` survive; a trailing backslash hits EOF on the next pass.
+			if s.Peek() != '"' {
+				pattern.WriteRune(r)
+			}
+			if next := s.Next(); next != scanner.EOF {
+				pattern.WriteRune(next)
+			}
+		case scanner.EOF:
+			return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+		default:
+			pattern.WriteRune(r)
+		}
+	}
+}
+
 // Parse a :not(selector) selector
 // expects the input to be everything after the open parenthesis
 // e.g. for `not(div#id)` the argument would be `div#id)`
diff --git a/tests/cmds.txt b/tests/cmds.txt
index beca6c2..50da070 100644
--- a/tests/cmds.txt
+++ b/tests/cmds.txt
@@ -47,3 +47,5 @@ link , a:parent-of(sup)
 link , a:parent-of(sup) sup
 li --number
 li -n
+p:contains("Rob")
+p:matches("Ro*")
diff --git a/tests/expected_output.txt b/tests/expected_output.txt
index 7f06b47..43408f8 100644
--- a/tests/expected_output.txt
+++ b/tests/expected_output.txt
@@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
-ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{}
+199188dc8f1522426a628e41d96264bffb8beb0f json{}
 95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
 e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
 da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
@@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
-97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{}
+cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li
@@ -47,3 +47,5 @@ b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup)
 0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
 da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number
 da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n
+4c15ca8f190a4412469e487fab6f7ad2479f922f p:contains("Rob")
+da39a3ee5e6b4b0d3255bfef95601890afd80709 p:matches("Ro*")
diff --git a/tests/run.py b/tests/run.py
index 67a13e0..ca662a7 100755
--- a/tests/run.py
+++ b/tests/run.py
@@ -2,13 +2,13 @@
 from __future__ import print_function
 
 from hashlib import sha1
-from subprocess import Popen, PIPE, STDOUT
+from subprocess import Popen, PIPE
 
-data = open("index.html", "r").read()
+data = open("index.html", "rb").read()
 
 for line in open("cmds.txt", "r"):
     line = line.strip()
-    p = Popen(['pup', line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
+    p = Popen(["pup", line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
     h = sha1()
     h.update(p.communicate(input=data)[0])
     print("%s %s" % (h.hexdigest(), line))