From 097e035473a3547693df7234508a7c2f12804f8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?= <ege.gunes@percona.com>
Date: Thu, 20 Feb 2025 18:14:49 +0300
Subject: [PATCH] add :matches selector

---
 README.md                 | 11 ++++++++
 selector.go               | 54 +++++++++++++++++++++++++++++++++++++++
 tests/cmds.txt            |  2 ++
 tests/expected_output.txt |  6 +++--
 tests/run.py              |  6 ++---
 5 files changed, 74 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index a24ac88..7ab1558 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,16 @@ $ cat robots.html | pup ':contains("History")'
 </span>
 ```
 
+```bash
+$ cat robots.html | pup ':matches("Histor.*")'
+<span class="toctext">
+ History
+</span>
+<span class="mw-headline" id="History">
+ History
+</span>
+```
+
 ```bash
 $ cat robots.html | pup ':parent-of([action="edit"])'
 <span class="wb-langlinks-edit wb-langlinks-link">
@@ -211,6 +221,7 @@ pup ':last-of-type'
 pup ':only-child'
 pup ':only-of-type'
 pup ':contains("text")'
+pup ':matches("pattern")'
 pup ':nth-child(n)'
 pup ':nth-of-type(n)'
 pup ':nth-last-child(n)'
diff --git a/selector.go b/selector.go
index 6249f77..5b6a05d 100644
--- a/selector.go
+++ b/selector.go
@@ -371,6 +371,11 @@ func ParsePseudo(selector *CSSSelector, s scanner.Scanner) error {
 		if err != nil {
 			return err
 		}
+	case strings.HasPrefix(cmd, "matches("):
+		selector.Pseudo, err = parseMatchesPseudo(cmd[len("matches("):])
+		if err != nil {
+			return err
+		}
 	case strings.HasPrefix(cmd, "nth-child("),
 		strings.HasPrefix(cmd, "nth-last-child("),
 		strings.HasPrefix(cmd, "nth-last-of-type("),
@@ -592,6 +597,55 @@ func parseContainsPseudo(cmd string) (PseudoClass, error) {
 	}
 }
 
+// parseMatchesPseudo parses a :matches("regexp") selector; cmd is the text after
+// the open parenthesis, e.g. `"Ro.*")`. The returned PseudoClass accepts a node
+// when one of its direct text children matches the compiled regexp.
+func parseMatchesPseudo(cmd string) (PseudoClass, error) {
+	var s scanner.Scanner
+	s.Init(strings.NewReader(cmd))
+	if s.Next() != '"' {
+		return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+	}
+	pattern := bytes.NewBuffer([]byte{})
+	for {
+		r := s.Next()
+		switch r {
+		case '"':
+			// ')' then EOF must follow '"'
+			if s.Next() != ')' {
+				return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+			}
+			if s.Next() != scanner.EOF {
+				return nil, fmt.Errorf("'matches(\"\")' must end selector")
+			}
+			p, err := regexp.Compile(pattern.String())
+			if err != nil {
+				return nil, err
+			}
+			matches := func(node *html.Node) bool {
+				for c := node.FirstChild; c != nil; c = c.NextSibling {
+					if c.Type == html.TextNode && p.MatchString(c.Data) {
+						return true
+					}
+				}
+				return false
+			}
+			return matches, nil
+		case '\\':
+			// `\"` is a literal quote; any other escape (e.g. `\d`) is kept for regexp.
+			if esc := s.Next(); esc == '"' {
+				pattern.WriteRune(esc)
+			} else if esc != scanner.EOF {
+				pattern.WriteString(`\` + string(esc))
+			}
+		case scanner.EOF:
+			return nil, fmt.Errorf("Malformed 'matches(\"\")' selector")
+		default:
+			pattern.WriteRune(r) // bytes.Buffer.WriteRune never returns an error
+		}
+	}
+}
+
 // Parse a :not(selector) selector
 // expects the input to be everything after the open parenthesis
 // e.g. for `not(div#id)` the argument would be `div#id)`
diff --git a/tests/cmds.txt b/tests/cmds.txt
index beca6c2..50da070 100644
--- a/tests/cmds.txt
+++ b/tests/cmds.txt
@@ -47,3 +47,5 @@ link , a:parent-of(sup)
 link , a:parent-of(sup) sup
 li --number
 li -n
+p:contains("Rob")
+p:matches("Ro*")
diff --git a/tests/expected_output.txt b/tests/expected_output.txt
index 7f06b47..43408f8 100644
--- a/tests/expected_output.txt
+++ b/tests/expected_output.txt
@@ -10,7 +10,7 @@ a92e50c09cd56970625ac3b74efbddb83b2731bb table li
 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type
 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"]
 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{}
-ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{}
+199188dc8f1522426a628e41d96264bffb8beb0f json{}
 95ef88ded9dab22ee3206cca47b9c3a376274bda text{}
 e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet
 da39a3ee5e6b4b0d3255bfef95601890afd80709 .after
@@ -34,7 +34,7 @@ d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1)
 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{}
-97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{}
+cd0d4cc32346750408f7d4f5e78ec9a6e5b79a0d #toc li + a json{}
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span
 da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li
@@ -47,3 +47,5 @@ b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup)
 0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup
 da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number
 da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n
+4c15ca8f190a4412469e487fab6f7ad2479f922f p:contains("Rob")
+da39a3ee5e6b4b0d3255bfef95601890afd80709 p:matches("Ro*")
diff --git a/tests/run.py b/tests/run.py
index 67a13e0..ca662a7 100755
--- a/tests/run.py
+++ b/tests/run.py
@@ -2,13 +2,13 @@
 
 from __future__ import print_function
 from hashlib import sha1
-from subprocess import Popen, PIPE, STDOUT
+from subprocess import Popen, PIPE
 
-data = open("index.html", "r").read()
+data = open("index.html", "rb").read()
 
 for line in open("cmds.txt", "r"):
     line = line.strip()
-    p = Popen(['pup', line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
+    p = Popen(["pup", line], stdout=PIPE, stdin=PIPE, stderr=PIPE)
     h = sha1()
     h.update(p.communicate(input=data)[0])
     print("%s %s" % (h.hexdigest(), line))