1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-28 08:31:26 +00:00

Enabled multiple attribute matchers for same attribute.

A selector with multiple attribute selectors on the same attribute would use only the last selector,
because `selector.Attr[attrname]` gets overwritten. Something like 'a[href^=https][href$=.zip]'
would give the results from 'a[href$=.zip]'.

Now, the attribute selectors are collected into a list, and we check whether all selectors match.
This commit is contained in:
Jan Seeger 2020-09-04 11:24:41 +02:00
parent 681d7bb639
commit f9e1b3ed7d

View File

@ -77,7 +77,7 @@ type PseudoClass func(*html.Node) bool
type CSSSelector struct {
Tag string
Attrs map[string]*regexp.Regexp
Attrs map[string][]*regexp.Regexp
Pseudo PseudoClass
}
@ -90,15 +90,16 @@ func (s CSSSelector) Match(node *html.Node) bool {
return false
}
}
for attrKey, matcher := range s.Attrs {
for attrKey, matchers := range s.Attrs {
matched := false
for _, attr := range node.Attr {
if attrKey == attr.Key {
if !matcher.MatchString(attr.Val) {
return false
for _, matcher := range matchers {
if !matcher.MatchString(attr.Val) {
return false
}
matched = true
}
matched = true
break
}
}
if !matched {
@ -116,7 +117,7 @@ func (s CSSSelector) Match(node *html.Node) bool {
func ParseSelector(cmd string) (selector CSSSelector, err error) {
selector = CSSSelector{
Tag: "",
Attrs: map[string]*regexp.Regexp{},
Attrs: map[string][]*regexp.Regexp{},
Pseudo: nil,
}
var s scanner.Scanner
@ -153,13 +154,21 @@ func ParseTagMatcher(selector *CSSSelector, s scanner.Scanner) error {
}
}
func AddAttrSelector(selector *CSSSelector, attrname string, regex *regexp.Regexp) {
if slice, ok := selector.Attrs[attrname]; ok && len(slice) > 0 {
selector.Attrs[attrname] = append(selector.Attrs[attrname], regex)
} else {
selector.Attrs[attrname] = []*regexp.Regexp{regex}
}
}
// Parse a class matcher
// e.g. `.btn`
func ParseClassMatcher(selector *CSSSelector, s scanner.Scanner) error {
var class bytes.Buffer
defer func() {
regexpStr := `(\A|\s)` + regexp.QuoteMeta(class.String()) + `(\s|\z)`
selector.Attrs["class"] = regexp.MustCompile(regexpStr)
AddAttrSelector(selector, "class", regexp.MustCompile(regexpStr))
}()
for {
c := s.Next()
@ -188,7 +197,7 @@ func ParseIdMatcher(selector *CSSSelector, s scanner.Scanner) error {
var id bytes.Buffer
defer func() {
regexpStr := `^` + regexp.QuoteMeta(id.String()) + `$`
selector.Attrs["id"] = regexp.MustCompile(regexpStr)
AddAttrSelector(selector, "id", regexp.MustCompile(regexpStr))
}()
for {
c := s.Next()
@ -233,9 +242,9 @@ func ParseAttrMatcher(selector *CSSSelector, s scanner.Scanner) error {
case '~':
regexpStr = `(\A|\s)` + regexp.QuoteMeta(attrVal.String()) + `(\s|\z)`
}
selector.Attrs[attrKey.String()] = regexp.MustCompile(regexpStr)
AddAttrSelector(selector, attrKey.String(), regexp.MustCompile(regexpStr))
} else {
selector.Attrs[attrKey.String()] = regexp.MustCompile(`^.*$`)
AddAttrSelector(selector, attrKey.String(), regexp.MustCompile(`^.*$`))
}
}()
// After reaching ']' proceed