2014-09-01 00:01:03 +00:00
|
|
|
package selector
|
|
|
|
|
|
|
|
import (
|
|
|
|
"code.google.com/p/go.net/html"
|
|
|
|
"fmt"
|
|
|
|
"regexp"
|
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// BasicSelector is a single CSS selector expressed as compiled patterns.
//
// Name, when non-nil, is matched against the lower-cased element name.
// Attrs maps an attribute key to the pattern its value must satisfy; every
// key listed in Attrs has to be present on an element for it to match.
type BasicSelector struct {
	Name  *regexp.Regexp
	Attrs map[string]*regexp.Regexp
}
|
|
|
|
|
2014-09-15 00:34:21 +00:00
|
|
|
type Selector interface {
|
|
|
|
// Does this selector match a given node?
|
|
|
|
Match(node *html.Node) bool
|
|
|
|
// Find all nodes which match a selector. May return itself.
|
|
|
|
FindAll(node *html.Node) []*html.Node
|
|
|
|
// Find all child nodes which match a selector.
|
|
|
|
FindAllChildren(node *html.Node) []*html.Node
|
|
|
|
}
|
|
|
|
|
2014-09-01 20:39:26 +00:00
|
|
|
// selectorField names the part of a selector that a parsed token fills in.
type selectorField int

// The fields a token can be assigned to while a selector string is parsed.
const (
	// ClassField is the token following a '.'.
	ClassField selectorField = iota
	// IDField is the token following a '#'.
	IDField
	// NameField is the element name at the start of the selector.
	NameField
	// AttrField is the expression enclosed in '[' and ']'.
	AttrField
)
|
|
|
|
|
2014-09-01 20:39:26 +00:00
|
|
|
// parseAttrField parses the text found between '[' and ']' in a selector into
// the attribute key and a regexp that the attribute's value has to match.
//
// The value is always matched literally; regexp metacharacters in it carry no
// special meaning. Supported forms:
//
//	key         the attribute is present, with any value
//	key=value   the value is exactly "value"
//	key~=value  the value contains "value" as a whitespace-separated word
//	key|=value  the value is "value" or starts with "value-"
//	key^=value  the value starts with "value"
//	key$=value  the value ends with "value"
//	key*=value  the value contains "value"
//
// An error is returned when the key is empty (before or after removing the
// operator character), when command contains more than one '=', or when the
// resulting pattern does not compile.
func parseAttrField(command string) (attrKey string, matcher *regexp.Regexp, err error) {
	parts := strings.Split(command, "=")

	var pattern string
	switch len(parts) {
	case 1:
		// Bare key: only the presence of the attribute is required.
		attrKey = parts[0]
		pattern = ".*"
	case 2:
		attrKey = parts[0]
		if len(attrKey) == 0 {
			return "", nil, fmt.Errorf("No attribute key")
		}

		// Quote the value so that characters such as '.', '?' or '+' match
		// themselves instead of acting as regexp operators.
		val := regexp.QuoteMeta(parts[1])

		// The operator, if any, is the last character of the key part.
		hasOp := true
		switch attrKey[len(attrKey)-1] {
		case '~':
			// \b would also treat '-' as a word boundary ("btn" inside
			// "btn-primary"), so anchor on whitespace or the string ends.
			pattern = `(?:^|\s)` + val + `(?:\s|$)`
		case '|':
			pattern = "^" + val + "(?:-|$)"
		case '$':
			pattern = val + "$"
		case '^':
			pattern = "^" + val
		case '*':
			pattern = val
		default:
			hasOp = false
			pattern = "^" + val + "$"
		}
		if hasOp {
			attrKey = attrKey[:len(attrKey)-1]
			if attrKey == "" {
				// Input such as "~=x": nothing is left once the operator
				// character has been removed.
				return "", nil, fmt.Errorf("No attribute key")
			}
		}
	default:
		return "", nil, fmt.Errorf("more than one '='")
	}

	matcher, err = regexp.Compile(pattern)
	return attrKey, matcher, err
}
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// Set a field of this selector.
|
2014-09-15 00:34:21 +00:00
|
|
|
func (s *BasicSelector) setFieldValue(f selectorField, v string) error {
|
2014-09-01 00:01:03 +00:00
|
|
|
if v == "" {
|
|
|
|
return nil
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
switch f {
|
|
|
|
case ClassField:
|
2014-09-14 23:00:31 +00:00
|
|
|
r, err := regexp.Compile(fmt.Sprintf(`\b%s\b`, v))
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
s.Attrs["class"] = r
|
|
|
|
case IDField:
|
2014-09-14 23:00:31 +00:00
|
|
|
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
s.Attrs["id"] = r
|
|
|
|
case NameField:
|
2014-09-14 23:00:31 +00:00
|
|
|
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2014-09-01 00:01:03 +00:00
|
|
|
s.Name = r
|
2014-09-01 20:39:26 +00:00
|
|
|
case AttrField:
|
2014-09-14 23:00:31 +00:00
|
|
|
// Attribute fields are a little more complicated
|
2014-09-01 20:39:26 +00:00
|
|
|
keystring, matcher, err := parseAttrField(v)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
s.Attrs[keystring] = matcher
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// Convert a string to a selector.
|
2014-09-15 00:34:21 +00:00
|
|
|
func NewSelector(s string) (Selector, error) {
|
2014-09-01 00:01:03 +00:00
|
|
|
attrs := map[string]*regexp.Regexp{}
|
2014-09-15 00:34:21 +00:00
|
|
|
selector := BasicSelector{nil, attrs}
|
2014-09-01 20:39:26 +00:00
|
|
|
nextField := NameField
|
2014-09-01 00:01:03 +00:00
|
|
|
start := 0
|
2014-09-14 23:00:31 +00:00
|
|
|
// Parse the selector character by character
|
2014-09-01 00:01:03 +00:00
|
|
|
for i, c := range s {
|
|
|
|
switch c {
|
|
|
|
case '.':
|
2014-09-01 20:39:26 +00:00
|
|
|
if nextField == AttrField {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
err := selector.setFieldValue(nextField, s[start:i])
|
2014-09-01 00:01:03 +00:00
|
|
|
if err != nil {
|
2014-09-01 18:50:10 +00:00
|
|
|
return selector, err
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
nextField = ClassField
|
2014-09-01 00:01:03 +00:00
|
|
|
start = i + 1
|
|
|
|
case '#':
|
2014-09-01 20:39:26 +00:00
|
|
|
if nextField == AttrField {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
err := selector.setFieldValue(nextField, s[start:i])
|
|
|
|
if err != nil {
|
|
|
|
return selector, err
|
|
|
|
}
|
|
|
|
nextField = IDField
|
|
|
|
start = i + 1
|
|
|
|
case '[':
|
|
|
|
err := selector.setFieldValue(nextField, s[start:i])
|
|
|
|
if err != nil {
|
|
|
|
return selector, err
|
|
|
|
}
|
|
|
|
nextField = AttrField
|
|
|
|
start = i + 1
|
|
|
|
case ']':
|
|
|
|
if nextField != AttrField {
|
|
|
|
return selector, fmt.Errorf(
|
|
|
|
"']' must be preceeded by '['")
|
|
|
|
}
|
|
|
|
err := selector.setFieldValue(nextField, s[start:i])
|
2014-09-01 00:01:03 +00:00
|
|
|
if err != nil {
|
2014-09-01 18:50:10 +00:00
|
|
|
return selector, err
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
|
|
|
start = i + 1
|
|
|
|
}
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
err := selector.setFieldValue(nextField, s[start:])
|
2014-09-01 18:50:10 +00:00
|
|
|
if err != nil {
|
|
|
|
return selector, err
|
|
|
|
}
|
|
|
|
return selector, nil
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// Find all nodes which match a selector.
|
2014-09-15 00:34:21 +00:00
|
|
|
func (sel BasicSelector) FindAllChildren(node *html.Node) []*html.Node {
|
2014-09-01 00:01:03 +00:00
|
|
|
selected := []*html.Node{}
|
|
|
|
child := node.FirstChild
|
|
|
|
for child != nil {
|
|
|
|
childSelected := sel.FindAll(child)
|
|
|
|
selected = append(selected, childSelected...)
|
|
|
|
child = child.NextSibling
|
|
|
|
}
|
|
|
|
return selected
|
|
|
|
}
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// Find all nodes which match a selector. May return itself.
|
2014-09-15 00:34:21 +00:00
|
|
|
func (sel BasicSelector) FindAll(node *html.Node) []*html.Node {
|
2014-09-01 00:01:03 +00:00
|
|
|
selected := []*html.Node{}
|
|
|
|
if sel.Match(node) {
|
|
|
|
return []*html.Node{node}
|
|
|
|
}
|
|
|
|
child := node.FirstChild
|
|
|
|
for child != nil {
|
|
|
|
childSelected := sel.FindAll(child)
|
|
|
|
selected = append(selected, childSelected...)
|
|
|
|
child = child.NextSibling
|
|
|
|
}
|
|
|
|
return selected
|
|
|
|
}
|
|
|
|
|
2014-09-01 18:50:10 +00:00
|
|
|
// Does this selector match a given node?
|
2014-09-15 00:34:21 +00:00
|
|
|
func (sel BasicSelector) Match(node *html.Node) bool {
|
2014-09-01 00:01:03 +00:00
|
|
|
if node.Type != html.ElementNode {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if sel.Name != nil {
|
|
|
|
if !sel.Name.MatchString(strings.ToLower(node.Data)) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
matchedAttrs := []string{}
|
2014-09-01 00:01:03 +00:00
|
|
|
for _, attr := range node.Attr {
|
2014-09-01 20:39:26 +00:00
|
|
|
matcher, ok := sel.Attrs[attr.Key]
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if !matcher.MatchString(attr.Val) {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
matchedAttrs = append(matchedAttrs, attr.Key)
|
|
|
|
}
|
|
|
|
for k := range sel.Attrs {
|
|
|
|
attrMatched := false
|
|
|
|
for _, attrKey := range matchedAttrs {
|
|
|
|
if k == attrKey {
|
|
|
|
attrMatched = true
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
if !attrMatched {
|
|
|
|
return false
|
|
|
|
}
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|
2014-09-01 20:39:26 +00:00
|
|
|
return true
|
2014-09-01 00:01:03 +00:00
|
|
|
}
|