1
0
mirror of https://github.com/ericchiang/pup synced 2024-11-30 20:08:13 +00:00
pup/selector/selector.go

346 lines
6.9 KiB
Go
Raw Normal View History

2014-09-01 00:01:03 +00:00
package selector
import (
"code.google.com/p/go.net/html"
"fmt"
"regexp"
2014-09-18 01:44:04 +00:00
"strconv"
2014-09-01 00:01:03 +00:00
"strings"
)
2014-09-01 18:50:10 +00:00
// BasicSelector is a CSS-like selector that filters elements by tag name
// and by attribute values.
type BasicSelector struct {
	// Name is matched against the lower-cased element name. A nil Name
	// places no restriction on the element name.
	Name *regexp.Regexp
	// Attrs maps an attribute key to the pattern its value has to match.
	// Every key listed here must be present on an element for it to match.
	Attrs map[string]*regexp.Regexp
}
2014-09-15 00:34:21 +00:00
type Selector interface {
2014-09-18 00:32:49 +00:00
Select(nodes []*html.Node) []*html.Node
2014-09-15 00:34:21 +00:00
}
2014-09-01 20:39:26 +00:00
// selectorField identifies which part of a basic selector a piece of
// selector text belongs to.
type selectorField int

const (
	// ClassField is the text introduced by '.'.
	ClassField selectorField = iota
	// IDField is the text introduced by '#'.
	IDField
	// NameField is the leading element name.
	NameField
	// AttrField is the text enclosed in '[' and ']'.
	AttrField
)
2014-09-01 20:39:26 +00:00
// parseAttrField parses the text found between '[' and ']' in a selector
// into an attribute key and a regexp that the attribute value must match.
//
// Supported forms:
//
//	key       any value is accepted
//	key=val   value equals val
//	key~=val  value is a whitespace-separated list containing val
//	key^=val  value starts with val
//	key$=val  value ends with val
//	key*=val  value contains val
//
// Only the first '=' separates the key from the value, so val may itself
// contain '='. val is matched literally: regexp metacharacters in it are
// escaped. When an '=' is present, an error is returned if the key is
// empty, either outright or once the operator character has been removed.
func parseAttrField(command string) (attrKey string, matcher *regexp.Regexp,
	err error) {
	parts := strings.SplitN(command, "=", 2)
	if len(parts) == 1 {
		// No '=': the pattern accepts every value.
		return parts[0], regexp.MustCompile(".*"), nil
	}

	key := parts[0]
	if key == "" {
		return key, nil, fmt.Errorf("No attribute key")
	}
	// Escape the value so characters such as '.', '?' or '(' that are
	// common in URLs are compared literally instead of acting as regexp
	// operators (or making the pattern fail to compile).
	val := regexp.QuoteMeta(parts[1])

	var pattern string
	hasOperator := true
	switch key[len(key)-1] {
	case '~':
		// Whole-token match. `\b` is not used because it also treats
		// '-' as a boundary, which would let "btn" match "btn-primary".
		pattern = `(?:^|\s)` + val + `(?:\s|$)`
	case '$':
		pattern = val + "$"
	case '^':
		pattern = "^" + val
	case '*':
		pattern = val
	default:
		hasOperator = false
		pattern = "^" + val + "$"
	}
	if hasOperator {
		key = key[:len(key)-1]
		if key == "" {
			// Input such as "~=val" has an operator but nothing to
			// apply it to.
			return key, nil, fmt.Errorf("No attribute key")
		}
	}

	matcher, err = regexp.Compile(pattern)
	return key, matcher, err
}
2014-09-01 18:50:10 +00:00
// Set a field of this selector.
2014-09-15 00:34:21 +00:00
func (s *BasicSelector) setFieldValue(f selectorField, v string) error {
2014-09-01 00:01:03 +00:00
if v == "" {
return nil
}
2014-09-01 20:39:26 +00:00
switch f {
case ClassField:
2014-09-14 23:00:31 +00:00
r, err := regexp.Compile(fmt.Sprintf(`\b%s\b`, v))
if err != nil {
return err
}
2014-09-01 20:39:26 +00:00
s.Attrs["class"] = r
case IDField:
2014-09-14 23:00:31 +00:00
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
if err != nil {
return err
}
2014-09-01 20:39:26 +00:00
s.Attrs["id"] = r
case NameField:
2014-09-14 23:00:31 +00:00
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
if err != nil {
return err
}
2014-09-01 00:01:03 +00:00
s.Name = r
2014-09-01 20:39:26 +00:00
case AttrField:
2014-09-14 23:00:31 +00:00
// Attribute fields are a little more complicated
2014-09-01 20:39:26 +00:00
keystring, matcher, err := parseAttrField(v)
if err != nil {
return err
}
s.Attrs[keystring] = matcher
2014-09-01 00:01:03 +00:00
}
return nil
}
2014-09-01 18:50:10 +00:00
// Convert a string to a selector.
2014-09-15 00:34:21 +00:00
func NewSelector(s string) (Selector, error) {
2014-09-18 01:44:04 +00:00
// A very simple test for a selector function
if strings.Contains(s, "{") {
return parseSelectorFunc(s)
}
// Otherwise let's evaluate a basic selector
2014-09-01 00:01:03 +00:00
attrs := map[string]*regexp.Regexp{}
2014-09-15 00:34:21 +00:00
selector := BasicSelector{nil, attrs}
2014-09-01 20:39:26 +00:00
nextField := NameField
2014-09-01 00:01:03 +00:00
start := 0
for i, c := range s {
switch c {
case '.':
2014-09-01 20:39:26 +00:00
if nextField == AttrField {
continue
}
err := selector.setFieldValue(nextField, s[start:i])
2014-09-01 00:01:03 +00:00
if err != nil {
2014-09-01 18:50:10 +00:00
return selector, err
2014-09-01 00:01:03 +00:00
}
2014-09-01 20:39:26 +00:00
nextField = ClassField
2014-09-01 00:01:03 +00:00
start = i + 1
case '#':
2014-09-01 20:39:26 +00:00
if nextField == AttrField {
continue
}
err := selector.setFieldValue(nextField, s[start:i])
if err != nil {
return selector, err
}
nextField = IDField
start = i + 1
case '[':
err := selector.setFieldValue(nextField, s[start:i])
if err != nil {
return selector, err
}
nextField = AttrField
start = i + 1
case ']':
if nextField != AttrField {
return selector, fmt.Errorf(
"']' must be preceeded by '['")
}
err := selector.setFieldValue(nextField, s[start:i])
2014-09-01 00:01:03 +00:00
if err != nil {
2014-09-01 18:50:10 +00:00
return selector, err
2014-09-01 00:01:03 +00:00
}
start = i + 1
}
}
2014-09-01 20:39:26 +00:00
err := selector.setFieldValue(nextField, s[start:])
2014-09-01 18:50:10 +00:00
if err != nil {
return selector, err
}
return selector, nil
2014-09-01 00:01:03 +00:00
}
2014-09-18 00:32:49 +00:00
// Select runs FindAllChildren on every node in nodes and returns all the
// results concatenated in input order. The returned slice is never nil.
func (sel BasicSelector) Select(nodes []*html.Node) []*html.Node {
	matches := []*html.Node{}
	for i := range nodes {
		found := sel.FindAllChildren(nodes[i])
		matches = append(matches, found...)
	}
	return matches
}
2014-09-01 18:50:10 +00:00
// Find all nodes which match a selector.
2014-09-15 00:34:21 +00:00
func (sel BasicSelector) FindAllChildren(node *html.Node) []*html.Node {
2014-09-01 00:01:03 +00:00
selected := []*html.Node{}
child := node.FirstChild
for child != nil {
childSelected := sel.FindAll(child)
selected = append(selected, childSelected...)
child = child.NextSibling
}
return selected
}
2014-09-01 18:50:10 +00:00
// Find all nodes which match a selector. May return itself.
2014-09-15 00:34:21 +00:00
func (sel BasicSelector) FindAll(node *html.Node) []*html.Node {
2014-09-01 00:01:03 +00:00
selected := []*html.Node{}
if sel.Match(node) {
return []*html.Node{node}
}
child := node.FirstChild
for child != nil {
childSelected := sel.FindAll(child)
selected = append(selected, childSelected...)
child = child.NextSibling
}
return selected
}
2014-09-01 18:50:10 +00:00
// Does this selector match a given node?
2014-09-15 00:34:21 +00:00
func (sel BasicSelector) Match(node *html.Node) bool {
2014-09-01 00:01:03 +00:00
if node.Type != html.ElementNode {
return false
}
if sel.Name != nil {
if !sel.Name.MatchString(strings.ToLower(node.Data)) {
return false
}
}
2014-09-01 20:39:26 +00:00
matchedAttrs := []string{}
2014-09-01 00:01:03 +00:00
for _, attr := range node.Attr {
2014-09-01 20:39:26 +00:00
matcher, ok := sel.Attrs[attr.Key]
if !ok {
continue
}
if !matcher.MatchString(attr.Val) {
return false
}
matchedAttrs = append(matchedAttrs, attr.Key)
}
for k := range sel.Attrs {
attrMatched := false
for _, attrKey := range matchedAttrs {
if k == attrKey {
attrMatched = true
2014-09-01 00:01:03 +00:00
}
}
2014-09-01 20:39:26 +00:00
if !attrMatched {
return false
}
2014-09-01 00:01:03 +00:00
}
2014-09-01 20:39:26 +00:00
return true
2014-09-01 00:01:03 +00:00
}
2014-09-18 01:44:04 +00:00
// SliceSelector picks a sub-range of a node list, in the manner of a
// start:end:step slice.
type SliceSelector struct {
	// Start is the first index of the range. A negative value counts
	// back from the end of the list. Used only when LimitStart is true.
	Start int
	// LimitStart reports whether Start was given; when false the range
	// begins at index 0.
	LimitStart bool
	// End is the exclusive upper index of the range. A negative value
	// counts back from the end of the list. Used only when LimitEnd is
	// true.
	End int
	// LimitEnd reports whether End was given; when false the range runs
	// to the end of the list.
	LimitEnd bool
	// By is the step between selected indexes. A negative step walks the
	// range backwards; a step of 0 selects nothing.
	By int
	// N is the number of ':'-separated parts the slice was parsed from.
	// N == 1 means a single index: the range is exactly [start, start+1).
	N int
}
func (sel SliceSelector) Select(nodes []*html.Node) []*html.Node {
var start, end, by int
selected := []*html.Node{}
nNodes := len(nodes)
switch {
case !sel.LimitStart:
start = 0
case sel.Start < 0:
2014-09-19 20:37:06 +00:00
start = nNodes + sel.Start
2014-09-18 01:44:04 +00:00
default:
start = sel.Start
}
switch {
2014-09-19 20:37:06 +00:00
case sel.N == 1:
end = start + 1
2014-09-18 01:44:04 +00:00
case !sel.LimitEnd:
end = nNodes
case sel.End < 0:
2014-09-19 20:37:06 +00:00
end = nNodes + sel.End
2014-09-18 01:44:04 +00:00
default:
end = sel.End
}
by = sel.By
if by == 0 {
return selected
}
if by > 0 {
for i := start; i < nNodes && i < end; i = i + by {
selected = append(selected, nodes[i])
}
} else {
2014-09-18 22:28:02 +00:00
for i := end - 1; i >= 0 && i >= start; i = i + by {
2014-09-18 01:44:04 +00:00
selected = append(selected, nodes[i])
}
}
return selected
}
// expects input to be the slice only, e.g. "9:4:-1"
func parseSliceSelector(s string) (sel SliceSelector, err error) {
sel = SliceSelector{
Start: 0,
End: 0,
By: 1,
LimitStart: false,
LimitEnd: false,
}
split := strings.Split(s, ":")
n := len(split)
2014-09-19 20:37:06 +00:00
sel.N = n
2014-09-18 01:44:04 +00:00
if n > 3 {
err = fmt.Errorf("too many slices")
return
}
var value int
if split[0] != "" {
value, err = strconv.Atoi(split[0])
if err != nil {
return
}
sel.Start = value
sel.LimitStart = true
}
if n == 1 {
sel.End = sel.Start + 1
sel.LimitEnd = true
return
}
if split[1] != "" {
value, err = strconv.Atoi(split[1])
if err != nil {
return
}
sel.End = value
sel.LimitEnd = true
}
if n == 2 {
return
}
if split[2] != "" {
value, err = strconv.Atoi(split[2])
if err != nil {
return
}
sel.By = value
}
return
}
func parseSelectorFunc(s string) (Selector, error) {
switch {
2014-09-18 22:28:02 +00:00
case strings.HasPrefix(s, "slice{"):
2014-09-18 01:44:04 +00:00
if !strings.HasSuffix(s, "}") {
return nil, fmt.Errorf(
"slice func must end with a '}'")
}
2014-09-18 22:28:02 +00:00
s = strings.TrimPrefix(s, "slice{")
2014-09-18 01:44:04 +00:00
s = strings.TrimSuffix(s, "}")
return parseSliceSelector(s)
}
return nil, fmt.Errorf("%s is an invalid function", s)
}