1
0
mirror of https://github.com/ericchiang/pup synced 2024-11-28 02:48:16 +00:00

attribute selectors added

This commit is contained in:
ericchiang 2014-09-01 16:39:26 -04:00
parent 825f458e22
commit 84e54e1430
3 changed files with 110 additions and 44 deletions

View File

@ -104,6 +104,5 @@ $ pup < robots.html table -l 2
## TODO: ## TODO:
* Attribute css selectors.
* Print attribute value rather than html ({href}) * Print attribute value rather than html ({href})
* Print result as JSON (--json) * Print result as JSON (--json)

View File

@ -123,8 +123,7 @@ func main() {
for i, cmd := range cmds { for i, cmd := range cmds {
selectors[i], err = selector.NewSelector(cmd) selectors[i], err = selector.NewSelector(cmd)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, err.Error()) Fatal("Selector parse error: %s", err)
os.Exit(2)
} }
} }
currNodes := []*html.Node{root} currNodes := []*html.Node{root}

View File

@ -9,36 +9,81 @@ import (
// A CSS Selector // A CSS Selector
type Selector struct { type Selector struct {
Class, ID, Name *regexp.Regexp Name *regexp.Regexp
Attrs map[string]*regexp.Regexp Attrs map[string]*regexp.Regexp
} }
type selectorField string type selectorField int
const ( const (
Class selectorField = "class" ClassField selectorField = iota
ID selectorField = "id" IDField
Name selectorField = "name" NameField
AttrField
) )
// parseAttrField parses the text of an attribute selector (the part between
// '[' and ']') into the attribute key and a regexp that the attribute's value
// must satisfy.
//
// Supported forms, where val is spliced into the pattern as a regular
// expression fragment (it is not escaped):
//
//	key       any value
//	key=val   the whole value matches val
//	key~=val  val matches one whitespace-delimited word of the value
//	key^=val  the value starts with val
//	key$=val  the value ends with val
//	key*=val  val matches anywhere in the value
//
// An error is returned when the key is empty, when command contains more than
// one '=', or when the resulting pattern does not compile. On error the
// returned key is empty and the matcher is nil.
func parseAttrField(command string) (attrKey string, matcher *regexp.Regexp,
	err error) {
	attrSplit := strings.Split(command, "=")
	var pattern string
	switch len(attrSplit) {
	case 1:
		// Bare key: only the attribute's presence matters.
		attrKey = attrSplit[0]
		if len(attrKey) == 0 {
			return "", nil, fmt.Errorf("No attribute key")
		}
		pattern = ".*"
	case 2:
		attrKey = attrSplit[0]
		if len(attrKey) == 0 {
			return "", nil, fmt.Errorf("No attribute key")
		}
		// Group the value so an alternation inside it ("a|b") stays within
		// the anchors added below instead of splitting the whole pattern.
		attrVal := "(?:" + attrSplit[1] + ")"
		last := len(attrKey) - 1
		hasOperator := true
		switch attrKey[last] {
		case '~':
			// The word must be bounded by the start/end of the value or by
			// whitespace. Anchors have no special meaning inside a character
			// class, so the boundaries are written as alternations.
			pattern = `(^|\s)` + attrVal + `($|\s)`
		case '$':
			pattern = attrVal + "$"
		case '^':
			pattern = "^" + attrVal
		case '*':
			pattern = attrVal
		default:
			hasOperator = false
			pattern = "^" + attrVal + "$"
		}
		if hasOperator {
			// Drop the operator character that preceded '='.
			attrKey = attrKey[:last]
			if len(attrKey) == 0 {
				return "", nil, fmt.Errorf("No attribute key")
			}
		}
	default:
		return "", nil, fmt.Errorf("more than one '='")
	}
	matcher, err = regexp.Compile(pattern)
	if err != nil {
		return "", nil, err
	}
	return attrKey, matcher, nil
}
// Set a field of this selector. // Set a field of this selector.
func (s *Selector) setFieldValue(a selectorField, v string) error { func (s *Selector) setFieldValue(f selectorField, v string) error {
if v == "" { if v == "" {
return nil return nil
} }
// wildcards become '.*'
v = strings.Replace(v, "*", ".*", -1)
r, err := regexp.Compile(fmt.Sprintf("^%s$", v)) r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
if err != nil { if err != nil {
return err return err
} }
switch a { switch f {
case Class: case ClassField:
s.Class = r s.Attrs["class"] = r
case ID: case IDField:
s.ID = r s.Attrs["id"] = r
case Name: case NameField:
s.Name = r s.Name = r
case AttrField:
keystring, matcher, err := parseAttrField(v)
if err != nil {
return err
}
s.Attrs[keystring] = matcher
} }
return nil return nil
} }
@ -46,28 +91,51 @@ func (s *Selector) setFieldValue(a selectorField, v string) error {
// Convert a string to a selector. // Convert a string to a selector.
func NewSelector(s string) (*Selector, error) { func NewSelector(s string) (*Selector, error) {
attrs := map[string]*regexp.Regexp{} attrs := map[string]*regexp.Regexp{}
selector := &Selector{nil, nil, nil, attrs} selector := &Selector{nil, attrs}
nextAttr := Name nextField := NameField
start := 0 start := 0
for i, c := range s { for i, c := range s {
switch c { switch c {
case '.': case '.':
err := selector.setFieldValue(nextAttr, s[start:i]) if nextField == AttrField {
continue
}
err := selector.setFieldValue(nextField, s[start:i])
if err != nil { if err != nil {
return selector, err return selector, err
} }
nextAttr = Class nextField = ClassField
start = i + 1 start = i + 1
case '#': case '#':
err := selector.setFieldValue(nextAttr, s[start:i]) if nextField == AttrField {
continue
}
err := selector.setFieldValue(nextField, s[start:i])
if err != nil {
return selector, err
}
nextField = IDField
start = i + 1
case '[':
err := selector.setFieldValue(nextField, s[start:i])
if err != nil {
return selector, err
}
nextField = AttrField
start = i + 1
case ']':
if nextField != AttrField {
return selector, fmt.Errorf(
"']' must be preceeded by '['")
}
err := selector.setFieldValue(nextField, s[start:i])
if err != nil { if err != nil {
return selector, err return selector, err
} }
nextAttr = ID
start = i + 1 start = i + 1
} }
} }
err := selector.setFieldValue(nextAttr, s[start:]) err := selector.setFieldValue(nextField, s[start:])
if err != nil { if err != nil {
return selector, err return selector, err
} }
@ -111,27 +179,27 @@ func (sel *Selector) Match(node *html.Node) bool {
return false return false
} }
} }
classMatched := sel.Class == nil matchedAttrs := []string{}
idMatched := sel.ID == nil
for _, attr := range node.Attr { for _, attr := range node.Attr {
switch attr.Key { matcher, ok := sel.Attrs[attr.Key]
case "class": if !ok {
if !classMatched { continue
if !sel.Class.MatchString(attr.Val) { }
return false if !matcher.MatchString(attr.Val) {
} else { return false
classMatched = true }
} matchedAttrs = append(matchedAttrs, attr.Key)
} }
case "id": for k := range sel.Attrs {
if !idMatched { attrMatched := false
if !sel.ID.MatchString(attr.Val) { for _, attrKey := range matchedAttrs {
return false if k == attrKey {
} else { attrMatched = true
idMatched = true
}
} }
} }
if !attrMatched {
return false
}
} }
return classMatched && idMatched return true
} }