mirror of
https://github.com/ericchiang/pup
synced 2024-11-24 08:58:08 +00:00
attribute selectors added
This commit is contained in:
parent
825f458e22
commit
84e54e1430
@ -104,6 +104,5 @@ $ pup < robots.html table -l 2
|
|||||||
|
|
||||||
## TODO:
|
## TODO:
|
||||||
|
|
||||||
* Attribute css selectors.
|
|
||||||
* Print attribute value rather than html ({href})
|
* Print attribute value rather than html ({href})
|
||||||
* Print result as JSON (--json)
|
* Print result as JSON (--json)
|
||||||
|
3
main.go
3
main.go
@ -123,8 +123,7 @@ func main() {
|
|||||||
for i, cmd := range cmds {
|
for i, cmd := range cmds {
|
||||||
selectors[i], err = selector.NewSelector(cmd)
|
selectors[i], err = selector.NewSelector(cmd)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, err.Error())
|
Fatal("Selector parse error: %s", err)
|
||||||
os.Exit(2)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currNodes := []*html.Node{root}
|
currNodes := []*html.Node{root}
|
||||||
|
@ -9,36 +9,81 @@ import (
|
|||||||
|
|
||||||
// A CSS Selector
|
// A CSS Selector
|
||||||
type Selector struct {
|
type Selector struct {
|
||||||
Class, ID, Name *regexp.Regexp
|
Name *regexp.Regexp
|
||||||
Attrs map[string]*regexp.Regexp
|
Attrs map[string]*regexp.Regexp
|
||||||
}
|
}
|
||||||
|
|
||||||
type selectorField string
|
type selectorField int
|
||||||
|
|
||||||
const (
|
const (
|
||||||
Class selectorField = "class"
|
ClassField selectorField = iota
|
||||||
ID selectorField = "id"
|
IDField
|
||||||
Name selectorField = "name"
|
NameField
|
||||||
|
AttrField
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Parse an attribute command to a key string and a regexp
|
||||||
|
func parseAttrField(command string) (attrKey string, matcher *regexp.Regexp,
|
||||||
|
err error) {
|
||||||
|
|
||||||
|
attrSplit := strings.Split(command, "=")
|
||||||
|
matcherString := ""
|
||||||
|
switch len(attrSplit) {
|
||||||
|
case 1:
|
||||||
|
attrKey = attrSplit[0]
|
||||||
|
matcherString = ".*"
|
||||||
|
case 2:
|
||||||
|
attrKey = attrSplit[0]
|
||||||
|
attrVal := attrSplit[1]
|
||||||
|
if len(attrKey) == 0 {
|
||||||
|
err = fmt.Errorf("No attribute key")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
attrKeyLen := len(attrKey)
|
||||||
|
switch attrKey[attrKeyLen-1] {
|
||||||
|
case '~':
|
||||||
|
matcherString = fmt.Sprintf(`[^\s]%s[$\s]`, attrVal)
|
||||||
|
case '$':
|
||||||
|
matcherString = fmt.Sprintf("%s$", attrVal)
|
||||||
|
case '^':
|
||||||
|
matcherString = fmt.Sprintf("^%s", attrVal)
|
||||||
|
case '*':
|
||||||
|
matcherString = fmt.Sprintf("%s", attrVal)
|
||||||
|
default:
|
||||||
|
attrKeyLen++
|
||||||
|
matcherString = fmt.Sprintf("^%s$", attrVal)
|
||||||
|
}
|
||||||
|
attrKey = attrKey[:attrKeyLen-1]
|
||||||
|
default:
|
||||||
|
err = fmt.Errorf("more than one '='")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
matcher, err = regexp.Compile(matcherString)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Set a field of this selector.
|
// Set a field of this selector.
|
||||||
func (s *Selector) setFieldValue(a selectorField, v string) error {
|
func (s *Selector) setFieldValue(f selectorField, v string) error {
|
||||||
if v == "" {
|
if v == "" {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// wildcards become '.*'
|
|
||||||
v = strings.Replace(v, "*", ".*", -1)
|
|
||||||
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
|
r, err := regexp.Compile(fmt.Sprintf("^%s$", v))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
switch a {
|
switch f {
|
||||||
case Class:
|
case ClassField:
|
||||||
s.Class = r
|
s.Attrs["class"] = r
|
||||||
case ID:
|
case IDField:
|
||||||
s.ID = r
|
s.Attrs["id"] = r
|
||||||
case Name:
|
case NameField:
|
||||||
s.Name = r
|
s.Name = r
|
||||||
|
case AttrField:
|
||||||
|
keystring, matcher, err := parseAttrField(v)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
s.Attrs[keystring] = matcher
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -46,28 +91,51 @@ func (s *Selector) setFieldValue(a selectorField, v string) error {
|
|||||||
// Convert a string to a selector.
|
// Convert a string to a selector.
|
||||||
func NewSelector(s string) (*Selector, error) {
|
func NewSelector(s string) (*Selector, error) {
|
||||||
attrs := map[string]*regexp.Regexp{}
|
attrs := map[string]*regexp.Regexp{}
|
||||||
selector := &Selector{nil, nil, nil, attrs}
|
selector := &Selector{nil, attrs}
|
||||||
nextAttr := Name
|
nextField := NameField
|
||||||
start := 0
|
start := 0
|
||||||
for i, c := range s {
|
for i, c := range s {
|
||||||
switch c {
|
switch c {
|
||||||
case '.':
|
case '.':
|
||||||
err := selector.setFieldValue(nextAttr, s[start:i])
|
if nextField == AttrField {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
err := selector.setFieldValue(nextField, s[start:i])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return selector, err
|
return selector, err
|
||||||
}
|
}
|
||||||
nextAttr = Class
|
nextField = ClassField
|
||||||
start = i + 1
|
start = i + 1
|
||||||
case '#':
|
case '#':
|
||||||
err := selector.setFieldValue(nextAttr, s[start:i])
|
if nextField == AttrField {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
err := selector.setFieldValue(nextField, s[start:i])
|
||||||
|
if err != nil {
|
||||||
|
return selector, err
|
||||||
|
}
|
||||||
|
nextField = IDField
|
||||||
|
start = i + 1
|
||||||
|
case '[':
|
||||||
|
err := selector.setFieldValue(nextField, s[start:i])
|
||||||
|
if err != nil {
|
||||||
|
return selector, err
|
||||||
|
}
|
||||||
|
nextField = AttrField
|
||||||
|
start = i + 1
|
||||||
|
case ']':
|
||||||
|
if nextField != AttrField {
|
||||||
|
return selector, fmt.Errorf(
|
||||||
|
"']' must be preceeded by '['")
|
||||||
|
}
|
||||||
|
err := selector.setFieldValue(nextField, s[start:i])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return selector, err
|
return selector, err
|
||||||
}
|
}
|
||||||
nextAttr = ID
|
|
||||||
start = i + 1
|
start = i + 1
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
err := selector.setFieldValue(nextAttr, s[start:])
|
err := selector.setFieldValue(nextField, s[start:])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return selector, err
|
return selector, err
|
||||||
}
|
}
|
||||||
@ -111,27 +179,27 @@ func (sel *Selector) Match(node *html.Node) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
classMatched := sel.Class == nil
|
matchedAttrs := []string{}
|
||||||
idMatched := sel.ID == nil
|
|
||||||
for _, attr := range node.Attr {
|
for _, attr := range node.Attr {
|
||||||
switch attr.Key {
|
matcher, ok := sel.Attrs[attr.Key]
|
||||||
case "class":
|
if !ok {
|
||||||
if !classMatched {
|
continue
|
||||||
if !sel.Class.MatchString(attr.Val) {
|
}
|
||||||
return false
|
if !matcher.MatchString(attr.Val) {
|
||||||
} else {
|
return false
|
||||||
classMatched = true
|
}
|
||||||
}
|
matchedAttrs = append(matchedAttrs, attr.Key)
|
||||||
}
|
}
|
||||||
case "id":
|
for k := range sel.Attrs {
|
||||||
if !idMatched {
|
attrMatched := false
|
||||||
if !sel.ID.MatchString(attr.Val) {
|
for _, attrKey := range matchedAttrs {
|
||||||
return false
|
if k == attrKey {
|
||||||
} else {
|
attrMatched = true
|
||||||
idMatched = true
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if !attrMatched {
|
||||||
|
return false
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return classMatched && idMatched
|
return true
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user