mirror of
https://github.com/ericchiang/pup
synced 2025-01-15 02:00:55 +00:00
Allow user to specify charset
This commit is contained in:
parent
d00d65425a
commit
a07991268b
11
README.md
11
README.md
@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required).
|
|||||||
|
|
||||||
brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb
|
brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb
|
||||||
|
|
||||||
For Linux distributions, use the following commands to install under your `PATH`
|
|
||||||
environment variable. You can set `ARCH` to `linux_386` for 32-bit architectures.
|
|
||||||
|
|
||||||
ARCH=linux_amd64
|
|
||||||
cd /tmp
|
|
||||||
wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip
|
|
||||||
unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip
|
|
||||||
sudo mv pup /usr/local/bin
|
|
||||||
pup --version
|
|
||||||
|
|
||||||
## Quick start
|
## Quick start
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@ -353,5 +343,6 @@ output of pup into a more consumable format.
|
|||||||
-i --indent number of spaces to use for indent or character
|
-i --indent number of spaces to use for indent or character
|
||||||
-n --number print number of elements selected
|
-n --number print number of elements selected
|
||||||
-l --limit restrict number of levels printed
|
-l --limit restrict number of levels printed
|
||||||
|
--charset specify the charset for pup to use
|
||||||
--version display version
|
--version display version
|
||||||
```
|
```
|
||||||
|
29
parse.go
29
parse.go
@ -6,16 +6,41 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html"
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
pupIn io.ReadCloser = os.Stdin
|
pupIn io.ReadCloser = os.Stdin
|
||||||
|
pupCharset string = ""
|
||||||
pupMaxPrintLevel int = -1
|
pupMaxPrintLevel int = -1
|
||||||
pupPrintColor bool = false
|
pupPrintColor bool = false
|
||||||
pupIndentString string = " "
|
pupIndentString string = " "
|
||||||
pupDisplayer Displayer = TreeDisplayer{}
|
pupDisplayer Displayer = TreeDisplayer{}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Parse the html while handling the charset
|
||||||
|
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
|
||||||
|
var err error
|
||||||
|
if cs == "" {
|
||||||
|
// attempt to guess the charset of the HTML document
|
||||||
|
r, err = charset.NewReader(r, "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// let the user specify the charset
|
||||||
|
e, name := charset.Lookup(cs)
|
||||||
|
if name == "" {
|
||||||
|
return nil, fmt.Errorf("'%s' is not a valid charset", cs)
|
||||||
|
}
|
||||||
|
r = transform.NewReader(r, e.NewDecoder())
|
||||||
|
}
|
||||||
|
return html.Parse(r)
|
||||||
|
}
|
||||||
|
|
||||||
func PrintHelp(w io.Writer, exitCode int) {
|
func PrintHelp(w io.Writer, exitCode int) {
|
||||||
helpString := `Usage
|
helpString := `Usage
|
||||||
pup [flags] [selectors] [optional display function]
|
pup [flags] [selectors] [optional display function]
|
||||||
@ -28,6 +53,7 @@ Flags
|
|||||||
-i --indent number of spaces to use for indent or character
|
-i --indent number of spaces to use for indent or character
|
||||||
-n --number print number of elements selected
|
-n --number print number of elements selected
|
||||||
-l --limit restrict number of levels printed
|
-l --limit restrict number of levels printed
|
||||||
|
--charset specify the charset for pup to use
|
||||||
--version display version
|
--version display version
|
||||||
`
|
`
|
||||||
fmt.Fprintf(w, helpString, VERSION)
|
fmt.Fprintf(w, helpString, VERSION)
|
||||||
@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
|
|||||||
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
|
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
|
||||||
}
|
}
|
||||||
i++
|
i++
|
||||||
|
case "--charset":
|
||||||
|
pupCharset = cmds[i+1]
|
||||||
|
i++
|
||||||
case "--version":
|
case "--version":
|
||||||
fmt.Println(VERSION)
|
fmt.Println(VERSION)
|
||||||
os.Exit(0)
|
os.Exit(0)
|
||||||
|
15
pup.go
15
pup.go
@ -5,7 +5,6 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html"
|
||||||
"golang.org/x/net/html/charset"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// _=,_
|
// _=,_
|
||||||
@ -17,7 +16,7 @@ import (
|
|||||||
// |/ \_( # |"
|
// |/ \_( # |"
|
||||||
// C/ ,--___/
|
// C/ ,--___/
|
||||||
|
|
||||||
var VERSION string = "0.3.6"
|
var VERSION string = "0.3.7"
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
// process flags and arguments
|
// process flags and arguments
|
||||||
@ -27,19 +26,13 @@ func main() {
|
|||||||
os.Exit(2)
|
os.Exit(2)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Determine the charset of the input
|
|
||||||
cr, err := charset.NewReader(pupIn, "")
|
|
||||||
if err != nil {
|
|
||||||
fmt.Fprintf(os.Stderr, err.Error())
|
|
||||||
os.Exit(2)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse the input and get the root node
|
// Parse the input and get the root node
|
||||||
root, err := html.Parse(cr)
|
root, err := ParseHTML(pupIn, pupCharset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
fmt.Fprintf(os.Stderr, err.Error())
|
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
|
||||||
os.Exit(2)
|
os.Exit(2)
|
||||||
}
|
}
|
||||||
|
pupIn.Close()
|
||||||
|
|
||||||
// Parse the selectors
|
// Parse the selectors
|
||||||
selectorFuncs := []SelectorFunc{}
|
selectorFuncs := []SelectorFunc{}
|
||||||
|
Loading…
Reference in New Issue
Block a user