1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-15 02:00:55 +00:00

Allow user to specify charset

This commit is contained in:
Eric Chiang 2014-12-13 23:52:41 -05:00
parent d00d65425a
commit a07991268b
3 changed files with 34 additions and 21 deletions

View File

@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required).
brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb
For Linux distributions, use the following commands to install pup into a directory on your `PATH`
environment variable. You can set `ARCH` to `linux_386` for 32-bit architectures.
ARCH=linux_amd64
cd /tmp
wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip
unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip
sudo mv pup /usr/local/bin
pup --version
## Quick start
```bash
@ -353,5 +343,6 @@ output of pup into a more consumable format.
-i --indent number of spaces to use for indent or character
-n --number print number of elements selected
-l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version
```

View File

@ -6,16 +6,41 @@ import (
"os"
"strconv"
"strings"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)
// Package-level settings shared by flag processing, parsing and display.
var (
	// pupIn is the stream the HTML document is read from; standard input
	// unless reassigned.
	pupIn io.ReadCloser = os.Stdin

	// pupCharset is the charset requested with --charset; empty means none
	// was requested.
	pupCharset = ""

	// Display defaults.
	// NOTE(review): presumably overridden by the display-related flags in
	// ProcessFlags — confirm against the full function.
	pupMaxPrintLevel           = -1
	pupPrintColor              = false
	pupIndentString            = " "
	pupDisplayer     Displayer = TreeDisplayer{}
)
// ParseHTML parses the HTML document read from r, decoding it according to
// its charset before handing it to the HTML parser.
//
// If cs is empty, the charset is guessed from the document content by
// charset.NewReader. Otherwise cs is looked up as a charset label and the
// input is decoded with the matching encoding.
//
// It returns an error if the charset-detecting reader cannot be created, if
// cs is not a recognized charset, or if html.Parse itself fails.
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
	if cs != "" {
		// The user named a charset explicitly. Lookup reports an unknown
		// label through an empty canonical name.
		enc, canonical := charset.Lookup(cs)
		if canonical == "" {
			return nil, fmt.Errorf("'%s' is not a valid charset", cs)
		}
		return html.Parse(transform.NewReader(r, enc.NewDecoder()))
	}

	// No charset given: let the charset package determine it from the input.
	decoded, err := charset.NewReader(r, "")
	if err != nil {
		return nil, err
	}
	return html.Parse(decoded)
}
func PrintHelp(w io.Writer, exitCode int) {
helpString := `Usage
pup [flags] [selectors] [optional display function]
@ -28,6 +53,7 @@ Flags
-i --indent number of spaces to use for indent or character
-n --number print number of elements selected
-l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version
`
fmt.Fprintf(w, helpString, VERSION)
@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
}
i++
case "--charset":
pupCharset = cmds[i+1]
i++
case "--version":
fmt.Println(VERSION)
os.Exit(0)

15
pup.go
View File

@ -5,7 +5,6 @@ import (
"os"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
)
// _=,_
@ -17,7 +16,7 @@ import (
// |/ \_( # |"
// C/ ,--___/
var VERSION string = "0.3.6"
var VERSION string = "0.3.7"
func main() {
// process flags and arguments
@ -27,19 +26,13 @@ func main() {
os.Exit(2)
}
// Determine the charset of the input
cr, err := charset.NewReader(pupIn, "")
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
os.Exit(2)
}
// Parse the input and get the root node
root, err := html.Parse(cr)
root, err := ParseHTML(pupIn, pupCharset)
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
fmt.Fprintf(os.Stderr, "%s\n", err.Error())
os.Exit(2)
}
pupIn.Close()
// Parse the selectors
selectorFuncs := []SelectorFunc{}