diff --git a/README.md b/README.md index 9888c44..fd5055f 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required). brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb -For linux distrubtions use the following commands to install under your `PATH` -environment variable. You can set `ARCH` to `linux_386` for 32-bit infrastructures. - - ARCH=linux_amd64 - cd /tmp - wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip - unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip - sudo mv pup /usr/local/bin - pup --version - ## Quick start ```bash @@ -353,5 +343,6 @@ output of pup into a more consumable format. -i --indent number of spaces to use for indent or character -n --number print number of elements selected -l --limit restrict number of levels printed +--charset specify the charset for pup to use --version display version ``` diff --git a/parse.go b/parse.go index d9d570a..9080c4f 100644 --- a/parse.go +++ b/parse.go @@ -6,16 +6,41 @@ import ( "os" "strconv" "strings" + + "golang.org/x/net/html" + "golang.org/x/net/html/charset" + "golang.org/x/text/transform" ) var ( pupIn io.ReadCloser = os.Stdin + pupCharset string = "" pupMaxPrintLevel int = -1 pupPrintColor bool = false pupIndentString string = " " pupDisplayer Displayer = TreeDisplayer{} ) +// ParseHTML parses HTML from r, decoding it with charset cs or guessing the charset when cs is empty; an unknown cs is an error. +func ParseHTML(r io.Reader, cs string) (*html.Node, error) { + var err error + if cs == "" { + // attempt to guess the charset of the HTML document + r, err = charset.NewReader(r, "") + if err != nil { + return nil, err + } + } else { + // let the user specify the charset + e, name := charset.Lookup(cs) + if name == "" { + return nil, fmt.Errorf("'%s' is not a valid charset", cs) + } + r = transform.NewReader(r, e.NewDecoder()) + } + return html.Parse(r) +} + func PrintHelp(w io.Writer, exitCode int) { helpString := `Usage pup [flags] [selectors] [optional display 
function] @@ -28,6 +53,7 @@ Flags -i --indent number of spaces to use for indent or character -n --number print number of elements selected -l --limit restrict number of levels printed + --charset specify the charset for pup to use --version display version ` fmt.Fprintf(w, helpString, VERSION) @@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) { return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd) } i++ + case "--charset": + pupCharset = cmds[i+1] + i++ case "--version": fmt.Println(VERSION) os.Exit(0) diff --git a/pup.go b/pup.go index 7586d08..98c65d7 100644 --- a/pup.go +++ b/pup.go @@ -5,7 +5,6 @@ import ( "os" "golang.org/x/net/html" - "golang.org/x/net/html/charset" ) // _=,_ @@ -17,7 +16,7 @@ import ( // |/ \_( # |" // C/ ,--___/ -var VERSION string = "0.3.6" +var VERSION string = "0.3.7" func main() { // process flags and arguments @@ -27,19 +26,13 @@ func main() { os.Exit(2) } - // Determine the charset of the input - cr, err := charset.NewReader(pupIn, "") - if err != nil { - fmt.Fprintf(os.Stderr, err.Error()) - os.Exit(2) - } - // Parse the input and get the root node - root, err := html.Parse(cr) + root, err := ParseHTML(pupIn, pupCharset) if err != nil { - fmt.Fprintf(os.Stderr, err.Error()) + fmt.Fprintf(os.Stderr, "%s\n", err.Error()) os.Exit(2) } + pupIn.Close() // Parse the selectors selectorFuncs := []SelectorFunc{}