1
0
mirror of https://github.com/ericchiang/pup synced 2025-01-28 08:31:26 +00:00

Allow user to specify charset

This commit is contained in:
Eric Chiang 2014-12-13 23:52:41 -05:00
parent d00d65425a
commit a07991268b
3 changed files with 34 additions and 21 deletions

View File

@ -19,16 +19,6 @@ If you're on OS X, use [Brew](http://brew.sh/) to install (no Go required).
brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb
For Linux distributions, use the following commands to install pup into a directory on your `PATH`.
You can set `ARCH` to `linux_386` for 32-bit systems.
ARCH=linux_amd64
cd /tmp
wget https://github.com/EricChiang/pup/releases/download/v0.3.5/pup_${ARCH}.zip
unzip pup_${ARCH}.zip && rm pup_${ARCH}.zip
sudo mv pup /usr/local/bin
pup --version
## Quick start ## Quick start
```bash ```bash
@ -353,5 +343,6 @@ output of pup into a more consumable format.
-i --indent number of spaces to use for indent or character -i --indent number of spaces to use for indent or character
-n --number print number of elements selected -n --number print number of elements selected
-l --limit restrict number of levels printed -l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version --version display version
``` ```

View File

@ -6,16 +6,41 @@ import (
"os" "os"
"strconv" "strconv"
"strings" "strings"
"golang.org/x/net/html"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
) )
var ( var (
pupIn io.ReadCloser = os.Stdin pupIn io.ReadCloser = os.Stdin
pupCharset string = ""
pupMaxPrintLevel int = -1 pupMaxPrintLevel int = -1
pupPrintColor bool = false pupPrintColor bool = false
pupIndentString string = " " pupIndentString string = " "
pupDisplayer Displayer = TreeDisplayer{} pupDisplayer Displayer = TreeDisplayer{}
) )
// ParseHTML parses the HTML document read from r and returns its root node.
//
// When cs is empty, charset.NewReader is asked to work out the document's
// encoding on its own. Otherwise cs is looked up as a charset label and r is
// decoded with the matching encoding before parsing; if the lookup yields no
// name for cs, an error is returned and nothing is parsed.
func ParseHTML(r io.Reader, cs string) (*html.Node, error) {
	if cs != "" {
		// the caller named a charset explicitly
		enc, canonical := charset.Lookup(cs)
		if canonical == "" {
			return nil, fmt.Errorf("'%s' is not a valid charset", cs)
		}
		return html.Parse(transform.NewReader(r, enc.NewDecoder()))
	}

	// no charset given: let the charset package guess it from the document
	decoded, err := charset.NewReader(r, "")
	if err != nil {
		return nil, err
	}
	return html.Parse(decoded)
}
func PrintHelp(w io.Writer, exitCode int) { func PrintHelp(w io.Writer, exitCode int) {
helpString := `Usage helpString := `Usage
pup [flags] [selectors] [optional display function] pup [flags] [selectors] [optional display function]
@ -28,6 +53,7 @@ Flags
-i --indent number of spaces to use for indent or character -i --indent number of spaces to use for indent or character
-n --number print number of elements selected -n --number print number of elements selected
-l --limit restrict number of levels printed -l --limit restrict number of levels printed
--charset specify the charset for pup to use
--version display version --version display version
` `
fmt.Fprintf(w, helpString, VERSION) fmt.Fprintf(w, helpString, VERSION)
@ -81,6 +107,9 @@ func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) {
return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd) return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd)
} }
i++ i++
case "--charset":
pupCharset = cmds[i+1]
i++
case "--version": case "--version":
fmt.Println(VERSION) fmt.Println(VERSION)
os.Exit(0) os.Exit(0)

15
pup.go
View File

@ -5,7 +5,6 @@ import (
"os" "os"
"golang.org/x/net/html" "golang.org/x/net/html"
"golang.org/x/net/html/charset"
) )
// _=,_ // _=,_
@ -17,7 +16,7 @@ import (
// |/ \_( # |" // |/ \_( # |"
// C/ ,--___/ // C/ ,--___/
var VERSION string = "0.3.6" var VERSION string = "0.3.7"
func main() { func main() {
// process flags and arguments // process flags and arguments
@ -27,19 +26,13 @@ func main() {
os.Exit(2) os.Exit(2)
} }
// Determine the charset of the input
cr, err := charset.NewReader(pupIn, "")
if err != nil {
fmt.Fprintf(os.Stderr, err.Error())
os.Exit(2)
}
// Parse the input and get the root node // Parse the input and get the root node
root, err := html.Parse(cr) root, err := ParseHTML(pupIn, pupCharset)
if err != nil { if err != nil {
fmt.Fprintf(os.Stderr, err.Error()) fmt.Fprintf(os.Stderr, "%s\n", err.Error())
os.Exit(2) os.Exit(2)
} }
pupIn.Close()
// Parse the selectors // Parse the selectors
selectorFuncs := []SelectorFunc{} selectorFuncs := []SelectorFunc{}