pu369com

(golang)HTTP基本认证机制及使用gocolly登录爬取

内网有个网页用了HTTP基本认证机制,想用gocolly爬取,不知道怎么登录,只好研究HTTP基本认证机制

参考这里:https://www.jb51.net/article/89070.htm  

下面开始参考作者dotcoo了:-)

看了《HTTP权威指南》第12章“HTTP基本认证机制”(本站下载地址:https://www.jb51.net/books/93254.html),感觉讲得蛮详细的,写了一个小小的例子测试。

请求响应过程:

==>
GET /hello HTTP/1.1
Host: 127.0.0.1:8000
<==
HTTP/1.1 401 Unauthorized
WWW-Authenticate: Basic realm="Dotcoo User Login"
==>
GET /hello HTTP/1.1
Host: 127.0.0.1:8000
Authorization: Basic YWRtaW46YWRtaW5wd2Q=
<==
HTTP/1.1 200 OK
Content-Type: text/plain; charset=utf-8

golang HTTP基本认证机制的实现代码

package main
import (
    "fmt"
    "io"
    "net/http"
    "log"
    "encoding/base64"
    "strings"
)
// HelloServer handles /hello behind HTTP Basic authentication.
//
// A request without an Authorization header is challenged with 401 and a
// WWW-Authenticate header. A request whose header is malformed, uses a scheme
// other than "Basic", or carries credentials that cannot be decoded into
// "user:password" is rejected with 401 and the same challenge, so a failed
// attempt never looks like a success to the client.
//
// Any well-formed user/password pair is accepted: the credentials are only
// printed to stdout for demonstration and are never compared to anything.
func HelloServer(w http.ResponseWriter, req *http.Request) {
    auth := req.Header.Get("Authorization")
    if auth == "" {
        // First contact: ask the client to authenticate, no body.
        rejectUnauthorized(w, "")
        return
    }
    fmt.Println(auth)

    // The header has the shape "<scheme> <credentials>".
    auths := strings.SplitN(auth, " ", 2)
    if len(auths) != 2 {
        fmt.Println("error")
        rejectUnauthorized(w, "Unauthorized!\n")
        return
    }
    authMethod, authB64 := auths[0], auths[1]

    // Authentication scheme names are case-insensitive.
    if !strings.EqualFold(authMethod, "Basic") {
        fmt.Println("error")
        rejectUnauthorized(w, "Unauthorized!\n")
        return
    }

    authstr, err := base64.StdEncoding.DecodeString(authB64)
    if err != nil {
        fmt.Println(err)
        rejectUnauthorized(w, "Unauthorized!\n")
        return
    }
    fmt.Println(string(authstr))

    // Split on the first colon only: the password itself may contain colons.
    userPwd := strings.SplitN(string(authstr), ":", 2)
    if len(userPwd) != 2 {
        fmt.Println("error")
        rejectUnauthorized(w, "Unauthorized!\n")
        return
    }
    username, password := userPwd[0], userPwd[1]
    fmt.Println("Username:", username)
    fmt.Println("Password:", password)
    fmt.Println()

    io.WriteString(w, "hello, world!\n")
}

// rejectUnauthorized answers with 401 and the Basic challenge, followed by
// body when it is non-empty.
func rejectUnauthorized(w http.ResponseWriter, body string) {
    w.Header().Set("WWW-Authenticate", `Basic realm="Dotcoo User Login"`)
    w.WriteHeader(http.StatusUnauthorized)
    if body != "" {
        io.WriteString(w, body)
    }
}
// main registers HelloServer on /hello and serves HTTP on port 8000. When the
// server stops with an error, it is logged and the process exits.
func main() {
    http.HandleFunc("/hello", HelloServer)
    if err := http.ListenAndServe(":8000", nil); err != nil {
        log.Fatal("ListenAndServe: ", err)
    }
}

试验了上面的例子后,基本明白了HTTP基本认证的过程。但是怎么用gocolly访问呢?

参考:https://stackoverflow.com/questions/50576248/using-colly-framework-i-cant-login-to-the-evernote-account

但是答复者Matías Insaurralde提供的模拟浏览器访问的例子编译不通过,不明白其中的hptsKey的意思(hptsKey和hptshKey在这段代码里没有定义,所以编译报错;推测它们是登录页中两个隐藏字段的id,即[]byte("hpts")和[]byte("hptsh"),未经验证)。代码放在下面供参考(可跳过):

package evernote

import (
    "bytes"
    "errors"
    "fmt"
    "io/ioutil"
    "net/http"
    "net/http/cookiejar"
    "net/url"
    "regexp"
    "strings"
)

const (
    // evernoteLoginURL is the login endpoint. Login uses it for the initial
    // GET, for both form POSTs, and as the referer header value.
    evernoteLoginURL = "https://www.evernote.com/Login.action"
)

var (
    // evernoteJSParamsExpr matches statements of the form
    // document.getElementById("<id>").value = "<value>", capturing the
    // element id in group 1 and the assigned value in group 2.
    evernoteJSParamsExpr = regexp.MustCompile(`document.getElementById\("(.*)"\).value = "(.*)"`)
    // evernoteRedirectExpr captures the href target (group 1) of a
    // `Redirecting to <a href="...">` fragment in a response body.
    evernoteRedirectExpr = regexp.MustCompile(`Redirecting to <a href="(.*)">`)

    // errNoMatches is returned by extractJSParams when the page contains no
    // statement matching evernoteJSParamsExpr.
    errNoMatches   = errors.New("No matches")
    // errRedirectURL is returned by Login when the final response contains no
    // match for evernoteRedirectExpr.
    errRedirectURL = errors.New("Redirect URL not found")
)

// EvernoteClient wraps all methods required to interact with the website.
// It holds the account credentials, the HTTP client used for every request,
// and the two values extracted from the login page.
type EvernoteClient struct {
    Username   string // sent as the "username" form field
    Password   string // sent as the "password" form field in the final step
    httpClient *http.Client

    // These parameters persist during the login process:
    hpts  string // sent back as the "hpts" form field
    hptsh string // sent back as the "hptsh" form field
}

// NewEvernoteClient returns a client for the given credentials. Its HTTP
// client copies the transport, redirect policy and timeout of the net/http
// defaults and adds a fresh cookie jar, so cookies set during login are sent
// on later requests just as a browser would send them.
func NewEvernoteClient(username, password string) *EvernoteClient {
    // The error result of cookiejar.New is deliberately discarded here.
    jar, _ := cookiejar.New(nil)

    return &EvernoteClient{
        Username: username,
        Password: password,
        httpClient: &http.Client{
            Transport:     http.DefaultTransport,
            CheckRedirect: http.DefaultClient.CheckRedirect,
            Jar:           jar,
            Timeout:       http.DefaultClient.Timeout,
        },
    }
}

// extractJSParams scans body for JavaScript statements of the form
//
//     document.getElementById("<id>").value = "<value>"
//
// and stores the values assigned to the "hpts" and "hptsh" elements in e.hpts
// and e.hptsh. Statements for any other element id are ignored.
//
// It returns errNoMatches when body contains no such statement or when a
// match is missing one of its two capture groups.
//
// NOTE(review): the original compared against hptsKey/hptshKey, which are not
// defined anywhere in this snippet. The literal ids below are inferred from
// the form field names sent by Login; confirm against the real login page.
func (e *EvernoteClient) extractJSParams(body []byte) (err error) {
    matches := evernoteJSParamsExpr.FindAllSubmatch(body, -1)
    if len(matches) == 0 {
        return errNoMatches
    }
    for _, submatches := range matches {
        // Full match plus two groups are expected; report anything less
        // instead of silently succeeding.
        if len(submatches) < 3 {
            return errNoMatches
        }
        val := string(submatches[2])
        switch string(submatches[1]) {
        case "hpts":
            e.hpts = val
        case "hptsh":
            e.hptsh = val
        }
    }
    return nil
}

// Login handles the login action in three requests against evernoteLoginURL:
//
//  1. GET the login page and extract the "hpts"/"hptsh" values from it.
//  2. POST the username only (with "evaluateUsername" set) and require the
//     reply to contain `"usePasswordAuth":true`.
//  3. POST username and password and look for the redirect URL in the reply.
//
// On success the redirect URL is printed and nil is returned. Any transport
// or read error is returned as is; errRedirectURL is returned when the final
// reply contains no redirect link. Every response body is closed once read so
// the underlying connections can be reused.
func (e *EvernoteClient) Login() error {
    // First step: fetch the login page as a browser visitor would do:
    res, err := e.httpClient.Get(evernoteLoginURL)
    if err != nil {
        return err
    }
    if res.Body == nil {
        return errors.New("No response body")
    }
    body, err := ioutil.ReadAll(res.Body)
    // Close right away rather than deferring: two more requests follow.
    res.Body.Close()
    if err != nil {
        return err
    }
    if err = e.extractJSParams(body); err != nil {
        return err
    }

    // Second step: we have extracted the "hpts" and "hptsh" parameters
    // We send a request using only the username and setting "evaluateUsername":
    values := url.Values{}
    values.Set("username", e.Username)
    values.Set("evaluateUsername", "")
    values.Set("analyticsLoginOrigin", "login_action")
    values.Set("clipperFlow", "false")
    values.Set("showSwitchService", "true")
    values.Set("hpts", e.hpts)
    values.Set("hptsh", e.hptsh)

    body, err = e.postLoginForm(values, "application/json")
    if err != nil {
        return err
    }
    if !strings.Contains(string(body), `"usePasswordAuth":true`) {
        return errors.New("Password auth not enabled")
    }

    // Third step: do the final request, append password to form data:
    values.Del("evaluateUsername")
    values.Set("password", e.Password)
    values.Set("login", "Sign in")

    body, err = e.postLoginForm(values, "text/html")
    if err != nil {
        return err
    }

    // Check the body in order to find the redirect URL:
    matches := evernoteRedirectExpr.FindAllStringSubmatch(string(body), -1)
    if len(matches) == 0 || len(matches[0]) < 2 {
        return errRedirectURL
    }
    fmt.Println("Login is ok, redirect URL:", matches[0][1])
    return nil
}

// postLoginForm POSTs values, URL-encoded, to evernoteLoginURL with the given
// Accept header plus the XMLHttpRequest and referer headers, and returns the
// complete response body. The body is always closed before returning.
func (e *EvernoteClient) postLoginForm(values url.Values, accept string) ([]byte, error) {
    req, err := http.NewRequest(http.MethodPost, evernoteLoginURL, bytes.NewBufferString(values.Encode()))
    if err != nil {
        return nil, err
    }
    req.Header.Set("Accept", accept)
    req.Header.Set("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8")
    req.Header.Set("x-requested-with", "XMLHttpRequest")
    req.Header.Set("referer", evernoteLoginURL)

    res, err := e.httpClient.Do(req)
    if err != nil {
        return nil, err
    }
    defer res.Body.Close()
    return ioutil.ReadAll(res.Body)
}
After you successfully get the redirect URL, you should be able to send authenticated requests as long as you keep using the HTTP client that was used for the login process; the cookie jar plays a very important role here.

To call this code use:

// main logs in with placeholder credentials and panics if the login fails.
func main() {
    client := NewEvernoteClient("user@company", "password")
    if err := client.Login(); err != nil {
        panic(err)
    }
}

只好自己写,经反复试验,发现对于本文开头自己写的server,只需以下代码即可通过验证,输出了hello,world!(将访问方式改为POST也一样。)

package main

import (
    "fmt"

    "io/ioutil"
    "net/http"
)

// Login sends one request with HTTP Basic credentials to the local demo
// server and prints the response header, the Www-Authenticate value and the
// body. If the request cannot be built, sent or read, the error is printed
// and Login returns instead of panicking on a nil response.
func Login() {
    // Client with default settings.
    client := &http.Client{}
    // URL to visit.
    url := "http://localhost:8000/hello"
    // Request to submit.
    req, err := http.NewRequest("GET", url, nil)
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    // The most important line; the demo server accepts any username/password.
    req.SetBasicAuth("aa", "bb")
    // Print the method actually used, so the banner stays right when the
    // request is switched between GET and POST.
    fmt.Println(req.Method + "访问")
    // Send the request; res is nil when err is non-nil, so check first.
    res, err := client.Do(req)
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    defer res.Body.Close()

    fmt.Println("header:")
    fmt.Println(res.Header)
    fmt.Println("realm:")
    fmt.Println(res.Header.Get("Www-Authenticate"))
    fmt.Println("body:")
    body, err := ioutil.ReadAll(res.Body)
    if err != nil {
        fmt.Println("error:", err)
        return
    }
    fmt.Println(string(body))
}

// main runs the Basic-auth demo request once.
func main() {   
    Login()  
}

查看SetBasicAuth的定义为(liteide中在光标位置按Ctrl+shift+J):

// SetBasicAuth sets the request's Authorization header to "Basic " followed
// by the encoded form of username and password produced by basicAuth.
func (r *Request) SetBasicAuth(username, password string) {
    r.Header.Set("Authorization", "Basic "+basicAuth(username, password))
}

而basicAuth的定义为

// basicAuth joins username and password with a colon and returns the
// standard base64 encoding of the result.
func basicAuth(username, password string) string {
    auth := username + ":" + password
    return base64.StdEncoding.EncodeToString([]byte(auth))
}

那么,用gocolly访问的代码如下:

package main

import (
    "encoding/base64"
    "fmt"
    "net/http"

    "github.com/gocolly/colly"
)

// basicAuth returns the credentials part of a Basic Authorization header:
// "username:password" in standard base64.
func basicAuth(username, password string) string {
    credentials := []byte(username + ":" + password)
    return base64.StdEncoding.EncodeToString(credentials)
}
// main fetches the Basic-auth protected demo page with colly. The
// Authorization header is built by hand and passed to Request, and the
// response body is printed by the OnResponse callback. An error returned by
// Request is printed rather than dropped, so a failed fetch is visible.
func main() {
    c := colly.NewCollector()

    h := http.Header{}
    h.Set("Authorization", "Basic "+basicAuth("aaaa", "bbbb"))

    c.OnResponse(func(r *colly.Response) {
        // fmt.Println(r) // uncomment to dump the whole response
        fmt.Println(string(r.Body))
    })

    if err := c.Request("GET", "http://localhost:8000/hello", nil, nil, h); err != nil {
        fmt.Println("request failed:", err)
    }
}

注:对于其他网站,也许要用Fiddler抓包,设置相应的header和cookie才行。

posted on 2019-02-20 20:05  pu369com  阅读(3337)  评论(0)  编辑  收藏  举报

导航