Go语言爬虫1-网络请求

Posted on 2013-03-09 20:31 蝈蝈俊阅读(4625) 评论(0) 收藏举报

下面是找的几个例子：

例子1：获得百度首页的html源文件：

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main fetches the Baidu home page and prints its HTML source to stdout.
func main() {
	response, err := http.Get("http://www.baidu.com")
	if err != nil {
		// When Get fails the response is nil, so it must not be dereferenced.
		fmt.Println(err)
		return
	}
	defer response.Body.Close()

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}

例子2：在例子1的基础上增加了错误检查：

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
)

// main fetches the Baidu home page and prints it to stdout.
// On any error it prints the error and exits with status 1.
func main() {
	response, err := http.Get("http://www.baidu.com/")
	if err != nil {
		fmt.Printf("%s", err)
		os.Exit(1)
	}

	contents, err := ioutil.ReadAll(response.Body)
	// Close explicitly instead of deferring: os.Exit below terminates the
	// process immediately and would skip any deferred call.
	response.Body.Close()
	if err != nil {
		fmt.Printf("%s", err)
		os.Exit(1)
	}

	fmt.Printf("%s\n", string(contents))
}

net/http 包提供了 Get、Post、PostForm 三个函数，它们直接实现了一个简单的 HTTP 客户端。下面的例子使用 Get，并用 log.Fatal 处理错误：

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

// main downloads http://www.ghj1976.net/ and prints the response body to
// stdout, aborting through log.Fatal on any error.
func main() {
	res, err := http.Get("http://www.ghj1976.net/")
	if err != nil {
		log.Fatal(err)
	}

	body, err := ioutil.ReadAll(res.Body)
	// Close explicitly instead of deferring: log.Fatal exits the process via
	// os.Exit, which does not run deferred calls.
	res.Body.Close()
	if err != nil {
		log.Fatal(err)
	}

	fmt.Printf("%s", body)
}

例子3：把百度的网页存在本地一个文件：

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
)

// main saves the Baidu home page to baidu.html in the current directory.
func main() {
	if err := saveURL("http://www.baidu.com", "baidu.html"); err != nil {
		log.Fatal(err)
	}
}

// saveURL downloads url and writes the response body to the file at path.
// The file is created if missing and truncated if it already exists, so
// running the program repeatedly does not append duplicate copies of the page.
// It returns the first error encountered while fetching, creating, copying
// or closing.
func saveURL(url, path string) error {
	resp, err := http.Get(url)
	if err != nil {
		return fmt.Errorf("fetching %s: %w", url, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		fmt.Println(resp.StatusCode)
	}

	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("creating %s: %w", path, err)
	}

	// io.Copy streams the body and reports read and write errors; a manual
	// Read loop that stops on n == 0 can silently truncate the output.
	if _, err := io.Copy(f, resp.Body); err != nil {
		f.Close()
		return fmt.Errorf("writing %s: %w", path, err)
	}

	// A failed Close can mean buffered data never reached disk.
	if err := f.Close(); err != nil {
		return fmt.Errorf("closing %s: %w", path, err)
	}
	return nil
}

其他可以借鉴的

golang 批量检查页面

http://www.simonzhang.net/?p=1346

除了使用Get、Post、PostForm 这三个函数来建立一个简单客户端，还可以使用：
http.Client和http.NewRequest来模拟请求

例子：带上指定的公共请求头去请求百度首页：

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main requests the Baidu home page with a set of browser-like request
// headers and prints the response body when the server answers 200 OK.
func main() {
	client := &http.Client{}

	request, err := http.NewRequest("GET", "http://www.baidu.com", nil)
	if err != nil {
		fmt.Println(err)
		return
	}

	request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	request.Header.Set("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3")
	// Accept-Encoding is intentionally not set. The default transport asks for
	// gzip by itself and decodes it transparently; setting the header by hand
	// turns that off and leaves the body compressed.
	request.Header.Set("Accept-Language", "zh-CN,zh;q=0.8")
	request.Header.Set("Cache-Control", "max-age=0")
	request.Header.Set("Connection", "keep-alive")

	response, err := client.Do(request)
	if err != nil {
		// When Do fails the response is nil, so it must not be dereferenced.
		fmt.Println(err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		fmt.Println("unexpected status:", response.Status)
		return
	}

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(body))
}

参考资料：

用golang的正则regexp：去除HTML，CSS，SCRIPT代码，仅保留页面文字
http://bpbp.iteye.com/blog/1668869

刷新页面返回顶部

蝈蝈俊的技术心得