pycurl参考文档：http://pycurl.io/docs/latest/index.html 是英文文档，看起来也不是特别吃力，跟着做问题不大。

#coding=utf-8

import re
import StringIO

import pycurl

"""
简单原则：不要对 str 使用 encode，不要对 unicode 使用 decode。

如果 s 是 code_A 编码的 str，要转换成 code_B 编码，应写成：s.decode('code_A').encode('code_B')
"""

class PySpider(object):
    """Thin pycurl wrapper that sends GET and POST requests over one handle.

    A single ``pycurl.Curl`` handle is created in ``__init__`` and reused by
    every request, so options set for one request stay in effect until they
    are overwritten.  ``GetData`` and ``PostData`` therefore set every option
    they depend on explicitly instead of relying on the handle's prior state.

    Written for Python 2 (``StringIO`` module); response bodies are returned
    as ``str``.
    """

    # Extracts the charset name from ``charset=utf-8`` / ``charset="utf-8"``
    # style declarations.  The optional quote is consumed outside the group
    # so it is not returned as part of the charset name.
    _CHARSET_RE = re.compile(r"""charset=["']?([\w.:-]+)""")

    _USER_AGENT = ('User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) '
                   'Gecko/20100101 Firefox/32.0')

    def __init__(self):
        """Create the curl handle and configure cookies, redirects and timeouts."""
        self.c = pycurl.Curl()
        # COOKIEFILE is the file cookies are read from; COOKIEJAR is the file
        # they are written to.  Using the same name keeps cookies across runs.
        self.c.setopt(pycurl.COOKIEFILE, "cookie_file_name")
        self.c.setopt(pycurl.COOKIEJAR, "cookie_file_name")

        # Redirect handling.
        self.c.setopt(pycurl.FOLLOWLOCATION, 1)  # follow redirects automatically
        self.c.setopt(pycurl.MAXREDIRS, 5)       # maximum number of redirects

        # Timeouts, in seconds.
        self.c.setopt(pycurl.CONNECTTIMEOUT, 60)  # connection timeout
        self.c.setopt(pycurl.TIMEOUT, 120)        # whole-transfer timeout

        # Proxy: uncomment and adjust if one is needed.
        # self.c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        # self.c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')

    @staticmethod
    def _find_charset(page):
        """Return every charset name declared in *page* (possibly an empty list)."""
        return PySpider._CHARSET_RE.findall(page)

    def _perform(self):
        """Run the configured transfer and return ``(page, charsets)``.

        Errors raised by ``perform()`` propagate to the caller; the buffer is
        released either way.
        """
        buf = StringIO.StringIO()
        try:
            self.c.setopt(pycurl.WRITEFUNCTION, buf.write)
            self.c.perform()
            page = buf.getvalue()
        finally:
            buf.close()
        return page, self._find_charset(page)

    def close(self):
        """Release the curl handle.

        NOTE(review): libcurl writes the COOKIEJAR file when the handle is
        cleaned up, so call this once the spider is no longer needed.
        """
        self.c.close()

    # ----------------------------------- GET ----------------------------------- #
    def GetData(self, url):
        """Fetch *url* with GET.

        Returns:
            A ``(page, charsets)`` tuple: the response body and the list of
            charset names found in it.
        """
        headers = ['Accept:*/*', self._USER_AGENT]

        # The handle is shared with PostData, which switches it to POST and
        # turns on header output; reset both so call order does not matter.
        self.c.setopt(pycurl.HTTPGET, 1)
        self.c.setopt(pycurl.HEADER, False)

        self.c.setopt(pycurl.URL, url)
        self.c.setopt(pycurl.HTTPHEADER, headers)
        return self._perform()

    # ----------------------------------- POST ---------------------------------- #
    def PostData(self, url, data):
        """Send *data* to *url* with POST.

        Args:
            url: Target URL.
            data: Request body, passed to ``POSTFIELDS`` as-is; it must
                already be encoded as a string by the caller.

        Returns:
            A ``(page, charsets)`` tuple.  ``page`` contains the response
            headers followed by the body, because ``HEADER`` is enabled.
        """
        headers = [self._USER_AGENT]

        # Switch the handle to POST and attach the request body.
        self.c.setopt(pycurl.POST, 1)
        self.c.setopt(pycurl.POSTFIELDS, data)

        self.c.setopt(pycurl.URL, url)

        # Include the response headers in the output, then set the request headers.
        self.c.setopt(pycurl.HEADER, True)
        self.c.setopt(pycurl.HTTPHEADER, headers)
        return self._perform()

稍微封装了一下，就变成上面这玩意了。保存上面代码到 Spider.py 文件中，用法如下：

from Spider import PySpider
import json

spider = PySpider()

# GET request: GetData returns a (page, charsets) tuple, so unpack both values.
html, charset = spider.GetData('http://www.baidu.com')
print(html)
print(charset)

# POST request: the body must be a string, so the dict is serialized first.
postdate = json.dumps({
    'cityListName': 'gz',
    'trade': '',
})
# PostData needs the body as its second argument and also returns a tuple.
html, charset = spider.PostData('http://qy.m.58.com/m_entlist/ajax_listinfo/2', postdate)
print(html)

posted on 2016-02-24 14:33 酱油的seo学习笔记阅读(1036) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

公告