爬虫学习汇总 - 愤怒中的小草

爬虫学习汇总

import requests
import os

def getHTMLText(url):
    """Fetch ``url`` with a GET request and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text, or the string "产生异常" when the request
        fails (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the headers before decoding the text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel string;
        # KeyboardInterrupt and programming errors are no longer hidden.
        return "产生异常"

def getHTMLParams(url, params):
    """Fetch ``url`` with query-string parameters and return the body as text.

    Args:
        url: Address of the page to fetch.
        params: Query parameters passed to ``requests.get`` (e.g. a dict such
            as ``{'wd': 'python'}``).

    Returns:
        The decoded response text, or the string "产生异常" when the request
        fails (connection error, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        # Same 30 s timeout as getHTMLText so a stalled server cannot hang
        # the caller indefinitely.
        r = requests.get(url, params=params, timeout=30)
        # Raises requests.HTTPError for 4xx/5xx responses.
        r.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # declared in the headers before decoding the text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are turned into the sentinel string;
        # KeyboardInterrupt and programming errors are no longer hidden.
        return "产生异常"

def getPicture(url, path):
    """Download the image at ``url`` and save it to ``path``.

    The parent directory of ``path`` is created when it is missing. A file
    that already exists at ``path`` is left untouched. The outcome is
    reported on stdout; nothing is returned, and network or filesystem
    errors are reported rather than raised.

    Args:
        url: Address of the image to download.
        path: Destination file path for the downloaded bytes.
    """
    try:
        # Derive the target directory from ``path`` so the function does not
        # depend on any module-level variable being defined by the caller.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        if os.path.exists(path):
            print('文件已存在')
            return
        r = requests.get(url, timeout=30)
        # Fail on 4xx/5xx so an error page is never stored as an image.
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
        print('文件保存成功')
    except (requests.RequestException, OSError):
        print('爬取失败')


if __name__ == "__main__":
    # Plain page fetch example:
    #   url = "http://www.baidu.com"
    #   print(getHTMLText(url))
    #
    # Query-string fetch example:
    #   kv = {'wd': 'python'}
    #   url = "http://www.baidu.com/s"
    #   r = getHTMLParams(url, kv)
    #   print(len(r))

    # Simple image-download example: the picture is stored under `root`,
    # named after the last segment of its URL.
    root = "D://pics//"
    url = "http://pic41.nipic.com/20140508/18609517_112216473140_2.jpg"
    filename = url.rsplit('/', 1)[-1]
    path = root + filename
    getPicture(url, path)
# robots protocol: for an example, see JD.com's https://www.jd.com/robots.txt
# Baidu keyword-search endpoint:
# http://www.baidu.com/s?wd=keyword

# www.ip138.com: IP geolocation lookup

posted on 2019-08-18 20:15 愤怒中的小草 阅读(166) 评论(0) 编辑 收藏 举报

会员力量，点亮园子希望

刷新页面返回顶部

愤怒中的小草

公告