Web Scraping Fundamentals Review

Scraping basics
    Imports
        import requests
        from urllib.parse import urlencode
# urlencode — build query strings
        from urllib.request import Request
# Request — build request objects
        from urllib.parse import quote
# quote — percent-encode non-ASCII (e.g. Chinese) characters
        from urllib.request import urlopen
# urlopen — open URLs
        from fake_useragent import UserAgent
# random User-Agent generator
        import ssl
# ssl — ignore certificate errors
        from urllib.request import HTTPHandler
        from urllib.request import HTTPCookieProcessor
# HTTPCookieProcessor — handle cookies in an opener (used below)
        from urllib.request import build_opener
# build_opener — chain handlers into an opener
        from urllib.request import ProxyHandler
# ProxyHandler — route requests through a private proxy
        from http.cookiejar import MozillaCookieJar
# cookie jar that reads/writes Mozilla-format cookie files
        from urllib.error import URLError
# catch URL errors
        from lxml import etree
# etree — parse documents for XPath queries
        import http.cookiejar
# cookiejar module
        import json
# json
        import jsonpath
# jsonpath
        from selenium import webdriver
# browser driver
        from selenium.webdriver.common.keys import Keys
# the Keys package is needed for keyboard actions
    headers 
        headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
}
        headers = {
    'User-Agent':UserAgent().random
}
            from fake_useragent import UserAgent
        headers = {
    'User-Agent':UserAgent().chrome
}
        Using a UA list
            user_agent = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
                ua = random.choice(user_agent)
# requires: import random
                    headers = {'User-Agent':ua}
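            A short runnable sketch tying the UA list to a request (httpbin.org is just an echo service used here for illustration):
                import random
import requests

user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; rv2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
]
headers = {'User-Agent': random.choice(user_agent)}
# httpbin echoes back the received headers, so the chosen UA can be verified
print(requests.get('http://httpbin.org/headers', headers=headers).json())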
    url
        url = 'https://www.baidu.com/'
# the URL to request
        url = 'https://www.baidu.com/s?wd={}'.format(quote('瀚阳的小驿站'))
        args = {
    'wd':"Hany驿站",
    "ie":"utf-8"
}
            url = 'https://www.baidu.com/s?{}'.format(urlencode(args))
# urlencode(args) already yields 'wd=...&ie=utf-8', so the template only needs '?'
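        A runnable sketch of both URL-building approaches (the keywords are just examples):
            from urllib.parse import quote, urlencode

# single value: percent-encode only the keyword
print('https://www.baidu.com/s?wd={}'.format(quote('瀚阳的小驿站')))

# several parameters: let urlencode build the whole query string
args = {'wd':'Hany驿站', 'ie':'utf-8'}
print('https://www.baidu.com/s?{}'.format(urlencode(args)))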
    Getting a response
        GET requests
            params = {
    'wd':'Python'
}
                response = requests.get(url,params = params,headers = headers)
            params = {
    'wd':'ip'
}
                proxies = {
    'http':'proxy address'
    # "http":"http://user:password@120.27.224.41:16818"
}
                    response = requests.get(url, params=params, headers=headers, proxies=proxies)
            response = requests.get(url,headers = headers)
            response = requests.get(url,verify = False,headers = headers)
# verify=False skips SSL certificate verification
        Request objects
            form_data = {
    'user':'username',
    'password':'password'
}
                f_data = urlencode(form_data).encode()
# POST data must be bytes
                    request = Request(url = url,headers = headers,data = f_data)
                        handler = HTTPCookieProcessor()
                            opener = build_opener(handler)
                                response = opener.open(request)
            request = Request(url = url,headers = headers)
                response = urlopen(request)
            request = Request(url,headers=headers)
                handler = HTTPHandler()
# build the handler
                    opener = build_opener(handler)
# add the handler into build_opener
                        response = opener.open(request)
            request = urllib.request.Request(url)
                request.add_header('User-Agent', ua)
                    context = ssl._create_unverified_context()
                        response = urllib.request.urlopen(request, context = context)
                            response = urllib.request.urlopen(request, data=formdata)
            # build the request body (a translation request)
formdata = {
    'from':'en',
    'to':'zh',
    'query':word,
    # word holds the text to translate
    'transtype':'enter',
    'simple_means_flag':'3'
}
                # urlencode the formdata and convert it to bytes
formdata = urllib.parse.urlencode(formdata).encode('utf-8')
                    request = urllib.request.Request(url, headers=headers)
            # create an HTTPHandler object to process http requests
http_handler = urllib.request.HTTPHandler()
# the HTTPHandler processor object also supports HTTPS requests
                # build_opener creates an opener object that supports http requests
opener = urllib.request.build_opener(http_handler)
                    # create the request object
# fetching https with fiddler running raises a certificate error
# without fiddler, fetching https does not return the Baidu page
request = urllib.request.Request('http://www.baidu.com/')
                        # call the opener's open method to send the http request
response = opener.open(request)
        Requesting through a proxy
            proxies = {
    'http':'proxy address'
    # "http":"http://user:password@120.27.224.41:16818"
}
                response = requests.get(url,headers = headers,proxies = proxies)
            request = Request(url,headers = headers)
                handler = ProxyHandler({"http":"110.243.3.207"})
# proxy address (normally host:port)
                    opener = build_opener(handler)
                        response = opener.open(request)
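            The ProxyHandler chain end to end, as a sketch (the proxy address is a placeholder; a dead proxy raises URLError):
                from urllib.request import Request, ProxyHandler, build_opener

handler = ProxyHandler({'http': 'http://110.243.3.207:9999'})
# placeholder proxy address
opener = build_opener(handler)
request = Request('http://httpbin.org/ip', headers={'User-Agent': 'Mozilla/5.0'})
# httpbin.org/ip echoes the caller's IP, which should now be the proxy's
print(opener.open(request).read().decode('utf-8'))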
        POST requests
            data = {
    'user':'username',
    'password':'password'
}
                response = requests.post(url,headers = headers,data = data)
# pass the form parameters via data
        Using a session 
session = requests.Session()
            GET request
                session.get(info_url,headers = headers)
            POST request
                params = {
    'user':'username',
    'password':'password'
}
                    session.post(url,headers = headers,data = params)
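            A sketch of the usual login-then-browse session flow (the example.com URLs and form fields are placeholders):
                import requests

session = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0'}
params = {'user':'username', 'password':'password'}
# the session stores the cookies set by the login response ...
session.post('https://example.com/login', headers=headers, data=params)
# ... and sends them automatically on every later request
response = session.get('https://example.com/user/info', headers=headers)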
        Ignoring certificates with ssl
            context = ssl._create_unverified_context()
                response = urlopen(request,context = context)
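            Put together (the URL is only an example of a site whose certificate chain may fail verification):
                import ssl
from urllib.request import Request, urlopen

context = ssl._create_unverified_context()
# the unverified context skips certificate validation entirely
request = Request('https://www.12306.cn/', headers={'User-Agent': 'Mozilla/5.0'})
response = urlopen(request, context=context)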
        Using cookies
            form_data = {
    'user':'username',
    'password':'password'
}
}
                f_data = urlencode(form_data).encode()
                    request = Request(url = login_url,headers = headers,data = f_data)
                        cookie_jar = MozillaCookieJar()
                            handler = HTTPCookieProcessor(cookie_jar)
                                opener = build_opener(handler)
                                    response = opener.open(request)
                                        cookie_jar.save('cookie.txt',ignore_discard=True,ignore_expires=True)
# save cookies even if they are discarded or expired
            request = Request(url = info_url,headers = headers)
                cookie_jar = MozillaCookieJar()
                    cookie_jar.load('cookie.txt',ignore_expires=True,ignore_discard=True)
                        handler = HTTPCookieProcessor(cookie_jar)
                            opener = build_opener(handler)
                                response = opener.open(request)
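            Both cookie flows end to end, as a sketch (the example.com URLs and form fields are placeholders):
                from urllib.parse import urlencode
from urllib.request import Request, HTTPCookieProcessor, build_opener
from http.cookiejar import MozillaCookieJar

headers = {'User-Agent': 'Mozilla/5.0'}

# 1) log in and save the cookies
cookie_jar = MozillaCookieJar()
opener = build_opener(HTTPCookieProcessor(cookie_jar))
f_data = urlencode({'user':'username', 'password':'password'}).encode()
opener.open(Request('https://example.com/login', headers=headers, data=f_data))
cookie_jar.save('cookie.txt', ignore_discard=True, ignore_expires=True)

# 2) later: load the saved cookies and request a logged-in page
cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt', ignore_expires=True, ignore_discard=True)
opener = build_opener(HTTPCookieProcessor(cookie_jar))
response = opener.open(Request('https://example.com/user/info', headers=headers))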
        Setting a timeout
            response = requests.get(url,timeout = 0.001)
# set a request timeout in seconds
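            Exceeding the timeout raises an exception, so wrap the call:
                import requests

try:
    response = requests.get('https://www.baidu.com/', timeout=0.001)
except requests.exceptions.Timeout:
    print('request timed out')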
        cookie = http.cookiejar.CookieJar()
# create a cookie object via CookieJar to hold the cookie values
            cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
# build a processor object via HTTPCookieProcessor to handle the cookies
                opener = urllib.request.build_opener(cookie_handler)
                    headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Referer':'https://passport.weibo.cn/signin/login?entry=mweibo&r=http%3A%2F%2Fweibo.cn%2F&backTitle=%CE%A2%B2%A9&vt=',
    'Content-Type':'application/x-www-form-urlencoded',
    # 'Host': 'passport.weibo.cn',
    # 'Connection': 'keep-alive',
    # 'Content-Length': '173',
    # 'Origin':'https://passport.weibo.cn',
    # 'Accept': '*/*',
}
                        url = 'https://passport.weibo.cn/sso/login'
                            formdata = {
    'username':'17701256561', 
    'password':'2630030lzb',
    'savestate':'1',
    'r':'http://weibo.cn/',
    'ec':'0',
    'pagerefer':'',
    'entry':'mweibo',
    'wentry':'',
    'loginfrom':'',
    'client_id':'',
    'code':'',
    'qq':'',
    'mainpageflag':'1',
    'hff':'',
    'hfp':''
}
                                formdata = urllib.parse.urlencode(formdata).encode()
# POST form data must be converted to bytes before it can be sent
                                    request = urllib.request.Request(url, headers=headers)
                                        response = opener.open(request, data=formdata)
                    headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    'Referer':'https://coding.net/login',
    'Content-Type':'application/x-www-form-urlencoded',
}
                        post_url = 'https://coding.net/api/v2/account/login'
                            data = {
    'account': 'wolfcode',
    'password': '7c4a8d09ca3762af61e59520943dc26494f8941b',
    'remember_me': 'false'
}
                                data = urllib.parse.urlencode(data).encode()
# send the login request to the given post address
                                    request = urllib.request.Request(post_url, headers=headers)
                                        response = opener.open(request, data=data)
# log in through the opener
                                            # after logging in, open other pages with the same opener
    response attributes and methods
        response.getcode()
# get the HTTP status code, e.g. 200
        response.geturl()
# get the URL that was actually fetched
        response.info()
# get the server's response headers
        info = response.read()
# read the body
info.decode()
# decode it for printing
        response.read().decode()
        print(request.get_header("User-agent"))
# get a header from the request
        response.text
# response body as text (requests)
        response.encoding = 'utf-8'
        response.json()
# parse a JSON response body (requests)
        response.request.headers
# the headers of the request that was sent
        response.cookies
# get the cookies (requests)
        response.readline()
# read a single line
        response.status
# status code (http.client response; requests uses response.status_code)
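        A short requests round trip showing several of these attributes (output varies by site):
            import requests

response = requests.get('https://www.baidu.com/', headers={'User-Agent': 'Mozilla/5.0'})
response.encoding = 'utf-8'
print(response.status_code)        # e.g. 200
print(response.request.headers)    # the headers that were actually sent
print(response.cookies)            # cookies set by the server
print(response.text[:100])         # first 100 characters of the body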
    Regular expressions
        $ anchors the match at the end of the string
            ret = re.match("[\w]{4,20}@163\.com$", email)
# \w matches a letter, digit, or underscore
# {4,20} matches the preceding class 4 to 20 times
        re.match only matches from the beginning of the string
        ret = re.findall(r"\d+","Hany.age = 22, python.version = 3.7.5")
# findall returns every match; \d+ means one or more digits
        ret = re.search(r"\d+",'阅读次数为:9999')
# search scans the whole string and returns the first match
        [ ] matches any one of the listed characters
            ret = re.match("[hH]","hello Python")
# either lowercase or uppercase h
            ret = re.match("[0-35-9]Hello Python","7Hello Python")
# a digit in 0-3 or 5-9 (a comma inside [ ] would match a literal comma)
        Matching phone numbers that do not end in 4 or 7
            ret = re.match("1\d{9}[0-35-68-9]", tel)
        Masking part of a match with groups
            import re
# mask the middle four digits of a phone number
str2 = '17711602423'
pattern = re.compile('^(1[3578]\d)(\d{4})(\d{4})$')
print(pattern.sub(r'\1****\3',str2))
# raw string r'' so \1 and \3 are group references, not escapes

'''177****2423'''
        Matching Chinese characters
            pattern = re.compile('[\u4e00-\u9fa5]')
strs = '你好 Hello hany'
print(pattern.findall(strs))
                # ['你', '好']
            pattern = re.compile('[\u4e00-\u9fa5]+')
print(pattern.findall(strs))
                # ['你好']
        Parentheses make the enclosed characters a capture group
            ret = re.match("\w{4,20}@163\.com", "test@163.com")
print(ret.group())  # test@163.com
        Naming groups
            ret = re.match(r"<(?P<name1>\w*)><(?P<name2>\w*)>.*</(?P=name2)></(?P=name1)>", "<html><h1>www.itcast.cn</h1></html>")
print(ret.group())
                # <html><h1>www.itcast.cn</h1></html>
        Matching digits
            # use \d to match a digit
ret = re.match("嫦娥\d号","嫦娥1号发射成功")
print(ret.group())
        | matches either of the two alternatives
            ret = re.match("[1-9]?\d$|100","78")
print(ret.group())  # 78

        ? matches the preceding character zero or one time
            ret = re.match("[1-9]?\d[1-9]","33")
print(ret.group())
                # 33
            ret = re.match("[1-9]?\d","33")
print(ret.group())
                # 33
        * matches the preceding character zero or more times
            ret = re.match("[A-Z][a-z]*","MnnM")
print(ret.group())
                # Mnn
            ret = re.match("[A-Z][a-z]*","Aabcdef")
print(ret.group())
                # Aabcdef
        + matches the preceding character one or more times
            import re
# + matches the preceding character once or unlimited times
names = ["name1", "_name", "2_name", "__name__"]

for name in names:
    ret = re.match("[a-zA-Z_]+[\w]*",name)
    if ret:
        print("variable name %s is valid" % ret.group())
    else:
        print("variable name %s is invalid" % name)
                variable name name1 is valid
variable name _name is valid
variable name 2_name is invalid
variable name __name__ is valid
        Back-referencing a group within the pattern
            # reference the text captured by a group; note the raw string, i.e. the r"" form
ret = re.match(r"<([a-zA-Z]*)>\w*</\1>", "<html>hh</html>")
# </\1> must match whatever group 1 captured
print(ret.group())
                # <html>hh</html>
            ret = re.match(r"<(\w*)><(\w*)>.*</\2></\1>", label)
        Greedy vs. non-greedy matching
            ret = re.match(r"aa(\d+)","aa2343ddd")
# greedy: match as many characters as possible
print(ret.group())
                # aa2343
            # a ? after the quantifier makes it non-greedy
ret = re.match(r"aa(\d+?)","aa2343ddd")
# now it matches as few digits as possible, so only one
print(ret.group())
                # aa2
        Splitting strings with split
            str1 = 'one,two,three,four'
pattern = re.compile(',')
# split the string on commas and return the pieces
print(pattern.split(str1))
                # ['one', 'two', 'three', 'four']
            str2 = 'one1two2three3four'
print(re.split('\d+',str2))
                # ['one', 'two', 'three', 'four']
        subn replaces matches and also returns the replacement count
            pattern = re.compile('\d+')
strs = 'one1two2three3four'
print(pattern.subn('-',strs))
                # ('one-two-three-four', 3)  3 is the replacement count
        sub replaces the matched text
            pattern = re.compile('\d')
str1 = 'one1two2three3four'
print(pattern.sub('-',str1))
                # one-two-three-four
            print(re.sub('\d','-',str1))
                # one-two-three-four
        Extracting an image URL
            src="https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_201611131917_small.jpg"
ret = re.search(r"https://.*?\.jpg", src)
print(ret.group())
                # https://rpic.douyucdn.cn/appCovers/2016/11/13/1213973_201611131917_small.jpg
        {m} matches the preceding character exactly m times
            res = re.compile('[a-zA-Z]{1}')
strs = '123abc456'
print(re.search(res,strs).group( ))
                # a
            res = re.compile('[a-zA-Z]{1}')
strs = '123abc456'
print(re.findall(res,strs))  # findall returns a list whose elements have no group() method
                # ['a', 'b', 'c']
        Groups and group()
            strs = 'hello 123,world 456'
pattern = re.compile('(\w+) (\d+)')
for i in pattern.finditer(strs):
    print(i.group(0))
    print(i.group(1))
    print(i.group(2))  # when a second group exists
                hello 123
hello
123
world 456
world
456
            print(pattern.sub(r'\2 \1',strs))
# output group 2 first, then group 1
                # 123 hello,456 world
            print(pattern.sub(r'\1 \2',strs))
# output group 1 first, then group 2
                # hello 123,world 456
    Suppressing warnings
        requests.packages.urllib3.disable_warnings()
# silences urllib3 warnings (e.g. InsecureRequestWarning when verify=False)
    quote encoding
        urllib.parse.quote()   encodes everything except letters, digits, and _.-~ (plus / by default)
        urllib.parse.quote_plus()   additionally encodes /  (and turns spaces into +)
        url = 'kw=中国'
            urllib.parse.quote(url)
            urllib.parse.quote_plus(url)
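        A quick demonstration of the difference:
            from urllib.parse import quote, quote_plus

print(quote('a/b c'))        # a/b%20c   '/' kept, space becomes %20
print(quote_plus('a/b c'))   # a%2Fb+c   '/' encoded, space becomes +
print(quote('kw=中国'))       # kw%3D%E4%B8%AD%E5%9B%BD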
    Saving a URL's content to a file
        urllib.request.urlretrieve(url, 'filename.ext')
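        For example (the image URL is only an illustration):
            from urllib.request import urlretrieve

# downloads the resource and writes it straight to logo.png
urlretrieve('https://www.baidu.com/img/bd_logo1.png', 'logo.png')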
    json
        # decode the bytes into a utf-8 string
data = data.decode('utf-8')
            # convert the JSON-format string into a Python object
obj = json.loads(data)
                # with ascii disabled, the written data comes out correctly
html = json.dumps(obj, ensure_ascii=False)
                    # dumps returns a str which is written as utf-8, so Chinese characters survive
# specify encoding when writing the file, otherwise the system default encoding is used
        loads
            the string holds a list
                string = '[1, 2, 3, 4, "haha"]'
json.loads(string)
            the string holds a dict
                str_dict = '{"name":"goudan", "age":100, "height":180}'
json.loads(str_dict)
            obj = json.load(open('jsontest.json', encoding='utf-8'))
# load reads a JSON-format string from a file and converts it to a Python object
        dumps
            json.dumps() escapes non-ASCII characters by default
# pass ensure_ascii=False to disable that and keep utf-8 characters
            json.dump(str_dict, open('jsontest.json', 'w', encoding='utf-8'), ensure_ascii=False)
# dump serializes the object and writes it straight to a file
        load
            obj = json.load(open('book.json', encoding='utf-8'))
                book = jsonpath.jsonpath(obj, '$..book')
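        The four functions in one runnable round trip:
            import json

obj = json.loads('{"name":"goudan", "tags":["爬虫", "json"]}')    # str -> object
s = json.dumps(obj, ensure_ascii=False)                           # object -> str
with open('jsontest.json', 'w', encoding='utf-8') as f:
    json.dump(obj, f, ensure_ascii=False)                         # object -> file
obj2 = json.load(open('jsontest.json', encoding='utf-8'))         # file -> object
print(obj == obj2)  # True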
    Saving files
        # read() returns html as bytes
html = response.read()
            # convert the bytes to a string
html = html.decode('utf-8')
                # html is a str now, so open in text mode with an explicit encoding (raw bytes would need 'wb')
fp = open('baidu.html', 'w', encoding='utf-8')
                    fp.write(html)
                        fp.close()
        html = response.read()
                with open(filename, 'wb') as f:
                    f.write(html)
        # read() returns the body as bytes
data = response.read()
            # decode the bytes into a utf-8 string
data = data.decode('utf-8')
                # convert the JSON-format string into a Python object
obj = json.loads(data)
                    # with ascii disabled, the written data comes out correctly
html = json.dumps(obj, ensure_ascii=False)
                        # dumps returns a str which is then written as utf-8, so Chinese characters survive
# specify encoding when opening the file, otherwise the system default encoding is used
                            with open('json.txt', 'w', encoding='utf-8') as f:
    f.write(html)
    etree
        html_tree = etree.parse('file.html')
# build a tree object by reading a file
        XPath usage
            result = html_tree.xpath('//li')
# all li tags
            result = html_tree.xpath('//li/@class')
# the class attribute of every li tag
            result = html_tree.xpath('//li/a[@href="link1.html"]')
# every a under an li whose href is link1.html
            result = html_tree.xpath('//li[last()]/a/@href')
# href of the a inside the last li (xpath still returns a list)
            result = html_tree.xpath('//*[@class="mimi"]')
# every node whose class is mimi
            result = html_tree.xpath('//li[@class="popo"]/a')
# all a nodes inside the matching li nodes
                result = html_tree.xpath('//li[@class="popo"]/a/text()')
# the text of all a nodes inside the matching li nodes
                result = html_tree.xpath('//li[@class="popo"]/a')[0].text
# the text of the first matching a node
            with Scrapy selectors, append .extract() after xpath()
                when only one element is expected, use .extract_first()
        tostring
            etree.tostring(result[0]).decode('utf-8')
# convert a tree node back to a string
            html = etree.tostring(html_tree)
print(html.decode('utf-8'))
        etree.HTML
            html_tree = etree.HTML(html_string)
# parse an HTML string (not a file name) into a document tree
            html_bytes = response.read()
                html_tree = etree.HTML(html_bytes.decode('utf-8'))
            response = requests.get(url,headers = headers)
e = etree.HTML(response.text)
                img_path = '//article//img/@src'
img_urls = e.xpath(img_path)
        string(.) method
            call it on a single element from an XPath result list
                ret = score.xpath('string(.)').extract()[0]
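        A self-contained XPath example over an inline HTML string:
            from lxml import etree

html_tree = etree.HTML('<ul><li class="popo"><a href="link1.html">first</a></li>'
                       '<li class="popo"><a href="link2.html">second</a></li></ul>')
print(html_tree.xpath('//li[@class="popo"]/a/text()'))   # ['first', 'second']
print(html_tree.xpath('//li[last()]/a/@href'))           # ['link2.html']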
    BeautifulSoup
        Creating the soup
            soup = BeautifulSoup(open('file.html', encoding='utf-8'), 'lxml')
            soup = BeautifulSoup(driver.page_source, 'lxml')
        # the first matching tag in the whole document
            soup.title
            soup.a
            soup.ul
        a_tag = soup.a  
            a_tag.name
# the tag's name
            a_tag.attrs
# all of the tag's attributes, as a dict
            a_tag.get('href')
# get href
            a_tag['title']
# the a tag's title value
            a_tag.string
# the a tag's text content
        Getting a tag's child nodes
            contents 
                soup.div.contents
# all children of the div tag
                soup.head.contents[1]
# the second child of head
            children
                # .children is a generator, which can be iterated
                    # iterate the generator and print each child
for child in soup.body.children:
    print(child)
                    # children walks only direct children
for child in soup.div.children:
    print(child)
                    # descendants walks all descendants recursively
for child in soup.div.descendants:
    print(child)
        find_all: find every match
            soup.find_all(re.compile('^b'))
# pass a regex: every tag whose name starts with b
            soup.find_all(['a', 'b'])
# pass a list: every a tag and every b tag
        select method
            soup.select('a')
# by tag name
            soup.select('.aa')
# by class name
            soup.select('#wangyi')
# by id
            soup.select('div .la')
# combined: .la elements anywhere inside a div
            soup.select('div > .la')
# direct child only
            soup.select('input[class="haha"]')
# by attribute: input tags whose class is haha
            soup.select('.la')[0].get_text()
# get the text after selecting, via get_text(); remember the index
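        A self-contained example over an inline snippet:
            from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="wangyi"><a class="la" href="x.html" title="t">hi</a></div>', 'lxml')
print(soup.a.get('href'))                        # x.html
print(soup.a.attrs)                              # {'class': ['la'], 'href': 'x.html', 'title': 't'}
print(soup.select('#wangyi .la')[0].get_text())  # hi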
    jsonpath
        jsonpath method
            obj = json.load(open('book.json', encoding='utf-8'))
                book = jsonpath.jsonpath(obj, '$..book')
# all books
                    authors = jsonpath.jsonpath(obj, '$..book..author')
# every author of every book
                        # first two books:  '$..book[:2]'
# last two books:  '$..book[-2:]'
                            book = jsonpath.jsonpath(obj, '$..book[0,1]')
# the first two books
                                book = jsonpath.jsonpath(obj, '$..book[?(@.isbn)]')
# every book that has an isbn attribute
                                    book = jsonpath.jsonpath(obj, '$.store.book[?(@.price<10)]')
# every book priced under 10
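            The same expressions against an inline object (a cut-down version of the usual book-store sample):
                import json
import jsonpath

obj = json.loads('{"store":{"book":['
                 '{"author":"A","price":8.95},'
                 '{"author":"B","price":12.99,"isbn":"0-553-21311-3"}]}}')
print(jsonpath.jsonpath(obj, '$..author'))               # ['A', 'B']
print(jsonpath.jsonpath(obj, '$..book[?(@.isbn)]'))      # the book that has an isbn
print(jsonpath.jsonpath(obj, '$..book[?(@.price<10)]'))  # the book priced under 10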
        XPath vs. JSONPath syntax comparison
    Supplementary material
        day01
        http
            status codes
            protocol overview
        fiddler
            overview
        environment setup
        types
        questions
        day02
        day03
        day04
        common functions
    webdriver methods
        Setting up the driver
            driver = webdriver.PhantomJS()
            driver = webdriver.PhantomJS(executable_path="./phantomjs")
# if PhantomJS's location is not set in an environment variable
            driver methods
                text  
# get the element's text
                get_attribute('href')
# get an attribute of the element
                Find an element by id
element = driver.find_element_by_id("passwd-id")
                    driver.find_element_by_id('kw').send_keys('中国')
                    driver.find_element_by_id('su').click()
# click the "百度一下" (search) button
                    yanzheng = input('Enter the captcha: ')
driver.find_element_by_id('captcha_field').send_keys(yanzheng)
                    for x in range(1, 3):
    driver.find_element_by_id('loadMore').click()
    time.sleep(3)
    driver.save_screenshot(str(x) + '.png')
                Find an element by name
element = driver.find_element_by_name("user-name")
                Find an element by tag name
element = driver.find_element_by_tag_name("input")
                Match via XPath
element = driver.find_element_by_xpath("//input[@id='passwd-id']")
                Match via a CSS selector
element = driver.find_element_by_css_selector("#food span.dairy.aged")
                Get the current url
driver.current_url
                Quit the browser
driver.quit()
                driver.save_screenshot('name.png')
# save the current page as a screenshot image
                driver.execute_script(js)
# execute a javascript snippet in the page
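                An end-to-end sketch in the same Selenium 3 style API used above (PhantomJS is deprecated, so this assumes a local Chrome plus chromedriver instead):
                    import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get('https://www.baidu.com/')
driver.find_element_by_id('kw').send_keys('中国', Keys.ENTER)
time.sleep(2)
print(driver.current_url)
driver.save_screenshot('result.png')
driver.quit()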