爬虫

1.selenium的使用

2.爬取京东商品信息

3.selenium拿到cookie，requests使用

4.验证码破解

5.requests-html

6.xpath选择

一.selenium的使用

selenium的简单使用看博客爬虫day2

选择器：基本用法

 　　# 1、find_element_by_id 根据id找
    # 2、find_element_by_link_text 根据链接名字找到控件（就相当于a标签的文字）
    # 3、find_element_by_partial_link_text 根据链接的名字找到控件（a标签的文字）模糊查询
    # 4、find_element_by_tag_name 根据标签名
    # 5、find_element_by_class_name 根据类名
    # 6、find_element_by_name 根据标签的name属性值找
    # 7、find_element_by_css_selector 根据css选择器
    # 8、find_element_by_xpath 根据xpath选择
    # 强调：
    # 1、上述均可以改写成find_element(By.ID,'kw')的形式
    # 2、find_elements_by_xxx的形式是查找到多个元素，结果为列表

等待元素被加载：因为代码启动速度太快了，会导致你的控件还没被加载出来，你就执行了

#1、selenium只是模拟浏览器的行为，而浏览器解析页面是需要时间的（执行css，js），一些元素可能需要过一段时间才能加载出来，为了保证能查找到元素，必须等待

#2、等待的方式分两种：
隐式等待：在browser.get（'xxx'）前就设置，针对所有元素有效
显式等待：在browser.get（'xxx'）之后设置，只针对某个元素有效

# 显式等待与隐式等待    一般都是用隐式等待
# 1、隐式等待：在查找任何元素时，如果元素尚未被加载，则最多等10秒
# browser.implicitly_wait(10) 表示对所有元素查找都生效
# 2、显式等待:显式地等待某一个元素被加载
# wait=WebDriverWait(browser,10)
# wait.until(EC.presence_of_element_located((By.ID,'content_left')))

小案例：用代码操控百度登录

 1 from selenium import webdriver
 2 import time
 3 bro=webdriver.Chrome()
 4 bro.get("http://www.baidu.com")
 5 bro.implicitly_wait(10)
 6 # 1、find_element_by_id 根据id找
 7 # 2、find_element_by_link_text 根据链接名字找到控件（就相当于a标签的文字）
 8 # 3、find_element_by_partial_link_text 根据链接的名字找到控件（a标签的文字）模糊查询
 9 # 4、find_element_by_tag_name 根据标签名
10 # 5、find_element_by_class_name 根据类名
11 # 6、find_element_by_name 根据属性名
12 # 7、find_element_by_css_selector 根据css选择器
13 # 8、find_element_by_xpath 根据xpath选择
14 # bro现在就是一个浏览器
15 dl_button = bro.find_element_by_link_text('登录')
16 dl_button.click()# 点击一下
17 user_login = bro.find_element_by_id("TANGRAM__PSP_10__footerULoginBtn")
18 user_login.click()# 在点击一下登录
19 input_name = bro.find_element_by_name('userName')
20 input_name.send_keys('2397096644@qq.com')# 往登录的框里写东西
21 input_password = bro.find_element_by_id("TANGRAM__PSP_10__password")
22 input_password.send_keys('123456789')
23 submit_button = bro.find_element_by_id("TANGRAM__PSP_10__submit")
24 submit_button.click()# 登录
25 
26 time.sleep(10)
27 # 拿到登录成功后的cookie
28 print(bro.get_cookies())
29 # 关闭浏览器
30 bro.close()
31 
32 # 显示等待隐示等待
33 # 1、隐示等待：在查找所有的元素时，如果尚未被加载，则等10秒
34 # browser.implicitly_wait(10) 表示等待所有
35 # 2、显示等待:显示等待某个元素被加载
36 # wait=WebDriverWait(browser,10)
37 # wait.until(EC.presence_of_element_located((By.ID,'content_left')))

案例

二.爬取京东商品信息

案例：

 1 from selenium import webdriver
 2 from selenium.webdriver.common.keys import Keys #键盘按键操作
 3 import time
 4 bro=webdriver.Chrome()
 5 bro.get("https://www.jd.com")
 6 bro.implicitly_wait(10)
 7 
 8 def get_goods(bro):
 9     print("------------------------------------")
10     # 拿到所有的商品
11     goods_li = bro.find_elements_by_class_name('gl-item')
12     # 循环打印一个个的商品
13     for good in goods_li:
14         # 拿到图片的地址 用的是类名选择器 a 标签的 下面的img图片
15         img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')
16         if not img_url:
17             img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
18         url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
19         price = good.find_element_by_css_selector('.p-price i').text
20         name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
21         commit = good.find_element_by_css_selector('.p-commit a').text
22         print('''
23         商品链接：%s
24         商品图片：%s
25         商品名字：%s
26         商品价格：%s
27         商品评论数：%s
28 
29         ''' % (url, img_url, name, price, commit))
30 
31     next_page = bro.find_element_by_partial_link_text("下一页")
32     time.sleep(1)
33     next_page.click()
34     time.sleep(1)
35     get_goods(bro)
36 
# Search for a keyword on the JD home page and print the result pages.
# The search steps are inside the try block so that a failed lookup still
# reaches the finally clause and the browser is not leaked.
try:
    input_search = bro.find_element_by_id('key')
    input_search.send_keys("性感内衣")
    input_search.send_keys(Keys.ENTER)  # submit with the ENTER key

    # The browser has now moved on to the search-result page.
    get_goods(bro)
except Exception as e:
    print("结束")
    # Report why crawling stopped instead of silently discarding the error.
    print("stopped by: %r" % (e,))
finally:
    # quit() ends the whole driver session; close() only closes the window.
    bro.quit()

View Code

三.selenium拿到cookie，requests使用

详细见：https://www.cnblogs.com/xiaoyuanqujing/articles/11805718.html

 1 # 获取属性：
 2 # tag.get_attribute('src')
 3 # 获取文本内容
 4 # tag.text
 5 # 获取标签ID，位置，名称，大小（了解）
 6 # print(tag.id)
 7 # print(tag.location)
 8 # print(tag.tag_name)
 9 # print(tag.size)
10 #
11 # 模拟浏览器前进后退
12 # browser.back()
13 # time.sleep(10)
14 # browser.forward()
15 
16 # cookies管理
17 # print(browser.get_cookies())  获取cookie
18 # browser.add_cookie({'name':'k1','value':'xxx'})  设置cookie（字典必须包含name和value两个键）
19 # print(browser.get_cookies())
20 #
21 # 运行js
import time

from selenium import webdriver

# --- Running JavaScript ---
# Open Baidu and have the page execute a snippet that pops up an alert box.
# NOTE(review): nothing in this listing dismisses the alert or closes `bro`.
bro = webdriver.Chrome()
bro.get("http://www.baidu.com")
bro.execute_script('alert("hello world")')  # show an alert dialog
# time.sleep(5)

# --- Tab management ---
# A second browser instance; the script asks the page to open a new window
# via JavaScript.
# import time
# from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
37 #
38 # print(browser.window_handles) #获取所有的选项卡
39 # browser.switch_to.window(browser.window_handles[1])
40 # browser.get('https://www.taobao.com')
41 # time.sleep(3)
42 # browser.switch_to.window(browser.window_handles[0])
43 # browser.get('https://www.sina.com.cn')
44 # browser.close()
45 #
46 # 动作链
47 # from selenium import webdriver
48 # from selenium.webdriver import ActionChains
49 #
50 # from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
51 # import time
52 #
53 # driver = webdriver.Chrome()
54 # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
55 # wait=WebDriverWait(driver,3)
56 # # driver.implicitly_wait(3)  # 使用隐式等待
57 #
58 # try:
59 #     driver.switch_to.frame('iframeResult') ##切换到iframeResult
60 #     sourse=driver.find_element_by_id('draggable')
61 #     target=driver.find_element_by_id('droppable')
62 #
63 #
64 # #方式一：基于同一个动作链串行执行
65 # # actions=ActionChains(driver) #拿到动作链对象
66 # # actions.drag_and_drop(sourse,target) #把动作放到动作链中，准备串行执行
67 # # actions.perform()
68 #
69 # #方式二：不同的动作链，每次移动的位移都不同
70 #
71 #
72 #     ActionChains(driver).click_and_hold(sourse).perform()
73 #     distance=target.location['x']-sourse.location['x']
74 #
75 #
76 #     track=0
77 #     while track < distance:
78 #         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
79 #         track+=2
80 #
81 #     ActionChains(driver).release().perform()
82 #
83 #     time.sleep(10)
84 #
85 #
86 # finally:
87 #     driver.close()

View Code

 1 import time
 2 from selenium import webdriver
 3 import json
 4 # browser=webdriver.Chrome()
 5 # browser.get('https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F')
 6 #
 7 # time.sleep(30)
 8 # cookie=browser.get_cookies()
 9 # print(cookie)
10 # with open('cookie.json','w')as f:
11 #     json.dump(cookie,f)
12 
13 
14 #下一次
15 # import time
16 # from selenium import webdriver
17 # import json
18 # browser=webdriver.Chrome()
19 # browser.get('https://www.cnblogs.com/')
20 # with open('cookie.json','r')as f:
21 #     di=json.load(f)
22 #
23 # cookies = {}
24 # # 获取cookie中的name和value,转化成requests可以使用的形式
25 # for cookie in di:
26 #     cookies[cookie['name']] = cookie['value']
27 # print(cookies)
28 # for cookie in di: browser.add_cookie(cookie)  # add_cookie每次只接受一个包含name/value的cookie字典，不能直接传上面的name->value映射
29 # browser.refresh()
30 #
31 # time.sleep(10)
32 
33 #request模块模拟
34 
35 
import json

import requests

# Reuse the cookies that Selenium dumped to cookie.json in a plain requests call.
with open('cookie.json', 'r') as f:
    di = json.load(f)

# Each saved entry is a full Selenium cookie record. requests only needs a
# flat {cookie name: cookie value} mapping, so keep just those two fields
# rather than copying every field of every record into the dict.
cookies = {}
for cookie in di:
    print(cookie)
    cookies[cookie['name']] = cookie['value']

print(cookies)
# A timeout keeps the script from hanging forever if the server never answers.
res = requests.get('https://i-beta.cnblogs.com/api/user',
                   cookies=cookies, timeout=10)

print(res.text)

View Code

四.验证码破解

五.requests-html

详细去百度查询中文文档

六.xpath选择

 1 doc='''
 2 <html>
 3  <head>
 4   <base href='http://example.com/' />
 5   <title>Example website</title>
 6  </head>
 7  <body>
 8   <div id='images'>
 9    <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
10    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
11    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
12    <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
13    <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
14    <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
15   </div>
16  </body>
17 </html>
18 '''
19 from lxml import etree
20 
21 html=etree.HTML(doc)
22 # html=etree.parse('search.html',etree.HTMLParser())
23 # 1 所有节点
24 a=html.xpath('//*')    #匹配所有标签
25 # 2 指定节点（结果为列表）
26 # a=html.xpath('//head')
27 # 3 子节点，子孙节点
28 a=html.xpath('//div/a')
29 a=html.xpath('//body/a') #无数据
30 a=html.xpath('//body//a')
31 # 4 父节点
32 # a=html.xpath('//body//a[@href="image1.html"]/..')
33 a=html.xpath('//body//a[1]/..')  #从1开始
34 # 也可以这样
35 a=html.xpath('//body//a[1]/parent::*')
36 # 5 属性匹配
37 a=html.xpath('//body//a[@href="image1.html"]')
38 
39 # 6 文本获取
40 a=html.xpath('//body//a[@href="image1.html"]/text()')
41 a=html.xpath('//body//a/text()')
42 
43 # 7 属性获取
44 # a=html.xpath('//body//a/@href')
45 # # 注意从1 开始取（不是从0）
46 a=html.xpath('//body//a[2]/@href')
47 # 8 属性多值匹配
48 #  a 标签有多个class类，直接匹配就不可以了，需要用contains
49 # a=html.xpath('//body//a[@class="li"]')
50 a=html.xpath('//body//a[contains(@class,"li")]/text()')
51 # a=html.xpath('//body//a[contains(@class,"li")]/text()')
52 # 9 多属性匹配
53 a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
54 a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
55 a=html.xpath('//body//a[contains(@class,"li")]/text()')
56 # 10 按序选择
57 a=html.xpath('//a[2]/text()')
58 a=html.xpath('//a[2]/@href')
59 # 取最后一个
60 a=html.xpath('//a[last()]/@href')
61 # 位置小于3的
62 a=html.xpath('//a[position()<3]/@href')
63 # 倒数第二个
64 a=html.xpath('//a[last()-2]/@href')
65 # 11 节点轴选择
66 # ancestor：祖先节点
67 # 使用了* 获取所有祖先节点
68 a=html.xpath('//a/ancestor::*')
69 # # 获取祖先节点中的div
70 a=html.xpath('//a/ancestor::div')
71 # attribute：属性值
72 a=html.xpath('//a[1]/attribute::*')
73 # child：直接子节点
74 a=html.xpath('//a[1]/child::*')
75 # descendant：所有子孙节点
76 a=html.xpath('//a[6]/descendant::*')
77 # following:当前节点之后所有节点
78 a=html.xpath('//a[1]/following::*')
79 a=html.xpath('//a[1]/following::*[1]/@href')
80 # following-sibling:当前节点之后同级节点
81 a=html.xpath('//a[1]/following-sibling::*')
82 a=html.xpath('//a[1]/following-sibling::a')
83 a=html.xpath('//a[1]/following-sibling::*[2]/text()')
84 a=html.xpath('//a[1]/following-sibling::*[2]/@href')
85 
86 print(a)

View Code

posted @ 2019-11-27 22:00 ZHANGYUZY 阅读(147) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

ZHANGYUZY

爬虫

公告