1 bs4: traversing the document tree
'''
# Traversing the document tree: select nodes directly by tag name. It is fast,
# but when several tags share a name, only the first match is returned.
# 1. Basic usage
# 2. Get a tag's name
# 3. Get a tag's attributes
# 4. Get a tag's text content
# 5. Nested selection
# 6. Children and descendants
# 7. Parent and ancestors
# 8. Siblings
# (each point is demonstrated in the sketch after the setup code below)
'''
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_pp' name='lqz'>asdfasdf<b>asdfas</b><span>span<b>bbb</b></span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'lxml')
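A minimal sketch of the eight operations listed above, run against the soup built from html_doc (the output comments assume the lxml parse of this exact document):

# 1 usage: attribute access walks the tree and returns the FIRST matching tag
print(soup.p)                    # the <p class="title"> tag
# 2 tag name
print(soup.p.name)               # 'p'
# 3 attributes
print(soup.p.attrs)              # dict of all attributes
print(soup.p['class'])           # ['title'] - class is multi-valued, so a list
# 4 text content
print(soup.p.text)               # text of the tag and all its descendants
print(soup.a.string)             # 'Elsie'; .string is None when a tag has several children
# 5 nested selection
print(soup.head.title.text)      # "The Dormouse's story"
# 6 children and descendants
print(list(soup.p.children))     # direct children only
print(list(soup.p.descendants))  # all descendants, depth-first
# 7 parent and ancestors
print(soup.a.parent)             # the enclosing <p class="story">
print(list(soup.a.parents))      # every ancestor up to the document itself
# 8 siblings (note: whitespace text nodes count as siblings)
print(soup.a.next_sibling)
print(list(soup.a.next_siblings))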
2 bs4: searching the document tree
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_pp' name='lqz'>asdfasdf<b>asdfas</b><span>span<b>bbb</b></span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister1" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'lxml')
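find() and find_all() accept five kinds of filters: a string, a regular expression, a list, True, and a function. A sketch against the soup above (note that link2's class here is 'sister1', so exact class matches skip it):

import re
print(soup.find_all('a'))                      # string: exact tag name
print(soup.find_all(re.compile('^b')))         # regex: tag names starting with 'b' (body, b)
print(soup.find_all(['a','b']))                # list: tag name is 'a' OR 'b'
print(len(soup.find_all(True)))                # True: every tag in the document
print(soup.find_all(lambda tag: tag.has_attr('class') and not tag.has_attr('id')))  # function filter
# The same filters work on attributes; class needs the trailing underscore
print(soup.find_all(class_='sister'))          # link1 and link3 only; 'sister1' does not match
print(soup.find_all(id='link2'))
print(soup.find('a'))                          # find() returns only the first match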
3 Other parameters of find_all
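A sketch of the remaining keyword arguments, against the same soup:

print(soup.find_all('a', limit=2))                  # limit: stop after n matches
print(soup.html.find_all('p', recursive=False))     # [] - recursive=False searches direct children only
print(soup.body.find_all('p', recursive=False))     # the three <p> tags, direct children of <body>
print(soup.find_all(attrs={'class':'sister','id':'link1'}))  # attrs: attribute filters as a dict
print(soup.find_all(name='a', id='link3'))          # name is the tag name; other kwargs filter attributes
print(soup.find_all(text='Elsie'))                  # text: match the tag's string (newer bs4 prefers string=)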
4 CSS selectors
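soup.select() takes a standard CSS selector and always returns a list; select_one() returns the first match or None. A sketch:

print(soup.select('.sister'))        # by class
print(soup.select('#link1'))         # by id
print(soup.select('p.title > b'))    # direct child
print(soup.select('p a'))            # any descendant
print(soup.select('a[href="http://example.com/tillie"]'))  # by attribute value
print(soup.select_one('.story'))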
5 Introduction to selenium
1 An automated testing tool that drives a real browser and simulates human actions; for crawling, it solves the problem that the requests module cannot execute JavaScript/ajax, so ajax-loaded data is unreachable
2 Use selenium plus a semi-manual login to capture cookies ---> hand them to the requests module (a minimal sketch follows)
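A minimal sketch of point 2, assuming you log in by hand in the selenium-driven window first; requests only needs a plain name->value dict:

import requests
from selenium import webdriver
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('http://www.cnblogs.com')
# ... log in manually in the opened browser window here ...
cookies = {c['name']: c['value'] for c in bro.get_cookies()}
bro.close()
res = requests.get('http://www.cnblogs.com', cookies=cookies)  # requests now carries the logged-in session
print(res.status_code)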
6 Using selenium
1 Install the module: pip3 install selenium
2 Driving a browser requires a browser driver ---> download it from the vendor's site for your browser; the driver version must match the browser version
http://npm.taobao.org/mirrors/chromedriver/
3 The examples below use the Chrome browser
7 Simulated login to Baidu
from selenium import webdriver
import time
bro=webdriver.Chrome(executable_path='chromedriver.exe')  # chromedriver.exe must sit next to the script and match your Chrome version
bro.get('http://www.baidu.com')
bro.implicitly_wait(100)  # implicit wait: every later element lookup waits up to 100s
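A sketch of the rest of the login flow. The TANGRAM__PSP_11__* element ids below are what Baidu's login widget used at the time and are assumptions; they break whenever Baidu reworks the page, so re-inspect before running:

bro.find_element_by_link_text('登录').click()                       # open the login dialog
bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn').click()  # switch to username/password login (assumed id)
bro.find_element_by_id('TANGRAM__PSP_11__userName').send_keys('your_account')   # placeholder credentials
bro.find_element_by_id('TANGRAM__PSP_11__password').send_keys('your_password')
bro.find_element_by_id('TANGRAM__PSP_11__submit').click()
time.sleep(10)   # leave time for a manual captcha/SMS check - the "semi-manual" part
bro.close()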
8 Other selenium usage
import time
from selenium import webdriver
bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.get('http://www.baidu.com')
# print(bro.page_source)
# Selectors
# 1. find_element_by_id                  find an element by id
# 2. find_element_by_link_text           find an <a> tag by its exact text
# 3. find_element_by_partial_link_text   find an <a> tag by a substring of its text
# 4. find_element_by_tag_name            find by tag name
# 5. find_element_by_class_name          find by class name
# 6. find_element_by_name                find by the name attribute
# 7. find_element_by_css_selector        CSS selector
# 8. find_element_by_xpath               XPath selector
# input_1=bro.find_element_by_css_selector('#kw')
# # type text into the input box
# input_1.send_keys('美女')
#
# search_btn=bro.find_element_by_css_selector('#su')
# search_btn.click()
# Get an element's attributes, location, id, tag name, size, ...
# input_1=bro.find_element_by_css_selector('#kw')
# print(input_1.id)
# print(input_1.tag_name)
# print(input_1.get_attribute('maxlength'))
# print(input_1.location)
# print(input_1.size)
## Waiting for elements to load (explicit wait vs. implicit wait)
# bro.implicitly_wait(10) # implicit wait: applies to every later lookup; if an element has not loaded yet, wait up to 10s for it
# Explicit wait (rarely needed): wait the given seconds for one specific element
# from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By  # how to locate: By.ID, By.CSS_SELECTOR, ...
# wait=WebDriverWait(bro,10)
# wait.until(EC.presence_of_element_located((By.ID,'kw')))
## Other element operations
# Clicking
# search_btn=bro.find_element_by_css_selector('#su')
# search_btn.click() # click the element
# input_1=bro.find_element_by_css_selector('#kw')
# # type text into the input box, then clear and retype
# input_1.send_keys('美女')
# time.sleep(1)
# input_1.clear()
# input_1.send_keys('帅哥')
### Executing JavaScript
# bro.get('https://www.pearvideo.com/video_1715923')
# bro.execute_script('alert(urlMap.registerUrl)')  # read a JS variable defined by the page
# bro.execute_script('scroll(0,30000)') # scroll to the bottom; some pages lazy-load more data on scroll
# Simulating browser back/forward
#
# bro.get('http://www.baidu.com')
# bro.get('http://www.taobao.com')
# bro.get('http://www.cnblogs.com')
#
# bro.back()
#
# time.sleep(1)
# bro.forward()
# time.sleep(1)
# bro.get('http://www.cnblogs.com')
#
# time.sleep(30)
### Cookie handling
# print(type(bro.get_cookies())) # fetch all cookies - a list of dicts
# cookies=bro.get_cookies()
# import json
# with open('cookie.json','w') as f:
#     json.dump(cookies,f)
#
# # After fetching the cookies, save them to a file
# # Later, open the same site again and write the cookies back in (see the end of this section)
#
# time.sleep(1)
# # close the browser
# bro.close()
### Tab management
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')  # open a new tab via JS
#
# print(browser.window_handles) # handles of every open tab
# browser.switch_to.window(browser.window_handles[1])  # switch_to.window replaces the deprecated switch_to_window
# browser.get('https://www.taobao.com')
# time.sleep(2)
# browser.switch_to.window(browser.window_handles[0])
# browser.get('https://www.sina.com.cn')
# browser.close()
## Exception handling
try:
    browser=webdriver.Chrome()
    browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    browser.switch_to.frame('iframssseResult')  # deliberately wrong frame name (the real one is 'iframeResult') to force an exception
except Exception as e:
    print(e)
finally:
    browser.close() # always close the browser, even after an error
# Action chains (see the sketch below)
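A minimal drag-and-drop sketch with ActionChains, reusing the runoob demo page from the try/except above; 'iframeResult', 'draggable' and 'droppable' are the ids that demo page uses:

from selenium import webdriver
from selenium.webdriver import ActionChains
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
browser.implicitly_wait(10)
browser.switch_to.frame('iframeResult')            # the widgets live inside this iframe
source = browser.find_element_by_id('draggable')
target = browser.find_element_by_id('droppable')
ActionChains(browser).drag_and_drop(source, target).perform()  # queue the actions, then run them
time.sleep(3)
browser.close()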
# Using selenium to crawl JD product info - sketched at the end; first, reload the saved cookie.json into a fresh session (second half of the cookie demo above)
from selenium import webdriver
import json
import time
bro=webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('http://www.cnblogs.com')
with open('cookie.json','r') as f:
    cookie=json.load(f)
for i in cookie:
    bro.add_cookie(i)  # the browser must already be on the cookie's domain
time.sleep(1)
bro.refresh()  # after refreshing, the page shows the logged-in state
time.sleep(1)
bro.refresh()
time.sleep(4)
bro.close()
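Finally, the JD crawl announced above. A sketch only: the search-box id 'key' and the gl-item / p-name / p-price / p-img classes are what jd.com's search results used at the time of writing and are assumptions that break whenever the markup changes:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.jd.com')
bro.implicitly_wait(10)
bro.find_element_by_id('key').send_keys('围巾', Keys.ENTER)   # type into the search box (assumed id), then Enter
bro.execute_script('scroll(0,30000)')                         # scroll so lazy-loaded items render
time.sleep(2)
for good in bro.find_elements_by_class_name('gl-item'):       # one <li> per product (assumed class)
    name = good.find_element_by_css_selector('.p-name em').text
    price = good.find_element_by_css_selector('.p-price i').text
    url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
    print(name, '|', price, '|', url)
bro.close()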