day03
'''
Part 1: Remaining parts of Selenium
1. Click and clear
    click
    clear
2. ActionChains
    An action-chain object; the driver must be passed to it.
3. Switching frames
4. Executing JS code
'''
from selenium import webdriver  # web driver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
import time
driver=webdriver.Chrome(r'C:\Users\asus\Desktop\python\Scripts\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    time.sleep(5)
    driver.switch_to.frame('iframeResult')
    time.sleep(1)
    # starting block, id: draggable
    source = driver.find_element_by_id('draggable')
    # target block, id: droppable
    target = driver.find_element_by_id('droppable')
    print(source.size)
    print(source.tag_name)
    print(source.text)
    print(source.location)
    # work out the distance to slide
    distance = target.location['x'] - source.location['x']
    # hold down the starting block
    ActionChains(driver).click_and_hold(source).perform()
    # method 2: move a little at a time
    s = 0
    while s < distance:
        # build an action-chain object
        # for each small move
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2
        time.sleep(0.1)
    # release the starting block
    ActionChains(driver).release().perform()
    time.sleep(10)
    # click, clear
    # input = driver.find_element_by_id('key')
    # input.send_keys('围城')
    # find the search button by class name
    # search = driver.find_element_by_class_name('button')
    # search.click()  # click the search button
    # time.sleep(3)
    # input2 = driver.find_element_by_id('key')
    # input2.clear()
    # time.sleep(1)
    # input2.send_keys('墨菲定律')
    # input2.send_keys(Keys.ENTER)
    # time.sleep(10)
finally:
    driver.close()
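# Method 1, for comparison: a minimal sketch that does the drag in a single step with
# ActionChains.drag_and_drop (a standard ActionChains method). It assumes the same
# runoob demo page and element ids as above; the chromedriver path is an assumption.
from selenium import webdriver
from selenium.webdriver import ActionChains
import time

driver = webdriver.Chrome(r'C:\Users\asus\Desktop\python\Scripts\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    driver.switch_to.frame('iframeResult')
    source = driver.find_element_by_id('draggable')
    target = driver.find_element_by_id('droppable')
    # press on the source, move to the target, release -- all in one chain
    ActionChains(driver).drag_and_drop(source, target).perform()
    time.sleep(5)
finally:
    driver.close()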
'''
Executing JS code
'''
from selenium import webdriver
import time
driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    driver.execute_script(
        '''
        alert("浙江万里学院")
        '''
    )
    time.sleep(10)
finally:
    driver.close()
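# execute_script can also hand a value back to Python when the JS uses `return`.
# A minimal sketch, again against baidu.com; the short sleep only keeps the window
# open for a moment, as in the examples above.
from selenium import webdriver
import time

driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')
    # the value returned by the JS expression comes back as a Python object
    title = driver.execute_script('return document.title')
    print(title)
    time.sleep(3)
finally:
    driver.close()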
# Simulate the browser's back / forward buttons
import time
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.baidu.com/')
browser.get('https://www.taobao.com/')
browser.get('https://www.sina.com.cn/')
# go back
browser.back()
time.sleep(5)
# go forward
browser.forward()
time.sleep(3)
browser.close()
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver=webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('墨菲定律')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(5)
    # js_code
    # window
    # ...
    good_list = driver.find_elements_by_class_name('gl-item')
    for good in good_list:
        # print(good)
        # product name
        good_name = good.find_element_by_css_selector('.p-name em').text
        # print(good_name)
        # product link
        good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
        # print(good_url)
        # product price
        good_price = good.find_element_by_class_name('p-price').text
        # print(good_price)
        # product reviews
        good_commit = good.find_element_by_class_name('p-commit').text
        good_content = f'''
        Product name: {good_name}
        Product link: {good_url}
        Product price: {good_price}
        Product reviews: {good_commit}
        \n
        '''
        print(good_content)
        with open('jd.txt', 'a', encoding='utf-8') as f:
            f.write(good_content)
            print('Product info written successfully')
finally:
    driver.close()
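# The fixed time.sleep(5) above just hopes the goods have loaded. A common alternative
# is Selenium's explicit wait (WebDriverWait + expected_conditions), which blocks until
# the condition is met or the timeout expires. A minimal sketch, assuming the same JD
# page and the 'gl-item' class used above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('墨菲定律')
    input_tag.send_keys(Keys.ENTER)
    # wait up to 10 seconds for at least one product item to appear
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'gl-item'))
    )
    print(len(driver.find_elements_by_class_name('gl-item')))
finally:
    driver.close()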
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
def get_good(driver):
    num = 1
    try:
        time.sleep(5)
        # scroll down 5000px
        js_code = '''
        window.scrollTo(0, 5000)
        '''
        driver.execute_script(js_code)
        # wait 5 seconds for the data to load
        time.sleep(5)
        good_list = driver.find_elements_by_class_name('gl-item')
        for good in good_list:
            # print(good)
            # product name
            good_name = good.find_element_by_css_selector('.p-name em').text
            # print(good_name)
            # product link
            good_url = good.find_element_by_css_selector('.p-name a').get_attribute('href')
            # print(good_url)
            # product price
            good_price = good.find_element_by_class_name('p-price').text
            # print(good_price)
            # product reviews
            good_commit = good.find_element_by_class_name('p-commit').text
            good_content = f'''
            Product name: {good_name}
            Product link: {good_url}
            Product price: {good_price}
            Product reviews: {good_commit}
            \n
            '''
            print(good_content)
            with open('jd.txt', 'a', encoding='utf-8') as f:
                f.write(good_content)
                print('Product info written successfully')
    finally:
        driver.close()
if __name__ == '__main__':
    driver = webdriver.Chrome()
    # get_good() closes the driver in its own finally block
    driver.implicitly_wait(10)
    # send a request to JD
    driver.get('https://www.jd.com/')
    # type 墨菲定律 into the JD home-page search box and press Enter
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('墨菲定律')
    input_tag.send_keys(Keys.ENTER)
    # call the function that scrapes the product info
    get_good(driver)
'''
Part 2: BeautifulSoup4
1. What is BS4?
    A parsing library that uses a parser to help us extract the data we want.
2. Why use BS4?
    It lets us extract the data we want quickly, with concise syntax.
3. Parser types
    - lxml
    - html.parser
4. Installation and usage (see the sketch below)
'''
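# Installation: beautifulsoup4 is the library itself, and lxml is the faster optional
# parser used below (package names as published on PyPI):
#   pip install beautifulsoup4
#   pip install lxml
# A minimal usage sketch on a throwaway snippet (the HTML string here is made up):
from bs4 import BeautifulSoup
demo = BeautifulSoup('<p class="story">hello</p>', 'lxml')
print(demo.p.text)  # -> hello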
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
# soup = BeautifulSoup(html_doc, 'html.parser')
soup = BeautifulSoup(html_doc, 'lxml')
# the bs4 object
# print(soup)
# its type
# print(type(soup))
# pretty-printing
# html = soup.prettify()
# print(html)
# Traversing the document tree
# 1. direct attribute access
# print(soup.html)
# print(type(soup.html))
# print(soup.a)
# 2. get the tag name
print(soup.a.name)
# 3. get the tag attributes
print(soup.a.attrs)
print(soup.a.attrs['href'])
# 4. get the tag text
print(soup.p.text)
# 5. nested selection
print(soup.html.body.p)
# 6. child nodes
print(soup.p.children)
print(list(soup.p.children))
# 7. parent nodes
print(soup.b.parent)
print(list(soup.b.parents))
# 8. sibling nodes
print(soup.a.next_sibling)
print(list(soup.a.next_siblings))
'''
find: return the first match
find_all: return all matches
Tag lookup and attribute lookup:
Tags:
    - string filter: exact string match
        name    match on the tag name
        attrs   match on tag attributes
        text    match on the tag text
    - regex filter
        match with the re module
    - list filter
        match anything in the list
    - bool filter
        True matches any tag
    - method filter
        for lookups involving attributes you want and attributes you don't want.
Attributes:
    - class_
    - id
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
# find and find_all
# name   match on the tag name
# attrs  match on tag attributes
# text   match on the tag text
# p = soup.find(name='p')
# p_s = soup.find_all(name='p')
# print(p)
# print(p_s)
# p = soup.find(name='p', attrs={"id": "p"})
# print(p)
# tag = soup.find(name='title', text="The Dormouse's story")
# print(tag)
# tag = soup.find(name='a', attrs={"class": "sister"}, text='Elsie')
# print(tag)
import re
# regex filter
# match with the re module
a = soup.find(name=re.compile('a'))
print(a)
a_s = soup.find_all(name=re.compile('a'))
print(a_s)
# list filter
# match anything in the list
print(soup.find_all(name=['a', 'p', 'html', re.compile('a')]))
# bool filter
# True matches any tag
print(soup.find(name=True, attrs={"id": True}))
# method filter
# for lookups involving attributes you want and attributes you don't want
def foo(tag):
    print(tag.name)
    if tag.name == 'p' and tag.has_attr("id") and not tag.has_attr("class"):
        return tag
print(soup.find_all(name=foo))
# Additional points
# id
a = soup.find(id='link2')
print(a)
# class
p = soup.find(class_='sister')
print(p)
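# The same keyword arguments also work with find_all; a small sketch against the soup
# built above (class_ has the trailing underscore to avoid Python's class keyword):
print(soup.find_all('a', class_='sister'))  # every <a> whose class is "sister"
print(soup.find_all(id='link2'))            # every tag whose id is "link2"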
