# day03 — web-scraping notes (Selenium + BeautifulSoup)
# JD.com product search implemented with Selenium.
# Searches for a keyword, scrapes name / link / price of every result item,
# prints each record and appends it to 商品信息.txt.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    # Bug fix: URL was 'https://wwww.jd.com/' (four w's), which does not resolve.
    driver.get('https://www.jd.com/')
    # Renamed from `input` / `list` to avoid shadowing the builtins.
    search_box = driver.find_element(By.ID, 'key')
    search_box.send_keys('墨菲定律')
    search_box.send_keys(Keys.ENTER)
    time.sleep(1)
    items = driver.find_elements(By.CLASS_NAME, 'gl-item')
    for item in items:
        # product name
        item_name = item.find_element(By.CSS_SELECTOR, '.p-name em').text
        # product link
        item_url = item.find_element(By.CSS_SELECTOR, '.p-name a').get_attribute('href')
        # product price
        item_price = item.find_element(By.CLASS_NAME, 'p-price').text
        # Build the record once instead of duplicating the f-string
        # for print() and for the file write.
        data = f"""
商品名称:{item_name}
商品链接:{item_url}
商品价格:{item_price}
"""
        print(data)
        with open('商品信息.txt', 'a', encoding='utf-8') as f:
            f.write(data)
        print("商品信息写入完成!!!!!")
finally:
    driver.close()
# Advanced crawler: scrolls each JD results page so lazy-loaded items render,
# scrapes item data, then pages forward via the "next page" button until the
# last page is reached.
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def get_x(driver):
    """Scrape name/link/price/comment-count of every item on the current
    results page, append each record to 商品信息.txt, then follow the
    'next page' (pn-next) button.

    Iterates page-by-page instead of the original unbounded recursion,
    and stops cleanly when the next-page button is missing instead of
    letting NoSuchElementException propagate.

    Note: the caller owns the driver; this function no longer closes it
    (the original closed it in a finally, then the caller closed it again).
    """
    while True:
        # Scroll down so lazy-loaded items render, then wait for the load.
        driver.execute_script("window.scrollTo(0, 5000)")
        time.sleep(2)
        items = driver.find_elements(By.CLASS_NAME, 'gl-item')
        for item in items:
            item_name = item.find_element(By.CSS_SELECTOR, '.p-name em').text
            # product link
            item_url = item.find_element(By.CSS_SELECTOR, '.p-name a').get_attribute('href')
            # product price
            item_price = item.find_element(By.CLASS_NAME, 'p-price').text
            # comment count
            item_commit = item.find_element(By.CLASS_NAME, 'p-commit').text
            data = f"""
商品名称:{item_name}
商品链接:{item_url}
商品价格:{item_price}
商品评价:{item_commit}
"""
            print(data)
            with open('商品信息.txt', 'a', encoding='utf-8') as f:
                f.write(data)
            print("商品信息写入完成!!!!!")
        # Stop when there is no "next page" button left (last results page).
        try:
            next_btn = driver.find_element(By.CLASS_NAME, 'pn-next')
        except NoSuchElementException:
            break
        next_btn.click()
        time.sleep(5)


if __name__ == '__main__':
    driver = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')
    try:
        driver.implicitly_wait(10)
        # Bug fix: URL was 'https://wwww.jd.com/' (four w's).
        driver.get('https://www.jd.com/')
        search_box = driver.find_element(By.ID, 'key')  # avoid shadowing input()
        search_box.send_keys('墨菲定律')
        search_box.send_keys(Keys.ENTER)
        get_x(driver)
    finally:
        driver.close()
# Selenium interaction demo: ActionChains drag on the runoob droppable
# example page, moving the element gradually (harder for anti-bot checks
# to detect than a one-shot drag_and_drop).
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')
try:
    driver.implicitly_wait(10)
    driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    time.sleep(5)
    # The demo lives inside an iframe; switch into it before locating elements.
    driver.switch_to.frame('iframeResult')
    time.sleep(1)
    src = driver.find_element(By.ID, 'draggable')
    tge = driver.find_element(By.ID, 'droppable')

    # Option 1: instant move — queue the action, then perform() executes it:
    #   ActionChains(driver).drag_and_drop(src, tge).perform()

    # Option 2: slow move (anti-anti-crawler).
    # element.location is an {'x': ..., 'y': ...} dict, element.size its
    # dimensions; the travel distance is the horizontal gap between them.
    distance = tge.location['x'] - src.location['x']
    # Press and hold the source element.
    ActionChains(driver).click_and_hold(src).perform()
    moved = 0
    while moved < distance:
        ActionChains(driver).move_by_offset(xoffset=5, yoffset=0).perform()
        moved += 5
        time.sleep(0.1)
    # Bug fix: release() only QUEUES the action — without perform() the
    # mouse button was never actually released.
    ActionChains(driver).release().perform()
finally:
    driver.close()
# bs4 selectors: demonstrates traversing the parsed document tree.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')
# print(soup)
# print(type(soup))

# Traversing the document tree
# 1. Direct attribute access (returns the FIRST matching tag) *****
print(soup.html)
print(type(soup.html))
print(soup.a)
print(soup.p)
# 2. Tag name
print(soup.a.name)
# 3. Tag attributes *****
print(soup.a.attrs)          # every attribute of the first <a>
print(soup.a.attrs['href'])
# 4. Tag text content *****
print(soup.p.text)           # $37
# 5. Nested selection
print(soup.html.body.p)
# 6. Children / descendants
print(soup.p.children)       # iterator object
print(list(soup.p.children)) # [<b>$37</b>]
# 7. Parent / ancestors
print(soup.b.parent)
print(soup.b.parents)
print(list(soup.b.parents))
# 8. Siblings
print(soup.a)
# next sibling
print(soup.a.next_sibling)
# all following siblings (generator)
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))
# previous sibling
print(soup.a.previous_sibling)
# all preceding siblings (generator)
print(list(soup.a.previous_siblings))
# bs4 installation & basic usage: parse an HTML document and pretty-print it.
from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
print(soup)
print(type(soup))
# prettify() re-indents the document into readable HTML.
html = soup.prettify()
print(html)
# bs4: searching the document tree with find() / find_all().
# Most examples are kept commented out, as study notes to uncomment one at a time.
from bs4 import BeautifulSoup
import re

html_doc = """
<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<a href="http://example.com/elsie" class="sister" >Elsie</a><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>and they lived at the bottom of a well.</p><p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# find() returns the first match, find_all() returns every match.
# p = soup.find(name='p')
# p_s = soup.find_all(name='p')
# print(p)
# print(p_s)

# a = soup.find(name=re.compile('a'))
# a_all = soup.find_all(name=re.compile('a'))
# print(a)
# print(a_all)

# # link = soup.find(attrs={"id": re.compile('link')})
# # link_a = soup.find_all(attrs={"id": re.compile('link')})
# # print(link)
# print(link_a)

"""
String filters — find()'s main keyword arguments (name, attrs, text)
"""
# p = soup.find(name='p')
# p_s = soup.find_all(name='p')
# print(p)
# print(p_s)

# name + attrs
# p = soup.find(name='p', attrs={'id': 'p'})
# print(p)

# name + text
# tag = soup.find(name='title', text="The Dormouse's story")
# print(tag)

# name + attrs + text
# tag = soup.find(name='a', attrs={"class": re.compile('si')}, text="Elsie")
# tag = soup.find(name='a', attrs={"class": "sister"}, text="Elsie")
# print(tag)

"""
Regex filters — pass compiled re patterns (requires the re module)
"""
# name: match any tag whose name contains 'a'
# a = soup.find(name=re.compile('a'))
# print(a)
# a_s = soup.find_all(name=re.compile('a'))
# print(a_s)