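"""Use Selenium to automate a search, obtain the resulting URL, then scrape and save its images.

Summary: the entry-point code is poorly written and the flow is verbose,
but it exercises many different techniques.
"""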
import os
from chrome_Demo.handless import shaer_browser
import time
import requests
from lxml import etree
import urllib.request
from urllib.request import urlparse
def browser_url():
    """Search Baidu in a headless browser and return the first result's URL.

    The function opens the Baidu home page, types the hard-coded query into
    the search box, submits it, clicks the first result title, saves a
    screenshot of the opened page to ``mn.png`` and reads that page's URL.

    Returns:
        The ``current_url`` reported by the browser for the result page.

    The browser is always shut down, even when one of the steps raises.
    """
    browser = shaer_browser()  # build the headless browser
    # path = './chromedriver.exe'
    # browser= webdriver.Chrome(path)
    try:
        browser.get('https://www.baidu.com/')
        # The generic find_element(by, value) form is used because the
        # find_element_by_* helpers were removed in Selenium 4.3; "id" and
        # "xpath" are the string values of By.ID and By.XPATH.
        search_box = browser.find_element('id', 'kw')  # Baidu search input
        search_box.send_keys('下厨房')  # type the search keyword
        time.sleep(1)
        search_button = browser.find_element('id', 'su')  # "Baidu it" button
        search_button.click()
        time.sleep(2)

        # Remember the results tab so a newly opened tab can be identified.
        origin_handle = browser.current_window_handle
        first_hit = browser.find_element('xpath', '//*[@id="1"]/h3/a[1]')
        first_hit.click()  # normally opens the target site in a new tab

        # Switch to the new tab when there is one; if the link opened in the
        # same tab there is nothing to switch to (indexing [1] would raise).
        opened = [h for h in browser.window_handles if h != origin_handle]
        if opened:
            browser.switch_to.window(opened[-1])

        browser.save_screenshot('mn.png')  # screenshot of the result page
        # browser.get_screenshot_as_png()
        time.sleep(6)  # give redirects time to settle before reading the URL
        return browser.current_url
    finally:
        # Release the browser process no matter how the steps above ended.
        browser.quit()
# Locate the data with XPath
def create_req(url):
    """Download *url* and return the image addresses found in its headline block.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list produced by the XPath query: the ``src`` attribute of every
        ``<img>`` nested under ``ul > li > a`` inside a ``div`` whose class
        attribute is exactly ``headline``.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if reading/decoding
    # fails; the timeout keeps a stalled server from hanging the script.
    with urllib.request.urlopen(req, timeout=30) as res:
        html = res.read().decode('utf-8')
    x_etr = etree.HTML(html)
    img_list = x_etr.xpath('//div[@class="headline"]//ul//li//a//img/@src')
    return img_list
# Create the folder and save the data
def save_data(img_list):
    """Download every image URL in *img_list* into the ``mei_shi`` folder.

    Each file is stored under ``./mei_shi`` using the URL path (without the
    leading slash and without anything from the first ``@`` onward) as its
    relative file name; intermediate directories are created on demand.

    Args:
        img_list: Iterable of image URL strings.

    Entries whose path yields no usable file name, or whose path would
    resolve outside ``mei_shi``, are skipped.
    """
    img_dir = os.path.join(os.curdir, 'mei_shi')
    root = os.path.abspath(img_dir)
    for img in img_list:
        o = urlparse(img)
        # Drop the leading "/" and any "@..." processing suffix.
        filename = o.path[1:].split('@')[0]
        if not filename or filename.endswith('/'):
            # No file name to write to (bare host or directory-like path).
            continue
        filepath = os.path.join(img_dir, filename)
        # The path comes from a scraped page: refuse anything ("..", a
        # leading "/") that would land outside the download directory.
        if not os.path.abspath(filepath).startswith(root + os.sep):
            continue
        # makedirs creates mei_shi itself and any nested folders; a plain
        # mkdir fails when the parent directory does not exist yet.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        # o.path already starts with "/", so no extra separator is added.
        url = '%s://%s%s' % (o.scheme, o.netloc, o.path)
        # stream=True lets iter_content write chunk by chunk instead of
        # buffering the whole image; the with-block releases the connection.
        with requests.get(url, stream=True, timeout=30) as p:
            with open(filepath, 'wb') as fp:
                for block in p.iter_content(1024):
                    fp.write(block)
def main():
    """Run the pipeline: find the page URL, collect its image links, save them."""
    # Each stage feeds the next: browser search -> image URL list -> download.
    save_data(create_req(browser_url()))


if __name__ == '__main__':
    main()