day02
Scraping Douban movie information
# https://movie.douban.com/top250  (Douban Top 250)
"""
Request URL:    https://movie.douban.com/top250
Request method: GET
Request headers:
    Cookie: ll="118173"; bid=FcbGGNF-KOk; __yadk_uid=7dzeruOujD8d17GnFEhHAT3onQZG0upO; trc_cookie_storage=taboola%2520global%253Auser-id%3D75ab9050-164e-4f3a-9272-6b03c3c5da2f-tuct3cab8a8; __utma=30149280.1694679395.1558355730.1558951848.1562026729.3; __utmc=30149280; __utmz=30149280.1562026729.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1562026729; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1562026752%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.1976832345.1558355747.1558355747.1562026752.2; __utmb=223695111.0.10.1562026752; __utmc=223695111; __utmz=223695111.1562026752.2.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ap_v=0,6.0; _vwo_uuid_v2=D28E497F8217B8C1AD1F50EE89C7313BB|88e6fe36437af650fc5d0c35801be9d8; _pk_id.100001.4cf6=7597168cdb0c38aa.1558355747.2.1562026797.1558355794.
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
"""
"""
Fields to scrape for each movie:
    title, URL, genre,
    cast, director, year,
    rating, number of reviews, summary
"""

# A crawler in three steps

# 1. Send the request
import requests

def get_page(url):
    detail_res = requests.get(url)
    return detail_res

# 2. Parse the data
import re

def parse_index(html):
    """
    Pattern sketch against the page HTML:
    <li>.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?alt="(.*?)".*?<a href="(.*?)" class="">.*?>(.*?)</span>.*?</span>.*?导演: (.*?).*?主演:(.*?)<br>(.*?)</p>.*?class="rating_num" property="v:average">(.*?)</span><span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>
    """
    # The Chinese tokens (导演 / 主演 / 人评价) must stay as-is: they match the
    # literal text of the Douban page.
    zz = '<li>.*?<em class="">(.*?)</em>.*?<a href="(.*?)".*?<span class="title">(.*?)</span>.*?</span>.*?<p class="">.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>.*?<span class="inq">(.*?)</span>'
    movie_list = re.findall(zz, html, re.S)
    return movie_list
    # Captured groups, in order: rank, URL, title, director, cast,
    # year/genre, rating, number of reviews, summary

# 3. Save the data
def save_movie(movie):
    top, m_url, name, dy, actor, y_t, point, commit, desc = movie
    dy = dy.replace(' ', '')
    y_t = y_t.replace(' ', '').replace('\n', '').strip()
    data = f"""
======================================
Rank:       {top}
URL:        {m_url}
Title:      {name}
Director:   {dy}
Cast:       {actor}
Year/genre: {y_t}
Rating:     {point}
Reviews:    {commit}
Summary:    {desc}
======================================
"""
    print(data)

if __name__ == '__main__':
    num = 0
    for line in range(10):
        # Build the URL of each listing page (25 movies per page)
        url = f'https://movie.douban.com/top250?start={num * 25}&filter='
        num += 1
        print(url)
        # Request the listing page
        index_res = get_page(url)
        # Parse the page and print every movie on it
        m_list = parse_index(index_res.text)
        for i in m_list:
            save_movie(i)
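The get_page() above sends a bare requests.get(), while the notes at the top record a Cookie and a User-Agent; Douban tends to reject requests that do not look like a browser, so in practice the User-Agent is worth passing along, and the formatted records can be written to a file instead of only printed. A minimal sketch under those assumptions (the douban_top250.txt filename and the save_movie_to_file helper are illustrative choices, not part of the notes):

import requests

# User-Agent copied from the request headers recorded above; whether Douban
# currently requires it is an assumption, not something verified here.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
}

def get_page(url):
    # Same signature as above, but with a browser-like User-Agent attached
    return requests.get(url, headers=HEADERS)

def save_movie_to_file(data, path='douban_top250.txt'):
    # Append a formatted movie record to a file instead of only printing it
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)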
Basic usage of Selenium
from selenium import webdriver                                     # web driver
from selenium.webdriver.common.by import By                        # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys                    # keyboard keys
from selenium.webdriver.support import expected_conditions as EC   # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait          # wait for elements to load
import time

# Option 1: open the browser via an explicit driver path
# driver = webdriver.Chrome(r'absolute/path/to/chromedriver.exe')

# Option 2: put chromedriver.exe into <python install dir>/Scripts
#           (add that directory, or the interpreter directory, to the PATH
#           environment variable) so no path argument is needed
driver = webdriver.Chrome()

try:
    driver.get('https://www.jd.com/')

    # Explicit wait object: wait up to 10 seconds for a specific element
    wait = WebDriverWait(driver, 10)

    # Find the element whose id is "key" (JD's search box)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))
    time.sleep(5)

    # Type the product name into the search box
    input_tag.send_keys('公仔')

    # Press Enter
    input_tag.send_keys(Keys.ENTER)
    time.sleep(20)
finally:
    # Close the browser and release OS resources
    driver.close()
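The fixed time.sleep() calls above exist only so the browser stays visible while you watch. A more robust pattern is to reuse the same explicit-wait mechanism for the result page; the sketch below assumes JD renders each search result as an li.gl-item, which is a guess about JD's markup rather than something taken from these notes:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

def wait_for_results(driver, timeout=10):
    # Block until at least one result element appears instead of sleeping a fixed time
    wait = WebDriverWait(driver, timeout)
    return wait.until(EC.presence_of_all_elements_located(
        (By.CSS_SELECTOR, 'li.gl-item')  # assumed JD search-result selector
    ))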
Using Selenium selectors
from selenium import webdriver                                     # web driver
from selenium.webdriver.common.by import By                        # lookup strategies: By.ID, By.CSS_SELECTOR, ...
from selenium.webdriver.common.keys import Keys                    # keyboard keys
from selenium.webdriver.support import expected_conditions as EC   # used together with WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait          # wait for elements to load
import time

driver = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')   # open the browser via the driver

# Implicit wait: must be set BEFORE driver.get()
# Explicit wait: set up AFTER driver.get()

# Implicit wait: any element lookup will retry for up to 10 seconds
driver.implicitly_wait(10)
driver.get('https://www.baidu.com/')
# time.sleep(5)

try:
    # ===============  selector methods  ===============
    # find_element_*  returns the first matching element
    # find_elements_* returns all matching elements

    # 1. By link text
    login_link = driver.find_element_by_link_text('登录')
    login_link.click()
    time.sleep(1)  # pause so the page change is visible

    # 2. By id
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)

    # 3. By class name
    user_send = driver.find_element_by_class_name('pass-text-input-userName')  # username input
    user_send.send_keys('master-hk')  # type the username
    time.sleep(1)

    # 4. By name
    pwd_send = driver.find_element_by_name('password')  # password input
    pwd_send.send_keys('12345')  # type the password
    time.sleep(1)

    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    time.sleep(1)

    # 5. By partial link text
    login_link2 = driver.find_element_by_partial_link_text('登')
    login_link2.click()
    time.sleep(3)

    # 6. By CSS selector
    #    .  -> class
    #    #  -> id
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. By tag name (elements -> returns a list)
    div = driver.find_elements_by_tag_name('div')
    print(div)
finally:
    driver.close()  # close the browser
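The find_element_by_* helpers used above belong to the Selenium 3 API; Selenium 4 removed them in favor of a single find_element(By.…, value) call. A sketch of the same lookups in that style, reusing the driver and the Baidu ids/classes from the code above:

from selenium.webdriver.common.by import By

# Equivalent lookups with the unified Selenium 4 API
login_link = driver.find_element(By.LINK_TEXT, '登录')
user_login = driver.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
user_send = driver.find_element(By.CLASS_NAME, 'pass-text-input-userName')
pwd_send = driver.find_element(By.NAME, 'password')
login2_link = driver.find_element(By.CSS_SELECTOR, '.tang-pass-footerBarULogin')
divs = driver.find_elements(By.TAG_NAME, 'div')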