day02

爬取豆瓣电影信息

#https://movie.douban.com/top250 豆瓣官网
"""
请求url:https://movie.douban.com/top250
请求方式:get
请求头:
        Cookie: ll="118173"; bid=FcbGGNF-KOk; __yadk_uid=7dzeruOujD8d17GnFEhHAT3onQZG0upO; trc_cookie_storage=taboola%2520global%253Auser-id%3D75ab9050-164e-4f3a-9272-6b03c3c5da2f-tuct3cab8a8; __utma=30149280.1694679395.1558355730.1558951848.1562026729.3; __utmc=30149280; __utmz=30149280.1562026729.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.1.10.1562026729; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1562026752%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=223695111.1976832345.1558355747.1558355747.1562026752.2; __utmb=223695111.0.10.1562026752; __utmc=223695111; __utmz=223695111.1562026752.2.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ap_v=0,6.0; _vwo_uuid_v2=D28E497F8217B8C1AD1F50EE89C7313BB|88e6fe36437af650fc5d0c35801be9d8; _pk_id.100001.4cf6=7597168cdb0c38aa.1558355747.2.1562026797.1558355794.
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36
"""
"""
需要爬取的电影信息:
        电影名称、 电影url、电影类型
        电影主演、电影导演、电影年份
        电影评分、电影评论、电影简介
"""
#爬虫三部曲
#1.发送请求
import requests
def get_page(url):
    """Send a GET request to *url* and return the `requests.Response`.

    Douban rejects requests carrying the default `requests` User-Agent,
    so we send the desktop-browser UA documented in the request notes at
    the top of this file.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/75.0.3770.100 Safari/537.36',
    }
    detail_res = requests.get(url, headers=headers)
    return detail_res
#2.解析数据
import re
def parse_index(html):
    """Extract the movie records from one Douban Top250 index page.

    Returns a list of 9-tuples:
        (rank, url, title, director, actors, year/genre blob,
         rating, vote count, one-line quote)
    """
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>'
        '.*?<a href="(.*?)"'
        '.*?<span class="title">(.*?)</span>'
        '.*?</span>.*?<p class="">'
        '.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
        '.*?<span class="rating_num".*?>(.*?)</span>'
        '.*?<span>(.*?)人评价</span>'
        '.*?<span class="inq">(.*?)</span>',
        re.S,
    )
    return pattern.findall(html)
#电影排名
# 电影名称
# 电影url
# 电影主演
# 电影导演
# 电影年份
# 电影类型
# 电影评分
# 电影评论
# 电影简介


#3.保存数据
def save_movie(movie):
    """Format one parsed movie 9-tuple and print it as a readable card."""
    (rank, link, title, director, actors, year_genre,
     rating, votes, summary) = movie
    # Strip the literal '&nbsp;' entities and stray newlines that the
    # regex captures pick up from the page markup.
    director = director.replace('&nbsp;', '')
    year_genre = year_genre.replace('&nbsp;', '').replace('\n', '').strip()
    data = f"""
                    ======================================
                    电影排名:{rank}
                    电影链接:{link}
                    电影名称:{title}
                    电影导演:{director}
                    电影主演:{actors}
                    年份类型:{year_genre}
                    电影评分:{rating}
                    评论人数: {votes}
                    电影简介:{summary}
                    ======================================
                    \n
                    \n
                """
    print(data)


if __name__ == '__main__':
    # Douban Top250 is paginated 25 movies per page across 10 pages;
    # the `start` query parameter selects the page offset.
    for page in range(10):
        url = f'https://movie.douban.com/top250?start={page * 25}&filter='
        print(url)
        # Fetch each index page, parse it, and print every movie found.
        response = get_page(url)
        for movie in parse_index(response.text):
            save_movie(movie)

selenium的基本使用

from selenium import webdriver  # web驱动
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

import time

# 方式一: 通过驱动打开浏览器
# driver = webdriver.Chrome(r'驱动的绝对路径/webdriver.exe')

# 方式二: 把webdriver.exe驱动放到 python解释器安装目录/Scripts文件夹中
# python解释器安装目录/Scripts配置环境变量
# python解释器安装目录 配置环境变量
driver = webdriver.Chrome()

try:

    driver.get('https://www.jd.com/')

    # Explicit-wait helper: polls the page for up to 10 seconds
    # until the requested element condition is satisfied.
    wait = WebDriverWait(driver, 10)

    # Wait until the search box (id="key") is present in the DOM.
    input_tag = wait.until(EC.presence_of_element_located(
        (By.ID, 'key')
    ))

    time.sleep(5)

    # Type a product name into the search box.
    input_tag.send_keys('公仔')

    # Press ENTER to submit the search.
    input_tag.send_keys(Keys.ENTER)


    time.sleep(20)

finally:
    # quit() (not close()) terminates every window AND the chromedriver
    # process, actually releasing OS resources; close() only closes the
    # current window and can leave the driver process running.
    driver.quit()

selenium选择器使用

from selenium import webdriver  # web驱动
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.support import expected_conditions as EC  # 和下面WebDriverWait一起用的
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import time

driver = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')    # open the browser driver
# Implicit wait:
#     1. must be configured BEFORE driver.get()
# Explicit wait:
#     1. is created AFTER driver.get()

# The lookups below rely on an implicit wait.
driver.implicitly_wait(10)  # wait up to 10s for any element lookup
driver.get("https://www.baidu.com/")
# time.sleep(5)
try:
    '''
        =============== lookup methods ===================
            find_element_*  returns the first matching element
            find_elements_* returns a list of all matches
    '''
    # 1. by full link text
    login_link = driver.find_element_by_link_text('登录')
    login_link.click()  # click it
    time.sleep(1)       # pause so the transition is visible
    # 2. by id
    user_login = driver.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    # 3. by class name
    user_send = driver.find_element_by_class_name('pass-text-input-userName')   # username input box
    user_send.send_keys('master-hk')        # type the username
    time.sleep(1)
    # 4. by name attribute
    pwd_send = driver.find_element_by_name('password')          # password input box
    pwd_send.send_keys('12345')             # type the password
    time.sleep(1)
    submit = driver.find_element_by_id('TANGRAM__PSP_10__submit')
    submit.click()
    time.sleep(1)
    # 5. by partial link text
    # NOTE(review): an empty search string matches the first anchor on the
    # page — presumably a placeholder; confirm the intended link text.
    login_link2 = driver.find_element_by_partial_link_text('')
    login_link2.click()
    # BUG FIX: the original called time(3), which raises
    # TypeError ('module' object is not callable) — time.sleep was meant.
    time.sleep(3)
    # 6. find_element_by_css_selector
    # look up an element with a CSS selector
    # .name : class selector
    # #name : id selector
    login2_link = driver.find_element_by_css_selector('.tang-pass-footerBarULogin')
    login2_link.click()

    # 7. find_element_by_tag_name
    div = driver.find_elements_by_tag_name('div')
    print(div)

finally:
    # quit() shuts down the browser and the chromedriver process,
    # releasing OS resources (close() would only close one window).
    driver.quit()

 

posted @ 2019-07-02 21:06  Coder_HK  阅读(162)  评论(0)    收藏  举报