猫眼电影爬取(三):requests+pyquery,并将数据存储到mysql数据库

还是以猫眼电影为例,这次用pyquery库进行爬取

1.简单demo,看看如何使用pyquery提取信息,并将提取到的数据进行组合

# coding: utf-8
# author: hmk

import requests
from pyquery import PyQuery as pq


# Demo: fetch the first page of the Maoyan board and pull out, for every
# <dd> entry, its rank, title, release time and score using CSS selectors.
url = 'http://maoyan.com/board/4'
header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
          "Accept-Encoding": "gzip, deflate, sdch",
          "Accept-Language": "zh-CN,zh;q=0.8",
          "Cache-Control": "max-age=0",
          "Connection": "keep-alive",
          "Host": "maoyan.com",
          "Referer": "http://maoyan.com/board",
          "Upgrade-Insecure-Requests": "1",
          "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"}
# A timeout keeps the demo from hanging forever if the server never answers.
r = requests.get(url, headers=header, timeout=10)
# Decode with the encoding detected from the body content.
r.encoding = r.apparent_encoding
html = r.text
print(type(html))

doc = pq(html)
# print((doc('dd').find('.board-index')))
# print(doc('.name').text())
# print(doc('.releasetime').text())
# print(doc('dd').find('.integer').text()+doc('.fraction').text())

# Named ``movies`` rather than ``list`` so the builtin is not shadowed.
movies = []
for item in doc('dd').items():
    index = item.find('.board-index').text()
    print(index)
    movie = item.find('.name').text()
    print(movie)
    # ``release_time`` avoids clashing with the stdlib ``time`` module name.
    release_time = item.find('.releasetime').text()
    print(release_time)
    # The score is rendered as two elements (integer and fraction part);
    # concatenate their texts to get the full value.
    score = item.find('.integer').text() + item.find('.fraction').text()
    print(score)
    movies.append([index, movie, release_time, score])
print(movies)

 

2.正式代码

# coding: utf-8
# author: hmk

import requests
from pyquery import PyQuery as pq
import pymysql.cursors


def get_html(url, header):
    """Download a page and return its decoded text, or None on failure.

    Args:
        url: Address of the page to fetch.
        header: Dict of HTTP headers sent with the request.

    Returns:
        The response body as text, or None when the request fails (network
        error, timeout) or the server answers with an HTTP error status.
    """
    try:
        # A timeout keeps the crawler from blocking forever on a dead server.
        r = requests.get(url=url, headers=header, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        r.raise_for_status()
    except requests.RequestException:
        # Only request-related errors are swallowed; programming errors and
        # KeyboardInterrupt still propagate.
        return None
    # Decode with the encoding detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text


def get_data(html, list_data):
    """Parse one board page and append its movie entries to ``list_data``.

    Each ``<dd>`` element of the page yields one row
    ``[index, movie, release_time, score]`` (all strings), which is printed
    field by field and appended to ``list_data`` in place.

    Args:
        html: Page source as text. May be None or empty (for example when the
            download failed); in that case nothing is appended.
        list_data: List that receives the extracted rows.
    """
    # Nothing to parse: avoid handing None / empty text to the parser.
    if not html:
        return

    doc = pq(html)
    for item in doc('dd').items():
        index = item.find('.board-index').text()
        print(index)
        movie = item.find('.name').text()
        print(movie)
        release_time = item.find('.releasetime').text()
        print(release_time)
        # The score is split across two elements (integer and fraction part);
        # concatenate their texts to get the full value.
        score = item.find('.integer').text() + item.find('.fraction').text()
        print(score)
        list_data.append([index, movie, release_time, score])


def write_sql(data):
    """Insert scraped movie rows into the ``maoyan_movie`` MySQL table.

    ``data`` is the big list produced by the page parser: it holds one small
    list per movie, and each small list is inserted as one table row.

    Args:
        data: Iterable of rows, each a sequence of four values matching the
            columns (ranking, movie, release_time, score).

    Prints '导入成功' for every committed row and '导入失败' for every row the
    database rejected; a rejected row is rolled back and the remaining rows
    are still attempted.
    """
    # No rows: do not open a database connection at all.
    if not data:
        return

    # NOTE(review): credentials are hard-coded for the local demo database;
    # move them to configuration before using this anywhere else.
    conn = pymysql.connect(host='localhost',
                           user='root',
                           password='123456',
                           db='test',
                           charset='utf8')
    # Parameterized insert statement; the driver escapes the values.
    sql = "insert into maoyan_movie(ranking,movie,release_time,score) values(%s, %s, %s, %s)"
    try:
        with conn.cursor() as cur:
            for movie in data:
                try:
                    cur.execute(sql, movie)
                    conn.commit()  # commit each row so earlier rows survive a later failure
                    print('导入成功')
                except pymysql.Error:
                    # Discard the failed statement so the next row starts clean.
                    conn.rollback()
                    print('导入失败')
    finally:
        # Always release the connection, even if an unexpected error escapes.
        conn.close()


def main():
    """Crawl the Maoyan board page by page and store every page in MySQL.

    Pages are addressed through the ``offset`` query parameter, which advances
    by 10 per page. A page whose download fails is reported and skipped so
    that one bad request does not abort the whole crawl.
    """
    start_url = 'http://maoyan.com/board/4'
    depth = 10  # number of pages to crawl
    header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
              "Accept-Encoding": "gzip, deflate, sdch",
              "Accept-Language": "zh-CN,zh;q=0.8",
              "Cache-Control": "max-age=0",
              "Connection": "keep-alive",
              "Host": "maoyan.com",
              "Referer": "http://maoyan.com/board",
              "Upgrade-Insecure-Requests": "1",
              "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36"}

    for i in range(depth):
        url = start_url + '?offset=' + str(10 * i)
        html = get_html(url, header)
        if html is None:
            # get_html signals a failed download with None; there is nothing
            # to parse or store for this page.
            print('download failed, skipping: ' + url)
            continue
        list_data = []
        get_data(html, list_data)
        write_sql(list_data)


if __name__ == "__main__":
    main()

其实就这个例子来说,使用 pyquery 来提取信息是最简单省事的了:直接使用 CSS 选择器就可以把想要的数据拿到。

posted @ 2018-06-27 21:22  我是冰霜  阅读(908)  评论(0)  编辑  收藏  举报