Python 爬虫实战:抓取豆瓣 Top250 并保存到 MySQL

采集地址:https://movie.douban.com/top250

一、创建 MySQL 数据表

-- Target table for the scraper: one row per list entry taken from the
-- Douban Top 250 pages. Column names must match the INSERT statement used
-- by the Python code (num, name, charactor, remark, score).
-- NOTE(review): AUTO_INCREMENT=273 looks like a leftover from an exported
-- table rather than an intentional starting id -- confirm before reuse.
CREATE TABLE `t_doubantop` (
  -- Surrogate key generated by MySQL.
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  -- Rank number of the entry as shown on the list page.
  `num` int(11) DEFAULT NULL,
  -- Title text (first <span> of the list item).
  `name` varchar(255) DEFAULT NULL,
  -- Cleaned text of the item's first <p> (the scraper labels it "director").
  `charactor` varchar(255) DEFAULT NULL,
  -- One-line quote, or a placeholder text when the entry has none.
  `remark` varchar(255) DEFAULT NULL,
  -- Rating as scraped; stored as text, not as a number.
  `score` varchar(255) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=273 DEFAULT CHARSET=utf8;

二、采集代码

from bs4 import BeautifulSoup
import pymysql
import requests
import re
import os


# Connect to the database
def connect_db():
    """Open and return a PyMySQL connection to the local ``test`` database.

    All connection settings are hard-coded: MySQL on 127.0.0.1:3306,
    user ``root``.

    Returns:
        The connection object produced by ``pymysql.connect``.
    """
    settings = {
        "host": "127.0.0.1",
        "port": 3306,
        "user": "root",
        "password": "password",
        "db": "test",
        # The charset has to be set explicitly; otherwise the Chinese text
        # scraped from the site runs into encoding problems.
        "charset": "utf8",
        "use_unicode": True,
    }
    return pymysql.connect(**settings)

def get_html(web_url, timeout=10):
    """Download one listing page and return the ``<li>`` tags of its first ``<ol>``.

    Only the list items are returned so that callers receive just the part
    of the page they need.

    Args:
        web_url: Full URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The result of ``find_all("li")`` on the page's first ``<ol>``.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            HTTP status.
        ValueError: If the response contains no ``<ol>`` element.
    """
    # A browser-like User-Agent is sent with every request.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 2345Explorer/8.5.1.15355"}
    response = requests.get(url=web_url, headers=header, timeout=timeout)
    # Fail here, with the real HTTP error, instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    listing = soup.find("ol")
    if listing is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError("no <ol> list found in page: %s" % web_url)
    return listing.find_all("li")


# Rank number as rendered in the list item markup, e.g. <em class="">12</em>.
_RANK_RE = re.compile(r'<em class="">(\d+)</em>')

# Characters deleted from the first paragraph of an item: spaces, newlines,
# non-breaking spaces and the individual symbols the original code removed
# one by one as "illegal".
_CHARACTOR_STRIP = str.maketrans("", "", " \n\xa0\xee\xf6\u0161\xf4\xfb\u2027")

_INSERT_SQL = "insert into t_doubantop(num,name,charactor,remark,score)values(%s,%s,%s,%s,%s)"


def _parse_movie(info):
    """Extract one record from a list item, or return None if it is malformed."""
    rank = _RANK_RE.search(str(info))
    title = info.find("span")  # the first <span> of the item holds the title
    detail = info.find("p")
    ratings = info.find_all("span", {"class": "rating_num"})
    if rank is None or title is None or detail is None or not ratings:
        return None

    # Some entries have no one-line quote; store a placeholder for those.
    quotes = info.find_all("span", {"class": "inq"})
    if quotes:
        remark = quotes[0].get_text().replace("\u22ef", "")
    else:
        remark = "此影片没有评价"

    return {
        'num': rank.group(1),
        'name': title.get_text(),
        'charactor': detail.get_text().translate(_CHARACTOR_STRIP),
        'remark': remark,
        'score': ratings[0].get_text(),
    }


def get_info(all_move, connect, cursor):
    """Parse the list items of one page and insert them into ``t_doubantop``.

    Items that lack a rank, title, paragraph or rating are skipped instead
    of aborting the whole page. Each stored record is printed as progress
    output. All rows of the page are committed together at the end, so a
    failure part-way through leaves none of this page's rows committed.

    Args:
        all_move: Iterable of parsed ``<li>`` tags (anything offering
            ``find``, ``find_all`` and ``str()`` like a BeautifulSoup tag).
        connect: Open database connection; only ``commit()`` is called.
        cursor: Cursor of that connection; only ``execute()`` is called.

    Returns:
        List of the record dicts that were inserted, in page order.
    """
    saved = []
    for info in all_move:
        data = _parse_movie(info)
        if data is None:
            continue
        print(data)
        # Parameterized insert: the driver handles quoting of scraped text.
        cursor.execute(
            _INSERT_SQL,
            [data['num'], data['name'], data['charactor'], data['remark'], data['score']],
        )
        saved.append(data)
    # One commit per page instead of one per row.
    connect.commit()
    return saved


if __name__ == "__main__":
    # Scrape every page of the list and store the entries in MySQL.
    connect = connect_db()
    try:
        cursor = connect.cursor()
        try:
            # The list has 250 entries at 25 per page, so the "start"
            # offsets are 0, 25, ..., 225.
            for page in range(0, 250, 25):
                web_url = "https://movie.douban.com/top250?start=%s&filter=" % page
                all_move = get_html(web_url)  # list items of this page
                get_info(all_move, connect, cursor)  # parse and store them
        finally:
            cursor.close()
    finally:
        # Close the connection even when a page fails part-way through.
        connect.close()

三、数据库保存结果

posted @ 2018-01-25 16:40  奔梦  阅读(452)  评论(0)    收藏  举报