大二下学期团队项目(爬取豆瓣电影)

今日爬取了豆瓣的一部分电影信息,大约2000个:

import string
import time
import traceback

import pymysql
import requests
import re

from lxml import etree
import random

from bs4 import BeautifulSoup
from flask import json

def get_conn():
    """
    :return: 连接,游标192.168.1.102
    """
    # 创建连接
    conn = pymysql.connect(host="*",
                    user="root",
                    password="root",
                    db="*",
                    charset="utf8")
    # 创建游标
    cursor = conn.cursor()  # 执行完毕返回的结果集默认以元组显示
    return conn, cursor

def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()

def query(sql,*args):
    """
    封装通用查询
    :param sql:
    :param args:
    :return: 返回查询结果以((),(),)形式
    """
    conn,cursor = get_conn();
    cursor.execute(sql)
    res=cursor.fetchall()
    close_conn(conn,cursor)
    return res

def get_tencent_data():
    #豆瓣的网址
    url_bean = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }
    a=1
    num=0
    cursor = None
    conn = None
    conn, cursor = get_conn()
    while a<=100:
        num_str='%d'%num
        num=num+20
        a=a+1;
        # 获取豆瓣页面电影数据
        r = requests.get(url_bean + num_str, headers=headers)
        res_bean = json.loads(r.text);
        data_bean = res_bean["data"]
        print(f"{time.asctime()}开始插入数据",(a-1))
        #循环遍历电影数据
        try:
            for i in data_bean:


                #分配数据
                score = i["rate"]
                director = i["directors"]  # []
                director_str = ""
                for j in director:
                    director_str = director_str + " " + j
                name = i["title"]
                img = i["cover"]
                star = i["casts"]  # []
                star_str = ""
                for j in star:
                    star_str = star_str + " " + j
                # 分配数据

                # 获取电影详细数据的网址
                url_details = i["url"]
                r = requests.get(url_details, headers=headers)
                soup_bean = BeautifulSoup(r.text,"lxml")
                #获取详细数据
                span = soup_bean.find_all("span", {"property": "v:genre"})
                type = ""
                for i in span:
                    type = type + " " + i.text
                span = soup_bean.find_all("span", {"property": "v:runtime"})
                timelen = span[0].text
                span = soup_bean.find_all("span", {"property": "v:initialReleaseDate"})
                date = span[0].text
                span = soup_bean.find("a", {"class", "rating_people"})
                scorenum = span.text
                span = soup_bean.find("span", {"property": "v:summary"})
                summary = span.text.replace(" ", "")#将空格去掉
                # 获取详细数据

                sql = "insert into test_bean values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql, [name, star_str, director_str, type, "", date, summary, score, "", img, scorenum,
                                     timelen])
            conn.commit()  # 提交事务 update delete insert操作 //*[@id="info"]/text()[2]
        except:
            traceback.print_exc()
        print(f"{time.asctime()}插入数据完毕",(a-1))#循环了几次
    close_conn(conn, cursor)
    print(f"{time.asctime()}所有数据插入完毕")

if __name__ == "__main__":
   get_tencent_data()

 

posted @ 2021-04-23 22:14  风吹过半夏  阅读(48)  评论(0编辑  收藏  举报