Python 爬取bangumi网页信息

1.数据库连接池

#######db.py##########
import time
import pymysql
import threading
from DBUtils.PooledDB import PooledDB, SharedDBConnection
# Shared MySQL connection pool for the scraper (db.py).
POOL = PooledDB(
    creator=pymysql,  # DB-API module used to create connections
    maxconnections=6,  # max connections allowed in the pool; 0/None = unlimited
    mincached=2,  # idle connections created at startup; 0 = create none
    maxcached=5,  # max idle connections kept in the pool; 0/None = unlimited
    maxshared=3,  # max shared connections; 0/None = share all.  NOTE: effectively unused — pymysql/MySQLdb report threadsafety=1, so the pool's shared-connection limit stays 0 and all connections are shared regardless of this value.
    blocking=True,  # when the pool is exhausted: True = block and wait, False = raise immediately
    maxusage=None,  # max number of reuses per connection; None = unlimited
    setsession=[],  # SQL commands run at session start, e.g. ["set datestyle to ...", "set time zone ..."]
    ping=0,
    # When to ping the MySQL server to check liveness:
    # 0 = None = never, 1 = default = whenever requested, 2 = when a cursor is created,
    # 4 = when a query is executed, 7 = always
    host='127.0.0.1',
    port=3306,
    user='root',
    password='1234',
    database='bgm',
    charset='utf8'
)

2.核心代码

import re
import requests
from bs4 import BeautifulSoup
import pymysql
from db import POOL

from threading import Thread
# Browser-like User-Agent so bangumi.tv serves the normal HTML page.
header={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
# NOTE(review): this single connection/cursor pair is shared by every worker
# thread, and sql_data() calls conn.close() after its first insert — all later
# inserts will run against a closed connection.  Consider acquiring a pooled
# connection per call instead of sharing module-level state.
conn = POOL.connection()

cursor = conn.cursor()
# Parameterized INSERT matching the 7 fields passed by sql_data().
sql = '''insert into bgm_info(CNAME,JNAME,CON,FEN,NUM,IMG_URL,TEXT_URL) VALUES (%s,%s,%s,%s,%s,%s,%s)'''


def sql_data(*args):
    """Insert one anime record into bgm_info.

    *args must supply the 7 values matching the placeholders in `sql`:
    (CNAME, JNAME, CON, FEN, NUM, IMG_URL, TEXT_URL).
    """
    # Bug fix: the original executed on the module-level shared cursor and
    # then called conn.close(), so the first insert returned the shared
    # connection to the pool and every later insert failed.  A shared cursor
    # is also not safe across the worker threads started in __main__.
    # Acquire a dedicated pooled connection per call instead.
    connection = POOL.connection()
    try:
        with connection.cursor() as cur:
            cur.execute(sql, args)
        connection.commit()
    finally:
        # For a PooledDB connection, close() returns it to the pool.
        connection.close()

def get_text(url):
    """Fetch one bangumi ranking page and store every entry via sql_data().

    On a parse failure the page's URL is printed and the rest of the page is
    skipped (best-effort, matching the original behaviour).
    """
    res = requests.get(url, headers=header)
    res.encoding = 'utf-8'
    soup_html = BeautifulSoup(res.text, 'html.parser')
    try:
        # NOTE(review): only rows whose class is exactly "item odd clearit"
        # are matched; if the site also emits "item even clearit" rows, half
        # the entries are skipped — verify against the live HTML.
        all_li = soup_html.find('ul', class_='browserFull').find_all(
            'li', class_='item odd clearit')
        for div in all_li:
            inner_link = div.find('div', class_='inner').find('a')
            c_name = inner_link.get_text()                    # CNAME
            j_name = div.find('small').string                 # JNAME
            con = div.find('p', class_='info tip').string     # CON
            fade = div.find('small', class_='fade').string    # FEN
            num = div.find('span', class_='tip_j').get_text()
            num = re.search(r'\d+', num).group()              # NUM (digits only)
            img_url = 'http:' + div.find('img')['src']        # IMG_URL
            url2 = 'http://bangumi.tv/' + inner_link['href']  # TEXT_URL
            sql_data(c_name, j_name, con, fade, num, img_url, url2)
    except (AttributeError, TypeError, KeyError):
        # A missing element (layout change, empty page, failed regex) aborts
        # this page only.  The original bare `except:` silently swallowed
        # every exception type; catch only the lookup failures the parsing
        # above can raise, and report which URL failed.
        print(url)
if __name__ == '__main__':
    # Spawn one worker per ranking page.  Keep references and join so the
    # main thread waits for an orderly finish instead of fire-and-forget
    # (the original started 500 threads and dropped every handle).
    # NOTE(review): page=0 is requested first — confirm bangumi's pager
    # actually serves a page 0, or start the range at 1.
    threads = []
    for i in range(500):
        url = 'http://bangumi.tv/anime/browser?sort=rank&page=%s' % i
        t = Thread(target=get_text, args=(url,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()

 

posted @ 2018-01-05 17:30  TAMAYURA  阅读(748)  评论(0编辑  收藏  举报