asyncio异步采集小试一下，果然快！

# _*_ coding: utf-8 _*_
import codecs
from bs4 import BeautifulSoup
import time, json, math
import sys, os
import asyncio
import aiohttp
import aiofiles

# Shared output file for scraped goods records (one JSON object per line);
# closed in main() after the crawl finishes.
f = codecs.open('goods.txt', 'w', encoding='utf-8', errors='ignore')
# Caps the number of concurrent HTTP requests at 5.
semaphore = asyncio.Semaphore(5)

#asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

async def getHtml(url):
    """Fetch *url*, holding the module semaphore (max 5 concurrent requests).

    For ``.jpg`` URLs the response body is saved to a local path mirroring
    the URL path and ``True`` is returned; for any other URL the decoded
    HTML text is returned.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                if url.endswith('.jpg'):
                    img = await html.read()
                    # e.g. http://www.13qh.com/a/b.jpg -> a/b.jpg under CWD
                    imgname = url.replace('http://www.13qh.com/', '')
                    imgpath = os.path.dirname(imgname)
                    # Guard against imgpath == '' (root-level image):
                    # os.makedirs('') raises FileNotFoundError.
                    if imgpath and not os.path.exists(imgpath):
                        os.makedirs(imgpath)
                    # Bug fix: the original never closed the aiofiles handle,
                    # leaking one file descriptor per downloaded image.
                    async with aiofiles.open(imgname, 'wb') as fp:
                        await fp.write(img)
                    return True
                else:
                    tmp = await html.text(encoding='utf-8')
                    return tmp

async def getList(url, **cat):
    """Fetch one category listing page and crawl every product link on it.

    ``cat`` (expects ``lan_id`` and ``sub_id``) is forwarded unchanged to
    :func:`parse`. Parse failures are logged and the page is skipped.
    """
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        ul = htm.select('.goods-item .goods-pic a')
    except Exception as e:
        # Best-effort crawl: report the failure and skip this page.
        print(e)
        ul = None
    # Idiom fix: compare against None with `is not`, not `!=` (PEP 8).
    if ul is not None:
        for li in ul:
            link = li.get('href')
            await parse(link, **cat)

async def parse(url, **cat):
    """Scrape one product detail page.

    Downloads the thumbnail and detail images via :func:`getHtml` and
    appends one JSON record to the module-level ``goods.txt`` handle.
    Any scraping error is logged and the record is skipped (best-effort).
    """
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        goods_id = url.split('/')[-1]
        goods_name = htm.select('.goods-title h3')[0].text
        # Selected but not written to the record; also validates page shape.
        goods_name_sub = htm.select('.goods-title p')[0].text
        goods_price = htm.select('.goods-info .sale_price')[0].text
        sale_price  = htm.select('.goods-info ul li')[0].find('del').text
        # Bug fix: in Python 3 filter() returns an iterator, so the original
        # stored a filter object that json.dumps below could not serialize
        # (the TypeError was swallowed and every record was silently lost).
        sale_price  = ''.join(ch for ch in sale_price if ch in '.0123456789')
        thumb_cont  = htm.select('.thumb-cont ul li')
        print(goods_name)
        goods_thumb = []
        for thumb in thumb_cont:
            img = thumb.find('img').get('big')
            goods_thumb.append(img)
            print(img)
            await getHtml('http://www.13qh.com' + img)
        detail_div  = htm.select('.detail-content p img')
        goods_detail = []
        for p in detail_div:
            goods_detail.append(p.get('src'))
            print(p.get('src'))
            await getHtml('http://www.13qh.com' + p.get('src'))
        goods = {
            'cat_id': cat['lan_id'],
            'sub_id': cat['sub_id'],
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_price': goods_price,
            'sale_price' : sale_price,
            'goods_thumb': goods_thumb,
            'goods_detail': goods_detail
        }
        # '\n' rather than os.linesep: a single portable separator keeps the
        # output valid JSON-lines on every platform.
        f.write(json.dumps(goods) + '\n')
    except Exception as e:
        print(e)

async def caiz():
    """Entry coroutine: crawl the homepage category tree, write it to
    ``category.txt``, and scrape the first 19 listing pages of every
    leaf category concurrently.
    """
    url = 'http://www.13qh.com/'
    tmp = await getHtml(url)
    htm = BeautifulSoup(tmp, 'lxml')
    cat = htm.select('.category-content>ul>li')

    category = []
    tasks = []  # listing-page scrape tasks, awaited together at the end
    for li in cat:
        lan = li.select('p a')[0]
        lan_text = lan.text
        lan_id   = lan.get('href').split('/')[-1]
        category.append({'cat_id': lan_id, 'cat_name': lan_text, 'parent_id': 0})

        ul  = li.select('.category-list ul li')
        for u in ul:
            ua = u.select('.a')
            for a in ua:
                # Bug fix: select() returns a ResultSet (a list); the
                # original called .text/.get on the list itself, which
                # raises AttributeError. Use the first matched anchor.
                sua = a.select('a')[0]
                sua_text = sua.text
                sua_id   = sua.get('href').split('/')[-1]
                category.append({'cat_id': sua_id, 'cat_name': sua_text, 'parent_id': lan_id})

            ub = u.select('.b a')[0]
            sub_text = ub.text
            sub_id   = ub.get('href').split('/')[-1]
            category.append({'cat_id': sub_id, 'cat_name': sub_text, 'parent_id': lan_id})

            uc = u.select('.c a')
            for c in uc:
                suc_text = c.text
                suc_href = c.get('href')
                suc_id   = suc_href.split('/')[-1]
                category.append({'cat_id': suc_id, 'cat_name': suc_text, 'parent_id': sub_id})

                for i in range(1, 20):
                    # Bug fix: keep the task references. Bare ensure_future
                    # tasks are cancelled when asyncio.run()'s main coroutine
                    # returns, so no listing page was ever fully scraped.
                    tasks.append(asyncio.ensure_future(
                        getList("%s/page/%s" % (suc_href, i), lan_id=lan_id, sub_id=sub_id)))
    with codecs.open('category.txt', 'w', encoding='utf-8', errors='ignore') as ff:
        ff.write(json.dumps(category))
    if tasks:
        await asyncio.gather(*tasks)

def main():
    """Run the crawler and guarantee the goods file is flushed and closed.

    The original fetched an event loop via the deprecated
    ``asyncio.get_event_loop()`` and never used it; ``asyncio.run``
    manages its own loop, so that line is removed.
    """
    try:
        asyncio.run(caiz())
    finally:
        # Close even if the crawl raises, so partial results are flushed.
        f.close()

if __name__ == '__main__':
    main()

 

posted @ 2020-12-03 12:47  1553  阅读(75)  评论(0编辑  收藏  举报