# _*_ coding: utf-8 _*_
import codecs
from bs4 import BeautifulSoup
import time, json, math
import sys, os
import asyncio
import aiohttp
import aiofiles
# Shared output file for scraped goods records (one JSON object per line).
f = codecs.open('goods.txt', 'w', encoding='utf-8', errors='ignore')
# Cap concurrent HTTP requests at 5 across all coroutines.
semaphore = asyncio.Semaphore(5)
# Optional uvloop event-loop policy (disabled; uvloop is not imported above).
#asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
async def getHtml(url):
    """Fetch *url* under the module-level concurrency semaphore.

    If the URL ends in ``.jpg`` the response body is saved to disk,
    mirroring the site's path layout relative to the working directory,
    and ``True`` is returned.  Otherwise the response is decoded as
    UTF-8 and the HTML text is returned.
    """
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as html:
                if url.endswith('.jpg'):
                    img = await html.read()
                    imgname = url.replace('http://www.13qh.com/', '')
                    imgpath = os.path.dirname(imgname)
                    # Guard the empty-dirname case: os.makedirs('') raises.
                    if imgpath and not os.path.exists(imgpath):
                        os.makedirs(imgpath)
                    # BUG FIX: the original opened the aiofiles handle and
                    # never closed it; the context manager closes it even
                    # if the write fails.
                    async with aiofiles.open(imgname, 'wb') as fp:
                        await fp.write(img)
                    return True
                else:
                    return await html.text(encoding='utf-8')
async def getList(url, **cat):
    """Fetch one category listing page and scrape every product on it.

    ``cat`` must carry ``lan_id`` and ``sub_id``; both are forwarded to
    ``parse`` so they end up in each goods record.  Parse errors are
    printed and the page is skipped (best-effort crawl).
    """
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        ul = htm.select('.goods-item .goods-pic a')
    except Exception as e:
        print(e)
        ul = None
    # Idiom fix: identity comparison for the None sentinel, not `!=`.
    if ul is not None:
        for li in ul:
            link = li.get('href')
            await parse(link, **cat)
async def parse(url, **cat):
    """Scrape one product page: download its thumbnail/detail images and
    append a JSON record to the module-level ``goods.txt`` handle ``f``.

    ``cat`` must supply ``lan_id`` and ``sub_id``.  Any scraping error is
    printed and the product is skipped (best-effort crawl).
    """
    tmp = await getHtml(url)
    try:
        htm = BeautifulSoup(tmp, 'lxml')
        goods_id = url.split('/')[-1]
        goods_name = htm.select('.goods-title h3')[0].text
        # Kept (though unused below): a missing subtitle element makes this
        # raise and the product is deliberately skipped by the except.
        goods_name_sub = htm.select('.goods-title p')[0].text
        goods_price = htm.select('.goods-info .sale_price')[0].text
        sale_price = htm.select('.goods-info ul li')[0].find('del').text
        # BUG FIX: in Python 3 filter() returns an iterator, which is not
        # JSON-serializable -- json.dumps raised TypeError and the record
        # was silently dropped by the except below.  Join the kept
        # characters back into a plain numeric string.
        sale_price = ''.join(ch for ch in sale_price if ch in '.0123456789')
        thumb_cont = htm.select('.thumb-cont ul li')
        print(goods_name)
        goods_thumb = []
        for thumb in thumb_cont:
            img = thumb.find('img').get('big')
            goods_thumb.append(img)
            print(img)
            await getHtml('http://www.13qh.com' + img)
        detail_div = htm.select('.detail-content p img')
        goods_detail = []
        for p in detail_div:
            goods_detail.append(p.get('src'))
            print(p.get('src'))
            await getHtml('http://www.13qh.com' + p.get('src'))
        goods = {
            'cat_id': cat['lan_id'],
            'sub_id': cat['sub_id'],
            'goods_id': goods_id,
            'goods_name': goods_name,
            'goods_price': goods_price,
            'sale_price': sale_price,
            'goods_thumb': goods_thumb,
            'goods_detail': goods_detail
        }
        f.write(json.dumps(goods) + os.linesep)
    except Exception as e:
        print(e)
async def caiz():
    """Crawl the site's category tree, scrape listing pages 1-19 of every
    leaf category concurrently, and dump the category table to
    ``category.txt``.
    """
    url = 'http://www.13qh.com/'
    tmp = await getHtml(url)
    htm = BeautifulSoup(tmp, 'lxml')
    cat = htm.select('.category-content>ul>li')
    category = []
    tasks = []  # scheduled getList() coroutines, awaited before returning
    for li in cat:
        # Top-level category (parent_id 0).
        lan = li.select('p a')[0]
        lan_text = lan.text
        lan_id = lan.get('href').split('/')[-1]
        category.append({'cat_id': lan_id, 'cat_name': lan_text, 'parent_id': 0})
        ul = li.select('.category-list ul li')
        for u in ul:
            # Second-level ".a" groups.
            for a in u.select('.a'):
                # BUG FIX: select() returns a ResultSet (a list); the
                # original called .text on the list itself, which raises
                # AttributeError.  Iterate the anchors instead.
                for sua in a.select('a'):
                    sua_text = sua.text
                    sua_id = sua.get('href').split('/')[-1]
                    category.append({'cat_id': sua_id, 'cat_name': sua_text, 'parent_id': lan_id})
            # ".b" sub-category header.
            ub = u.select('.b a')[0]
            sub_text = ub.text
            sub_id = ub.get('href').split('/')[-1]
            category.append({'cat_id': sub_id, 'cat_name': sub_text, 'parent_id': lan_id})
            # ".c" leaf categories: schedule 19 listing pages each.
            for c in u.select('.c a'):
                suc_text = c.text
                suc_href = c.get('href')
                suc_id = suc_href.split('/')[-1]
                category.append({'cat_id': suc_id, 'cat_name': suc_text, 'parent_id': sub_id})
                for i in range(1, 20):
                    tasks.append(asyncio.ensure_future(
                        getList("%s/page/%s" % (suc_href, i), lan_id=lan_id, sub_id=sub_id)))
    # BUG FIX: the original never awaited the scheduled tasks, and
    # asyncio.run() cancels anything still pending when this coroutine
    # returns -- so the listing scrapes never completed.
    if tasks:
        await asyncio.gather(*tasks)
    with codecs.open('category.txt', 'w', encoding='utf-8', errors='ignore') as ff:
        ff.write(json.dumps(category))
def main():
    """Entry point: run the crawl and always close the shared output file.

    asyncio.run() creates and tears down its own event loop, so the
    original's unused (and deprecated outside a running loop)
    asyncio.get_event_loop() call is removed.  f.close() moves into a
    finally block so goods.txt is flushed even if the crawl raises.
    """
    try:
        asyncio.run(caiz())
    finally:
        f.close()


if __name__ == '__main__':
    main()