# asyncio crawler: fetch pages, dedupe URLs, store article titles in MySQL
import asyncio
import aiohttp
import aiomysql
import re
from pyquery import PyQuery
stopping = False
start_url = 'http://www.jobbole.com/'
waiting_urls = []
seen_urls = set()
# URL dedup: an in-memory set for now; a Bloom filter scales better for large crawls
sem = asyncio.Semaphore(3)  # cap the number of concurrent fetches
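
# The comment above flags Bloom filters as the scalable alternative to an
# in-memory set. A minimal sketch follows (illustration only: the script
# itself keeps using `seen_urls`; the bit-array size and hash count are
# arbitrary assumptions, not tuned values).
import hashlib

class BloomFilter:
    def __init__(self, size=2 ** 20, hash_count=5):
        self.size = size              # number of bits
        self.hash_count = hash_count  # hash positions per item
        self.bits = bytearray(size // 8)

    def _positions(self, item):
        # Derive `hash_count` bit positions from salted md5 digests.
        for salt in range(self.hash_count):
            digest = hashlib.md5('{}:{}'.format(salt, item).encode()).digest()
            yield int.from_bytes(digest[:8], 'big') % self.size

    def add(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        # May report false positives, never false negatives.
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))
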
async def fetch(url, session):
    # Fetch one page, throttled by the semaphore; returns None on any failure.
    async with sem:
        try:
            async with session.get(url) as resp:
                print(resp.status)
                if resp.status in (200, 201):
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)

# Plain function: HTML parsing is CPU-bound, not I/O-bound, so no coroutine needed
def extract_urls(html):
    urls = []
    pq = PyQuery(html)
    for link in pq.items('a'):
        url = link.attr('href')
        if url and url.startswith('http') and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls

async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    if html:  # fetch() returns None on failure
        extract_urls(html)

async def article_handler(url, session, pool):
    # Fetch an article page, harvest its links, parse the title, and insert it.
    html = await fetch(url, session)
    seen_urls.add(url)
    if html is None:  # fetch failed; nothing to parse or store
        return
    extract_urls(html)
    pq = PyQuery(html)
    title = pq('title').text()
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            # Parameterized query avoids the SQL injection risk of string formatting.
            await cur.execute('INSERT INTO aiomysql_test(title) VALUES (%s)', (title,))
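
# Assumed target table (hypothetical DDL; the original script ships no schema,
# so adjust names and types to your database):
#
#   CREATE TABLE aiomysql_test (
#       id    INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;
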
async def consumer(pool):
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if len(waiting_urls) == 0:
                await asyncio.sleep(0.5)
                continue
            url = waiting_urls.pop()
            print('start get url:{}'.format(url))
            if re.match(r'http://.*?jobbole\.com/\d+/', url):
                # Article detail page: parse it and insert into the database.
                if url not in seen_urls:
                    asyncio.ensure_future(article_handler(url, session, pool))
                    await asyncio.sleep(0.5)  # throttle how fast handlers are scheduled
            else:
                # Listing or other page: just harvest more URLs from it.
                if url not in seen_urls:
                    asyncio.ensure_future(init_urls(url, session))

async def main(loop):
    # Wait for the MySQL connection pool to be ready before crawling.
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='123456',
                                      db='aiomysql_test', loop=loop,
                                      charset='utf8', autocommit=True)
    # Seed the queue from the start page; consumer() opens its own session.
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        if html:
            extract_urls(html)
    asyncio.ensure_future(consumer(pool))

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
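
# Nothing above ever sets `stopping = True`, so the crawler runs until the
# process is killed. One hypothetical shutdown hook (a sketch, not part of
# the original): catch Ctrl-C, flip the flag so consumer() exits its loop,
# then stop the event loop.
#
#     try:
#         loop.run_forever()
#     except KeyboardInterrupt:
#         stopping = True
#         loop.stop()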