# Producer side
from loguru import logger
from redis import StrictRedis


class QuotesProduce:
    name = "start_urls"

    def __init__(self):
        # Connect to Redis; fill in your own host and password.
        self.redis_cli = StrictRedis(
            host="",
            port=6379,
            db=0,
            password="password",
            decode_responses=True,
        )

    def produce(self):
        # Push the ten listing pages onto the "start_urls" list.
        for index in range(1, 11):
            url = "https://quotes.toscrape.com/page/{}/".format(index)
            logger.info(url)
            self.redis_cli.lpush(self.name, url)


if __name__ == '__main__':
    crawler = QuotesProduce()
    crawler.produce()
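
After the producer runs, you can confirm that all ten URLs landed in the queue before starting any consumers. A minimal sketch, assuming the same connection parameters as above (llen and lrange are the standard redis-py list commands):

# Quick check of the queue state; uses the same Redis settings
# as QuotesProduce (host/password left blank here as in the original).
from redis import StrictRedis

redis_cli = StrictRedis(
    host="",
    port=6379,
    db=0,
    password="password",
    decode_responses=True,
)
print(redis_cli.llen("start_urls"))           # expected: 10
print(redis_cli.lrange("start_urls", 0, -1))  # the pushed page URLs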
# Consumer side
import requests
from loguru import logger
from redis import StrictRedis
from parsel import Selector
from concurrent.futures import ThreadPoolExecutor


class QuotesConsume:
    name = "start_urls"

    def __init__(self):
        # Connect to the same Redis instance the producer writes to.
        self.redis_cli = StrictRedis(
            host="",
            port=6379,
            db=0,
            password="password",
            decode_responses=True,
        )
        self.executor = ThreadPoolExecutor(max_workers=10)

    def consume(self):
        while True:
            # brpop blocks until a URL is available and returns a
            # (key, value) tuple, so the URL itself is at index 1.
            url = self.redis_cli.brpop(self.name)
            logger.info("Consumer - popped URL: {}", url[1])
            self.executor.submit(self.spider_task, url[1])

    def spider_task(self, url):
        response = requests.get(url, timeout=10)
        selector = Selector(text=response.text)
        # Each .quote block holds the quote text and the author name.
        for quote in selector.css(".col-md-8 .quote"):
            item = {
                "text": quote.css(".text::text").get(),
                "author": quote.css(".author::text").get(),
            }
            logger.info("Scraped item: {}", item)


if __name__ == '__main__':
    crawler = QuotesConsume()
    crawler.consume()
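
One caveat with this consumer: because spider_task runs inside the thread pool and its Future is never inspected, any exception (timeout, connection error, non-200 response) is silently swallowed and the popped URL is lost. A hedged sketch of a more defensive task is below; the re-queue-on-failure behavior is an assumption added here, not part of the original code:

# A variant of spider_task with basic error handling (a sketch, not
# the original implementation): failed URLs are pushed back onto the
# queue so this or another consumer can retry them later.
def spider_task(self, url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning("Request failed, re-queuing {}: {}", url, exc)
        self.redis_cli.lpush(self.name, url)
        return
    selector = Selector(text=response.text)
    for quote in selector.css(".col-md-8 .quote"):
        item = {
            "text": quote.css(".text::text").get(),
            "author": quote.css(".author::text").get(),
        }
        logger.info("Scraped item: {}", item)

In a real deployment you would also cap the number of retries (for example by tracking attempts in a Redis hash), since an always-failing URL would otherwise cycle through the queue forever.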