第10课-队列、使用多线程和队列的爬虫案例
1、队列代码示例
import threading
import time
from queue import Queue
'''
Queue是线程安全的队列
'''
def set_data(q):
    """Producer demo: feed consecutive integers into *q*, one every 3 seconds.

    ``Queue`` is thread-safe, so no extra locking is needed; ``q.put`` blocks
    once the queue is full. Runs forever — start it on a worker thread.
    """
    counter = 0
    while True:
        q.put(counter)
        time.sleep(3)
        counter = counter + 1
def get_data(q):
    """Consumer demo: pop items off *q* forever and print them.

    ``q.get`` blocks until an item is available, so this thread idles
    cheaply while the queue is empty.
    """
    while True:
        item = q.get()
        print(item)
if __name__ == '__main__':
    # Demo 1: producer/consumer pair sharing a bounded, thread-safe queue.
    q = Queue(4)
    t1 = threading.Thread(target=set_data, args=[q])
    t2 = threading.Thread(target=get_data, args=[q])
    t1.start()
    t2.start()

    # Demo 2: basic Queue API on a fresh single-slot queue. The threads above
    # keep their own reference to the first queue, so rebinding ``q`` is safe.
    q = Queue(1)
    q.put(1)
    q.get(timeout=1)
    print(q.empty())   # True  - the single item was consumed
    # BUG FIX: Queue.full() takes no arguments; the original
    # q.full(timeout=1) raised TypeError at runtime.
    print(q.full())    # False
    print(q.qsize())   # 0
2、斗图爬虫实战
import requests
import threading
from queue import Queue
from lxml import etree
from urllib import request
# While True the downloader keeps polling the queue; the scraper flips it to
# False after the last page so the downloader thread can exit.
g_flag = True
# Browser-like User-Agent so the site does not reject the scripted requests.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
#爬取图片地址
def put_picture_link(q):
    """Scrape pages 1-10 of doutula's article list and enqueue the image
    links found there (taken from the lazy-load ``data-original`` attribute).

    Flips the global ``g_flag`` to False once every page has been processed,
    signalling the downloader thread that no more links are coming.
    """
    global g_flag
    list_url = "http://www.doutula.com/article/list/?page={}"
    for page in range(1, 11):
        response = requests.get(url=list_url.format(page), headers=HEADERS)
        document = etree.HTML(response.text)
        elements = document.xpath(
            '//div[@class="col-sm-9 center-wrap"]/a/div[@class="random_article"]/div/img [@class!="gif"]')
        for element in elements:
            q.put(element.xpath("@data-original")[0])
    g_flag = False
#下载图片
def download(q):
    """Download every image URL pulled from *q* into c://pictures/.

    Keeps running while the scraper is still producing links (``g_flag`` is
    True) or while links remain in the queue, then returns and lets the
    thread finish.
    """
    from queue import Empty  # local import: only this worker needs it

    index = 1
    while g_flag or q.qsize() > 0:
        # BUG FIX: q.get(timeout=1) raises queue.Empty whenever the producer
        # is momentarily slower than this consumer; the uncaught exception
        # used to kill the download thread. Catch it and re-check the loop
        # condition instead.
        try:
            img_link = q.get(timeout=1)
        except Empty:
            continue
        result = requests.get(url=img_link)
        if result.status_code == 200:
            my_picture = result.content
            # File extension taken from the URL itself (jpg/png/gif...).
            append = img_link.split(".")[-1]
            with open("c://pictures/{}.{}".format(index, append), "wb") as fp:
                fp.write(my_picture)
            # Alternative one-liner:
            # request.urlretrieve(url=img_link, filename="c://pictures/{}.{}".format(index, append))
            # Only count successful saves so filenames have no gaps.
            index += 1
if __name__ == '__main__':
    # Bounded queue: the scraper blocks once 10 links are pending, keeping
    # memory flat while the downloader catches up.
    link_queue = Queue(10)
    scraper = threading.Thread(target=put_picture_link, args=[link_queue])
    saver = threading.Thread(target=download, args=[link_queue])
    scraper.start()
    saver.start()
    print("主线程执行完毕!!!")
3、百思不得姐爬虫实战
"""百思不得姐爬虫实战"""
import threading
from lxml import etree
import requests
from queue import Queue
import csv
# Serialises access to the shared csv writer across consumer threads.
g_Lock = threading.Lock()
# True while the producer is still fetching pages; consumers exit once it
# turns False and the content queue has been drained.
g_flag = True
# Site root, prepended to the relative links scraped from each entry.
DOMAIN = "http://www.budejie.com/"
# Browser-like User-Agent so the site does not reject the scripted requests.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
#生产者
class Producer(threading.Thread):
    """Worker thread: pull page URLs off *queue_url*, scrape each page's
    jokes, and push ``{"段子": text, "链接": url}`` dicts onto
    *queue_content* for the Consumer threads to write out.
    """

    def __init__(self, queue_url, queue_content):
        super(Producer, self).__init__()
        self.__queue_url = queue_url          # pages still to be fetched
        self.__queue_content = queue_content  # scraped jokes, ready to write

    def run(self):
        global g_flag
        while self.__queue_url.qsize() > 0:
            url = self.__queue_url.get()
            text = requests.get(url=url, headers=HEADERS).text
            html = etree.HTML(text)
            contents = html.xpath('//div[@class="g-mn"]//div[@class="j-r-list"]//ul//div[@class="j-r-list-c-desc"]/a')
            for c in contents:
                # BUG FIX: the original used r"\u200b", which is the literal
                # six-character string backslash-u-2-0-0-b and never matches
                # anything. "\u200b" is the actual zero-width-space character
                # that the site embeds in joke text and that we want removed.
                content = c.xpath("text()")[0].replace("\u200b", "")
                link = DOMAIN + c.xpath("@href")[0]
                content_dict = {"段子": content, "链接": link}
                self.__queue_content.put(content_dict)
        # Signal the consumers that no more content is coming.
        g_flag = False
        print("-----------------------所有请求已完成---------------")
#消费者
class Consumer(threading.Thread):
    """Worker thread: drain content dicts from *queue_content* and append
    them to the shared csv *writer*, serialised by the global ``g_Lock``.

    Exits once the producer has finished (``g_flag`` is False) and the
    queue has been drained.
    """

    def __init__(self, queue_content, writer, i):
        super(Consumer, self).__init__()
        self.__queue_content = queue_content
        self.__writer = writer  # shared csv.DictWriter; guard with g_Lock
        self.__i = i            # thread number, used in the exit log only

    def run(self):
        print("----dddddddddddddd---")
        while True:
            if self.__queue_content.qsize() > 0 or g_flag:
                try:
                    content_dict = self.__queue_content.get(timeout=1)
                    # BUG FIX: take the lock as a context manager so it is
                    # released even when writerow() raises; the original
                    # bare acquire()/release() pair could leave the lock
                    # held forever and deadlock every other consumer.
                    with g_Lock:
                        self.__writer.writerow(content_dict)
                except Exception as e:
                    # get() timed out (queue momentarily empty) — loop and
                    # re-check the exit condition.
                    print("队列为空{}".format(e))
            else:
                break
        print("线程{}".format(self.__i), g_flag, self.__queue_content.qsize())
if __name__ == '__main__':
    q_url = Queue(100)
    q_content = Queue(100)
    # Seed the work queue with the 24 joke pages to scrape.
    for i in range(1, 25):
        q_url.put("http://www.budejie.com/text/{}".format(i))

    header = ["段子", "链接"]
    # newline="" is required by the csv module so rows are not double-spaced.
    fp = open("text.csv", "w", encoding="utf-8", newline="")
    writer = csv.DictWriter(fp, header)
    writer.writeheader()

    consumers = []
    for i in range(0, 1):
        c = Consumer(q_content, writer, i)
        c.start()
        consumers.append(c)
    p = Producer(q_url, q_content)
    p.start()

    # BUG FIX: the original never closed ``fp``, relying on interpreter
    # shutdown to flush the csv buffer. Join the workers (both terminate on
    # their own once the queues drain), then close the file deterministically.
    p.join()
    for c in consumers:
        c.join()
    fp.close()

浙公网安备 33010602011771号