Data Collection: Assignment 3
Task 1:
(1) Requirements: Pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Do this once with a single thread and once with multiple threads.
(2) Code:
Single-threaded version:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request

def imageSpider(start_url):
    try:
        urls = []  # URLs of images already downloaded
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images1 = soup.select("img")
        for image in images1:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    try:
        if url[-4] == ".":  # only download URLs ending in a three-letter extension, e.g. .jpg/.png
            name = url.split("/")[-1]
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            # write the image into the output folder, prefixed with "images"
            fobj = open("/Users/dar_z/Desktop/test/images" + name, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + name)
    except Exception as err:
        print(err)

start_url = "http://www.weather.com.cn/weather1d/101010100.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15"}
imageSpider(start_url)
print("end")
Single-threaded results:
Multi-threaded version:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading

def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []  # URLs of images already downloaded
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, features="html.parser")
        # find every img tag on the page
        images = soup.select("img")
        for image in images:
            try:
                # pull out the image path
                src = image["src"]
                # join the image path onto the page URL
                url = urllib.request.urljoin(start_url, src)
                # skip images that were already downloaded
                if url not in urls:
                    urls.append(url)  # record it, so the duplicate check actually works
                    print(url)
                    count = count + 1
                    # spawn a thread that runs download() with these arguments
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon: the thread is allowed to finish its download
                    T.start()
                    threads.append(T)  # keep the thread so it can be joined later
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    try:
        if url[len(url)-4] == ".":    # three-letter extensions, e.g. xxx.jpg, xxx.png
            ext = url[len(url)-4:]
        elif url[len(url)-5] == ".":  # four-letter extensions, e.g. xxx.jpeg
            ext = url[len(url)-5:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        # create the image file
        fobj = open("/Users/dar_z/Desktop/test/images" + str(count) + ext, "wb")
        # write the bytes out
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

# site to crawl
start_url = "http://www.weather.com.cn"
# request headers that mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15"
}
count = 0      # number of images downloaded so far
threads = []   # every download thread that has been started
imageSpider(start_url)
for t in threads:
    t.join()   # block the main thread until every download thread finishes
print("The end")
Multi-threaded results:
(3) Reflections:
The single-threaded and multi-threaded versions are much the same; the main difference shows up in running time. I still need to study the specifics of that difference further.
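To put a number on that time difference, either version can be wrapped in a simple timer; a minimal sketch (assuming the multi-threaded globals above; for the single-threaded version, drop the join loop):

import time

start = time.perf_counter()
imageSpider(start_url)
for t in threads:  # only needed for the multi-threaded version
    t.join()
print("crawl took %.2f seconds" % (time.perf_counter() - start))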
Task 2:
(1) Requirements: Reproduce Task 1 using the Scrapy framework.
(2) Code:
items.py
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
# browser-like User-Agent header, as a little insurance against being rejected
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# turn on the built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where downloaded images are saved
IMAGES_STORE = '/Users/dar_z/Desktop/test'
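For reference, the built-in ImagesPipeline names each saved file after the SHA-1 hash of its URL and places it under a full/ subdirectory of IMAGES_STORE. A sketch of the equivalent computation, with extension handling simplified:

import hashlib

def default_image_path(url):
    # Scrapy's ImagesPipeline stores images as full/<sha1(url)>.jpg by default
    return "full/%s.jpg" % hashlib.sha1(url.encode("utf-8")).hexdigest()

print(default_image_path("http://www.weather.com.cn/some/pic.png"))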
spider.py
import scrapy
from car.items import ImageItem

class CarhomeSpider(scrapy.Spider):
    name = 'weather'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']
    download_delay = 1

    def parse(self, response):
        item = ImageItem()
        # collect every image address under .article with a CSS selector
        srcs = response.css('.article img::attr(src)').extract()
        item['image_urls'] = srcs
        yield item
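One caveat: src attributes scraped this way may be relative, and ImagesPipeline only accepts absolute URLs. A variant of parse that normalizes them with response.urljoin (same item and selector, only the join added):

    def parse(self, response):
        item = ImageItem()
        srcs = response.css('.article img::attr(src)').extract()
        # convert any relative src into an absolute URL before handing it to the pipeline
        item['image_urls'] = [response.urljoin(src) for src in srcs]
        yield item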
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_path
        return item
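Note that with the settings shown above, Scrapy runs the built-in ImagesPipeline, so MyImagesPipeline never executes; it also assigns item['image_paths'], a field ImageItem does not declare. A sketch of the two changes that would wire it in (assuming the project module is named car, as the import in the spider suggests):

# in car/items.py: declare the field that item_completed() fills in
image_paths = scrapy.Field()

# in settings.py: register the custom pipeline in place of the built-in one
ITEM_PIPELINES = {
    'car.pipelines.MyImagesPipeline': 1
}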
(3) Results:
Task 3:
(1) Requirements: Use the Scrapy framework to crawl stock information.
(2) Code:
stock.py
import scrapy
from bs4 import BeautifulSoup
import re

class StockSpider(scrapy.Spider):
    name = 'stock'
    # allowed_domains = ['quote.eastmoney.com']
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            try:
                # ticker codes embedded in hrefs look like sh600000 / sz000001
                stock = re.search(r"[s][hz]\d{6}", href).group(0)
                stock = stock.upper()
                url = 'https://xueqiu.com/S/' + stock
                yield scrapy.Request(url, callback=self.parse_stock)
            except:
                continue

    def parse_stock(self, response):
        infoDict = {}
        if not response.text:  # nothing to parse in an empty response
            return
        try:
            name = re.search(r'<div class="stock-name">(.*?)</div>', response.text).group(1)
            infoDict.update({'股票名称': str(name)})
            tableHtml = re.search(r'"tableHtml":"(.*?)",', response.text).group(1)
            soup = BeautifulSoup(tableHtml, "html.parser")
            table = soup.table
            for i in table.find_all("td"):
                line = i.text
                l = line.split(":")
                infoDict.update({str(l[0]): str(l[1])})
            yield infoDict
        except:
            print("error")
pipelines.py
class DemoPipeline(object):
    def process_item(self, item, spider):
        return item

class stockPipeline(object):
    def open_spider(self, spider):
        self.f = open('XueQiuStock.txt', 'w')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except:
            pass
        return item
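str(dict(item)) writes Python-repr lines, which are awkward to parse back later. A sketch of a JSON-lines variant of process_item (ensure_ascii=False keeps the Chinese field names readable in the output file):

import json

    def process_item(self, item, spider):
        # one JSON object per line; easier to reload than repr() output
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item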
settings.py
BOT_NAME = 'demo'
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'demo.pipelines.stockPipeline': 300,
}
(3) Results:
(4) Reflections:
I kept running into an error while importing the libraries.