第三次作业
作业①:
要求:指定一个网站,爬取这个网站中的所有图片,例如中国气象网(http://www.weather.com.cn)。分别使用单线程和多线程的方式爬取。
单线程:
"""Single-threaded image spider: download every <img> on one page."""

import os
import urllib.request
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit


def imageSpider(start_url):
    """Fetch start_url, collect unique <img> src URLs and download each.

    Errors (network, missing src attribute) are printed and skipped so one
    bad image does not abort the whole crawl.
    """
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req).read()
        # Let UnicodeDammit guess between utf-8 and gbk (Chinese pages).
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        soup = BeautifulSoup(dammit.unicode_markup, "lxml")
        for image in soup.select("img"):
            try:
                src = image["src"]
                # Resolve relative srcs against the page URL.
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as error:
                print(error)
    except Exception as error:
        print(error)


def download(url):
    """Save the resource at *url* into IMAGE_DIR with a sequential name.

    Increments the global `count` and prints either a success line or the
    error; failures are deliberately non-fatal.
    """
    global count
    try:
        count = count + 1
        # Derive the extension from the URL path; the original fixed
        # 4-character slice (url[-4:]) missed extensions like ".jpeg".
        ext = os.path.splitext(urlparse(url).path)[1]
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100).read()
        # `with` guarantees the file is closed even if write() raises.
        with open(IMAGE_DIR + str(count) + ext, "wb") as fobj:
            fobj.write(data)
        print("download" + str(count) + ext)
    except Exception as error:
        print(error)


# Destination directory for downloaded images (hoisted from the inline path).
IMAGE_DIR = "C:/Program Files/JetBrains/PyCharm 2020.2.1/images/"
start_url = "http://www.weather.com.cn/weather/101230101.shtml"
headers = {
    "User-Agent": "Mozilla/5.0(Wndows;U;Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/200807421 Minefield/3.0.2pre"
}
count = 0
print("url信息:")
imageSpider(start_url)
多线程:
"""Multi-threaded image spider: one download thread per image URL."""

import os
import threading
import urllib.request
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit


def imageSpider(start_url):
    """Fetch start_url and spawn one download thread per unique <img> URL.

    Threads are collected into the global `threads` list so the main script
    can join them before exiting.
    """
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req).read()
        # Guess between utf-8 and gbk -- the site serves Chinese pages.
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        soup = BeautifulSoup(dammit.unicode_markup, "lxml")
        for image in soup.select("img"):
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    # BUG FIX: the original checked membership but never
                    # appended, so duplicate URLs were downloaded repeatedly.
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    # Thread.setDaemon() is deprecated; set the attribute.
                    T.daemon = False
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    """Save *url* into IMAGE_DIR as <count><ext>; errors are printed only."""
    try:
        # URL-path-based extension handles ".jpeg" etc., unlike the original
        # fixed 4-character slice.
        ext = os.path.splitext(urlparse(url).path)[1]
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100).read()
        # `with` guarantees the file handle is closed even on write failure.
        with open(IMAGE_DIR + str(count) + ext, "wb") as fobj:
            fobj.write(data)
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)


# Destination directory for downloaded images (hoisted from the inline path).
IMAGE_DIR = "C:/Program Files/JetBrains/PyCharm 2020.2.1/images1/"
start_url = "http://www.weather.com.cn/weather/101230101.shtml"
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
count = 0
threads = []
imageSpider(start_url)
# Wait for every download thread before announcing completion.
for t in threads:
    t.join()
print("the end")
心得体会:作业1的完成是书上代码的复现,通过这次的作业巩固了多线程的使用
作业②
要求:使用scrapy框架复现作业①。
spider.py
"""Scrapy spider that collects every <img> URL from the weather page."""

import scrapy
from scrapy.selector import Selector

from ..items import Imageitem


class ImageSpider(scrapy.Spider):
    """Yield one Imageitem per image found on the start page."""

    name = 'spider'
    # Scrapy reads `start_urls` (plural); the original `start_url` was
    # silently ignored, so the spider crawled nothing.
    start_urls = ["http://www.weather.com.cn/weather/101230101.shtml"]

    def parse(self, response):
        """Extract image src attributes and yield them for ImagesPipeline."""
        try:
            data = response.body.decode()
            selector = Selector(text=data)
            # The HTML tag is <img>; the original `//image/@src` matched
            # nothing.
            src_list = selector.xpath('//img/@src').extract()
            for src in src_list:
                item = Imageitem()
                # ImagesPipeline expects a LIST of absolute URLs in the
                # configured field; the original yielded an empty item.
                item['jpg'] = [response.urljoin(src)]
                yield item
        except Exception as err:
            print(err)
item.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Imageitem(scrapy.Item):
    """Item carrying the list of image URLs for ImagesPipeline."""

    # BUG FIX: scrapy.Field() must be CALLED -- the original bare
    # `scrapy.Field` stored the class object, not a field descriptor,
    # so Scrapy would not recognize 'jpg' as an item field.
    jpg = scrapy.Field()
settings.py
# Scrapy settings for demo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'demo'

SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'

# Override the default request headers: send a real browser User-Agent so
# the weather site does not reject the requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

# Obey robots.txt rules
# NOTE(review): robots.txt may disallow image paths -- disable this if
# downloads are silently skipped.
ROBOTSTXT_OBEY = True

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# BUG FIX: the original assigned ITEM_PIPELINES twice (the first entry
# referenced a pipeline from an unrelated 'movie80s' project and was
# overwritten anyway) and misspelled the module path -- it is
# scrapy.pipelines.images (plural), not scrapy.pipelines.image.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# Directory where ImagesPipeline stores downloaded files.  The original file
# also assigned IMAGES_STORE twice; only this last value ever took effect.
IMAGES_STORE = r'C:\exam\demo\demo\image'
# Item field holding the list of image URLs to download.
IMAGES_URLS_FIELD = 'jpg'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'demo.middlewares.DemoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'demo.middlewares.DemoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
作业③:
要求:使用scrapy框架爬取股票相关信息。