Assignment 3

Assignment ①:

Requirement: pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Crawl it both single-threaded and multi-threaded.

Single-threaded version:

 

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
def imageSpider(start_url):
    # Fetch the start page, collect every <img> src, and download each new image
    try:
        urls=[]
        req=urllib.request.Request(start_url,headers=headers)
        data=urllib.request.urlopen(req)
        data=data.read()
        dammit=UnicodeDammit(data,["utf-8","gbk"])
        data=dammit.unicode_markup
        soup=BeautifulSoup(data,"lxml")
        images=soup.select("img")
        for image in images:
            try:
                src=image["src"]
                url=urllib.request.urljoin(start_url,src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as error:
              print(error)
    except Exception as error:
      print(error)
def download(url):
    # Save a single image to disk, numbering files with the global counter
    global count
    try:
        count=count+1
        # Keep the extension only when the URL ends with one (e.g. ".jpg", ".png")
        if(url[len(url)-4]=="."):
            ext=url[len(url)-4:]
        else:
            ext=""
        req=urllib.request.Request(url,headers=headers)
        data=urllib.request.urlopen(req,timeout=100)
        data=data.read()
        # Output folder (must already exist); files are saved as 1.jpg, 2.png, ...
        fobj=open("C:/Program Files/JetBrains/PyCharm 2020.2.1/images/"+str(count)+ext,"wb")
        fobj.write(data)
        fobj.close()
        print("download"+str(count)+ext)

    except Exception as error:
        print(error)
start_url="http://www.weather.com.cn/weather/101230101.shtml"
headers={
      "User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre" }
count=0
print("url信息:")
imageSpider(start_url)
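
One caveat in download(): checking url[len(url)-4] only recognizes three-letter extensions such as .jpg or .png, and it breaks when the URL carries a query string. A small helper like the following (a sketch, not part of the original code, using only the standard library) would be more tolerant:

from urllib.parse import urlparse
import os.path

def get_ext(url):
    # Drop any "?query=..." part, then split off the extension ("" if there is none)
    path = urlparse(url).path
    return os.path.splitext(path)[1]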


Multi-threaded version:

 

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading


def imageSpider(start_url):
    # Parse the start page and spawn one download thread per new image URL
    global threads
    global count
    try:
        urls=[]
        req=urllib.request.Request(start_url,headers=headers)
        data=urllib.request.urlopen(req)
        data=data.read()
        dammit=UnicodeDammit(data,["utf-8","gbk"])
        data=dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src=image["src"]
                url=urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)  # remember the URL so each image is downloaded only once
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download,args=(url,count))
                    T.setDaemon(False)
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url,count):
    # Save a single image; count is passed in so each thread writes to its own file
    try:
        if(url[len(url)-4]=="."):
                ext=url[len(url)-4:]
        else:
            ext=""
        req=urllib.request.Request(url,headers=headers)
        data=urllib.request.urlopen(req,timeout=100)
        data=data.read()
        fobj=open("C:/Program Files/JetBrains/PyCharm 2020.2.1/images1/"+str(count)+ext,"wb")
        fobj.write(data)
        fobj.close()
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)
start_url="http://www.weather.com.cn/weather/101230101.shtml"
headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre)Gecko/2008072421 Minefield/3.0.2pre"}
count=0
threads=[]

imageSpider(start_url)
for t in threads:
    t.join()
print("the end")


Reflections: Assignment ① was a reproduction of the code from the textbook; completing it consolidated my grasp of multithreading.

Assignment ②

Requirement: reproduce Assignment ① using the Scrapy framework.

 spider.py

import scrapy
from ..items import Imageitem


class ImageSpider(scrapy.Spider):
    name = 'spider'
    start_urls = ["http://www.weather.com.cn/weather/101230101.shtml"]

    # Collect every <img> src on the page and hand it to the ImagesPipeline via the item
    def parse(self, response):
        try:
            src_list = response.xpath('//img/@src').extract()
            for src in src_list:
                item = Imageitem()
                item['jpg'] = [response.urljoin(src)]  # the pipeline expects a list of absolute URLs
                yield item
        except Exception as err:
            print(err)

items.py

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy



class Imageitem(scrapy.Item):
    # List of image URLs downloaded by ImagesPipeline (mapped via IMAGES_URLS_FIELD in settings.py)
    jpg = scrapy.Field()

settings.py

# Scrapy settings for demo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

BOT_NAME = 'demo'

SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'demo (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'demo.middlewares.DemoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'demo.middlewares.DemoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
#    'demo.pipelines.DemoPipeline': 300,
      'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE=r'C:\exam\demo\demo\image'
IMAGES_URLS_FIELD='jpg'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
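
run.py (a small launcher so the spider can be started from the IDE; this file is not part of the original post and assumes the spider name 'spider' defined above):

from scrapy import cmdline

# Equivalent to running "scrapy crawl spider" from the project directory
cmdline.execute("scrapy crawl spider".split())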


Assignment ③:

Requirement: use the Scrapy framework to crawl stock-related information.
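
A structural sketch of how such a spider could be laid out (the start URL, XPath selectors, and field names below are placeholders for illustration only, not the actual solution to this assignment):

import scrapy

class StockItem(scrapy.Item):
    code = scrapy.Field()    # stock code
    name = scrapy.Field()    # stock name
    price = scrapy.Field()   # latest price

class StockSpider(scrapy.Spider):
    name = "stocks"
    start_urls = ["http://example.com/stock-list"]   # placeholder URL

    def parse(self, response):
        # Placeholder selectors: adjust them to the real page structure
        for row in response.xpath("//table//tr"):
            item = StockItem()
            item["code"] = row.xpath("./td[1]/text()").get()
            item["name"] = row.xpath("./td[2]/text()").get()
            item["price"] = row.xpath("./td[3]/text()").get()
            yield item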

 
