Homework 3

Task ①

(1) Requirement: Pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Crawl it in two ways: single-threaded and multi-threaded.

Single-threaded:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import urllib.parse

def imageSpider(start_url):
    try:
        urls=[]
        req=urllib.request.Request(start_url,headers=headers)
        data=urllib.request.urlopen(req)
        data=data.read()
        dammit=UnicodeDammit(data,["utf-8","gbk"])
        data=dammit.unicode_markup
        soup=BeautifulSoup(data,"html.parser")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    global count
    try:
        count=count+1
        # extract the file extension (works for three-character suffixes such as .jpg or .png)
        if(url[len(url)-4]=="."):
            ext=url[len(url)-4:]
        else:
            ext=""
        req=urllib.request.Request(url,headers=headers)
        data=urllib.request.urlopen(req,timeout=100)
        data=data.read()
        fobj=open("E:\\wangluopachong_3\\images\\"+str(count)+ext,"wb")
        fobj.write(data)
        fobj.close()
        print("downloaded "+str(count)+ext)
    except Exception as err:
        print(err)

start_url="http://www.weather.com.cn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
count=0
imageSpider(start_url)

Output:


Multi-threaded:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import urllib.parse
import threading

def imageSpider(start_url):
    global threads
    global count
    try:
        urls=[]
        req=urllib.request.Request(start_url,headers=headers)
        data=urllib.request.urlopen(req)
        data=data.read()
        dammit=UnicodeDammit(data,["utf-8","gbk"])
        data=dammit.unicode_markup
        soup = BeautifulSoup(data, "html.parser")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)  # remember the URL so duplicates are not downloaded again
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url,count):
    try:
        if(url[len(url)-4]=="."):
            ext=url[len(url)-4:]
        else:
            ext=""
        req=urllib.request.Request(url,headers=headers)
        data=urllib.request.urlopen(req,timeout=100)
        data=data.read()
        fobj=open("E:\\wangluopachong_3\\images_2\\"+str(count)+ext,"wb")
        fobj.write(data)
        fobj.close()
        print("downloaded "+str(count)+ext)
    except Exception as err:
        print(err)

start_url="http://www.weather.com.cn/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre)\
Gecko/2008072421 Minefield/3.0.2pre"}
count=0
threads=[]
imageSpider(start_url)
for t in threads:
    t.join()
print("The End")

Output:


(2) Reflections:

Using multithreading raised the download efficiency, since images are fetched concurrently rather than one after another.
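For comparison only, the same fan-out can also be written with a bounded thread pool from the standard library. This is a sketch under the assumption that the download(url, count) function and a de-duplicated URL list from the crawler above are available; download_all and the pool size of 8 are my own choices, not part of the submitted code.

from concurrent.futures import ThreadPoolExecutor

def download_all(urls):
    # Fetch at most 8 images concurrently; leaving the with-block waits for
    # every submitted download to finish. download(url, count) is assumed to
    # be the function defined in the multi-threaded crawler above.
    with ThreadPoolExecutor(max_workers=8) as executor:
        for count, url in enumerate(urls, start=1):
            executor.submit(download, url, count)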

Task ②

(1) Requirement: Reproduce Task ① using the Scrapy framework.

The crawler code

Writing the spider (MySpider.py)

import scrapy
from ..items import TianqiItem

class MySpider(scrapy.spiders.Spider):
    name = "MySpider"                     # the spider's name

    allowed_domains = ["weather.com.cn"]  # domain restriction: the spider only follows pages under this domain

    start_urls = ["http://www.weather.com.cn/"]

    # The method must be called parse: it is the default callback name in Scrapy's source code
    def parse(self, response):
        src_list = response.xpath('//@src').extract()
        for src in src_list:
            item = TianqiItem()
            # ImagesPipeline needs absolute URLs, so join relative src values with the page URL
            item['image_url'] = [response.urljoin(src)]
            yield item

Writing the item class (items.py)

import scrapy
class TianqiItem(scrapy.Item):
    # define the fields for your item here like:
    image_url=scrapy.Field()

Configuring settings (settings.py)

ITEM_PIPELINES = {
#    'tianqi_scrapy.pipelines.TianqiScrapyPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline':1
}
IMAGES_STORE = r'E:\wangluopachong_3\images_scrapy'  # raw string so the backslashes are kept literally
IMAGES_URLS_FIELD = 'image_url'
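One practical note: Scrapy's ImagesPipeline depends on the Pillow library to process and store the downloaded images, so Pillow has to be installed in the environment (for example with pip install pillow) before this configuration works.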

Output:


(2) Reflections:

This task gave me a first look at the Scrapy framework. Writing the project deepened my understanding of what the four files settings.py, MySpider.py, items.py and pipelines.py are for and how to write them, and it also showed me how powerful Scrapy is.
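For reference, a minimal way to launch the spider from inside the project is a small run script. This is only a sketch: the file name run.py and placing it next to scrapy.cfg are my own assumptions, while the spider name MySpider is the one defined above.

from scrapy import cmdline

# Equivalent to running "scrapy crawl MySpider" from the project root
cmdline.execute(['scrapy', 'crawl', 'MySpider'])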

Task ③

(1) Requirement: Use the Scrapy framework to crawl stock-related information.

Writing the spider

import json
import scrapy
from ..items import gupiaoItem
class gupiao(scrapy.Spider):
    name = 'gupiao'
    start_urls = ["http://82.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112404444853921220622_1603172260738&pn=1&pz=20&po=1&np=1&"
                  "ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields="
                  "f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1603172260739"]

    def parse(self, response):
        # Strip the JSONP wrapper (callback name and the surrounding parentheses) to get plain JSON
        text = response.text
        data = json.loads(text[text.find('(') + 1:text.rfind(')')])
        for i in data['data']['diff']:
            item = gupiaoItem()
            item["daima"] = i['f12']
            item["name"] = i['f14']
            item["new"] = i['f2']
            item["zangfu"] =i['f3']
            item["e"] = i['f4']
            item["chengjiao"] = i['f5']
            item["jiaoe"]=i['f6']
            item["zhenfu"] = i['f7']
            item["max"] =i['f15']
            item["min"] = i['f16']
            item["today"] = i['f17']
            item["ye"] = i['f18']
            yield item
        print("finish!!!")

Writing the item class

import scrapy
class gupiaoItem(scrapy.Item):
    # define the fields for your item here like:
    daima = scrapy.Field()
    name = scrapy.Field()
    new = scrapy.Field()
    zangfu = scrapy.Field()
    e = scrapy.Field()
    chengjiao = scrapy.Field()
    jiaoe = scrapy.Field()
    zhenfu = scrapy.Field()
    max = scrapy.Field()
    min = scrapy.Field()
    today = scrapy.Field()
    ye = scrapy.Field()

Writing the pipeline class

import csv

class ScrapyGupiaoPipeline(object):
    def open_spider(self, spider):
        # Write the CSV header row once, when the spider starts
        with open('gu.csv', 'a', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["代码", "名称", "最新价格", "涨跌额", "涨跌幅", "成交量",
                             "成交额", "振幅", "最高", "最低", "今开", "昨收"])

    def process_item(self, item, spider):
        # Append one row per stock item
        with open('gu.csv', 'a', encoding='utf-8', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([item["daima"], item["name"], item["new"], item["zangfu"], item["e"],
                             item["chengjiao"], item["jiaoe"], item["zhenfu"], item["max"],
                             item["min"], item["today"], item["ye"]])
        return item

Configuring settings

ITEM_PIPELINES = {
   'demo.pipelines.ScrapyGupiaoPipeline': 300,
}

Output:


(2) Reflections:

While running the Scrapy crawler I ran into "Forbidden by robots.txt". Looking it up showed the fix: turn off Scrapy's built-in ROBOTSTXT_OBEY option by finding that variable in settings.py and setting it to False. Crawling the stock data with Scrapy gave me a better understanding of the framework and made me noticeably more comfortable with it.
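Concretely, the settings.py change described above is a single line; with it in place the spider is started with scrapy crawl gupiao (the name defined in the spider class) and the pipeline appends each record to gu.csv.

# settings.py: stop Scrapy from honouring robots.txt, which was blocking the eastmoney API request
ROBOTSTXT_OBEY = False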
