Data Collection: Assignment 3
Task 1:
(1) Requirements: Pick a website and crawl all of the images on it, for example the China Weather site (http://www.weather.com.cn). Do this once with a single thread and once with multiple threads.
(2) Code:
Single-threaded version:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request

def imageSpider(start_url):
    try:
        urls = []  # URLs of images already downloaded
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images1 = soup.select("img")
        for image in images1:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    try:
        if url[-4] == ".":  # only download URLs ending in a three-letter extension, e.g. .jpg/.png
            name = url.split("/")[-1]
            req = urllib.request.Request(url, headers=headers)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            # write the image into the output folder, prefixed with "images"
            fobj = open("/Users/dar_z/Desktop/test/images" + name, "wb")
            fobj.write(data)
            fobj.close()
            print("downloaded " + name)
    except Exception as err:
        print(err)

start_url = "http://www.weather.com.cn/weather1d/101010100.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15"}
imageSpider(start_url)
print("end")
Single-threaded results:
Multi-threaded version:
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading

def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []  # URLs of images already downloaded
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, features="html.parser")
        # find every img tag on the page
        images = soup.select("img")
        for image in images:
            try:
                # pull out the image path
                src = image["src"]
                # join the image path onto the page URL
                url = urllib.request.urljoin(start_url, src)
                # skip images that were already downloaded
                if url not in urls:
                    urls.append(url)  # record it, so the duplicate check actually works
                    print(url)
                    count = count + 1
                    # spawn a thread that runs download() with these arguments
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon: the thread is allowed to finish its download
                    T.start()
                    threads.append(T)  # keep the thread so it can be joined later
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    try:
        if url[len(url)-4] == ".":    # three-letter extensions, e.g. xxx.jpg, xxx.png
            ext = url[len(url)-4:]
        elif url[len(url)-5] == ".":  # four-letter extensions, e.g. xxx.jpeg
            ext = url[len(url)-5:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        # create the image file
        fobj = open("/Users/dar_z/Desktop/test/images" + str(count) + ext, "wb")
        # write the bytes out
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

# site to crawl
start_url = "http://www.weather.com.cn"
# request headers that mimic a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15"
}
count = 0      # number of images downloaded so far
threads = []   # every download thread that has been started
imageSpider(start_url)
for t in threads:
    t.join()   # block the main thread until every download thread finishes
print("The end")
Multi-threaded results:
(3) Reflections:
The single-threaded and multi-threaded versions are much the same; the main difference shows up in running time. I still need to study the specifics of that difference further.
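To put a number on that time difference, either version can be wrapped in a simple timer; a minimal sketch (assuming the multi-threaded globals above; for the single-threaded version, drop the join loop):

import time

start = time.perf_counter()
imageSpider(start_url)
for t in threads:  # only needed for the multi-threaded version
    t.join()
print("crawl took %.2f seconds" % (time.perf_counter() - start))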
Task 2:
(1) Requirements: Reproduce Task 1 using the Scrapy framework.
(2) Code:
items.py
import scrapy

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
# browser-like User-Agent header, as a little insurance against being rejected
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
# turn on the built-in images pipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1
}
# directory where downloaded images are saved
IMAGES_STORE = '/Users/dar_z/Desktop/test'
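For reference, the built-in ImagesPipeline names each saved file after the SHA-1 hash of its URL and places it under a full/ subdirectory of IMAGES_STORE. A sketch of the equivalent computation, with extension handling simplified:

import hashlib

def default_image_path(url):
    # Scrapy's ImagesPipeline stores images as full/<sha1(url)>.jpg by default
    return "full/%s.jpg" % hashlib.sha1(url.encode("utf-8")).hexdigest()

print(default_image_path("http://www.weather.com.cn/some/pic.png"))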
spider.py
import scrapy
from car.items import ImageItem

class CarhomeSpider(scrapy.Spider):
    name = 'weather'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']
    download_delay = 1

    def parse(self, response):
        item = ImageItem()
        # collect every image address under .article with a CSS selector
        srcs = response.css('.article img::attr(src)').extract()
        item['image_urls'] = srcs
        yield item
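One caveat: src attributes scraped this way may be relative, and ImagesPipeline only accepts absolute URLs. A variant of parse that normalizes them with response.urljoin (same item and selector, only the join added):

    def parse(self, response):
        item = ImageItem()
        srcs = response.css('.article img::attr(src)').extract()
        # convert any relative src into an absolute URL before handing it to the pipeline
        item['image_urls'] = [response.urljoin(src) for src in srcs]
        yield item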
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_path
        return item
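Note that with the settings shown above, Scrapy runs the built-in ImagesPipeline, so MyImagesPipeline never executes; it also assigns item['image_paths'], a field ImageItem does not declare. A sketch of the two changes that would wire it in (assuming the project module is named car, as the import in the spider suggests):

# in car/items.py: declare the field that item_completed() fills in
image_paths = scrapy.Field()

# in settings.py: register the custom pipeline in place of the built-in one
ITEM_PIPELINES = {
    'car.pipelines.MyImagesPipeline': 1
}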
(3) Results:
Task 3:
(1) Requirements: Use the Scrapy framework to crawl stock information.
(2) Code:
stock.py
import scrapy
from bs4 import BeautifulSoup
import re

class StockSpider(scrapy.Spider):
    name = 'stock'
    # allowed_domains = ['quote.eastmoney.com']
    start_urls = ['http://quote.eastmoney.com/stock_list.html']

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            try:
                # ticker codes embedded in hrefs look like sh600000 / sz000001
                stock = re.search(r"[s][hz]\d{6}", href).group(0)
                stock = stock.upper()
                url = 'https://xueqiu.com/S/' + stock
                yield scrapy.Request(url, callback=self.parse_stock)
            except:
                continue

    def parse_stock(self, response):
        infoDict = {}
        if not response.text:  # nothing to parse in an empty response
            return
        try:
            name = re.search(r'<div class="stock-name">(.*?)</div>', response.text).group(1)
            infoDict.update({'股票名称': str(name)})
            tableHtml = re.search(r'"tableHtml":"(.*?)",', response.text).group(1)
            soup = BeautifulSoup(tableHtml, "html.parser")
            table = soup.table
            for i in table.find_all("td"):
                line = i.text
                l = line.split(":")
                infoDict.update({str(l[0]): str(l[1])})
            yield infoDict
        except:
            print("error")
pipelines.py
class DemoPipeline(object):
    def process_item(self, item, spider):
        return item

class stockPipeline(object):
    def open_spider(self, spider):
        self.f = open('XueQiuStock.txt', 'w')

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        try:
            line = str(dict(item)) + '\n'
            self.f.write(line)
        except:
            pass
        return item
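str(dict(item)) writes Python-repr lines, which are awkward to parse back later. A sketch of a JSON-lines variant of process_item (ensure_ascii=False keeps the Chinese field names readable in the output file):

import json

    def process_item(self, item, spider):
        # one JSON object per line; easier to reload than repr() output
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item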
settings.py
BOT_NAME = 'demo'
SPIDER_MODULES = ['demo.spiders']
NEWSPIDER_MODULE = 'demo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'demo.pipelines.stockPipeline': 300,
}
(3) Results:
(4) Reflections:
I kept running into an error while importing the libraries.