第三次作业

作业①:

要求：爬取中国气象网（http://www.weather.com.cn）的图片。分别使用单线程和多线程的方式爬取。

代码：1.单线程运行：

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
def imageSpider(start_url):
    """Fetch a page and download every distinct image its <img> tags reference.

    Args:
        start_url: URL of the page to scan. Relative ``src`` values are
            resolved against it.

    The request uses the module-level ``headers`` dict and each image is
    saved by ``download(url)``. Errors are printed rather than raised so
    that one bad image (or an unreachable page) does not abort the crawl.
    """
    from urllib.parse import urljoin  # public home of urljoin

    try:
        req = urllib.request.Request(start_url, headers=headers)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
        # Let UnicodeDammit pick between UTF-8 and GBK for decoding.
        markup = UnicodeDammit(raw, ["utf-8", "gbk"]).unicode_markup
        soup = BeautifulSoup(markup, "lxml")
        seen = set()  # absolute URLs already handed to download()
        for image in soup.select("img"):
            try:
                src = image.get("src")
                if not src:
                    # <img> without a usable src: nothing to download.
                    continue
                url = urljoin(start_url, src)
                if url in seen:
                    continue
                seen.add(url)
                print(url)
                download(url)
            except Exception as err:
                # Best effort: report the failure and go on with the next image.
                print(err)
    except Exception as err:
        print(err)

def download(url):
    """Download one image into the ``images`` directory.

    The file is named ``<count><ext>``: ``count`` is the module-level
    counter, incremented on every call (failed downloads included), and
    ``ext`` is the extension of the URL's path component, or an empty
    string when the path has none.

    Args:
        url: Absolute URL of the image.

    The request uses the module-level ``headers`` dict. Errors are printed
    rather than raised.
    """
    global count
    import os
    from urllib.parse import urlparse

    try:
        count = count + 1
        # Take the extension from the path only, so query strings and
        # extensions that are not exactly three characters are handled.
        ext = os.path.splitext(urlparse(url).path)[1]
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=100) as resp:
            data = resp.read()
        # Create the target directory on first use instead of failing.
        os.makedirs("images", exist_ok=True)
        with open(os.path.join("images", str(count) + ext), "wb") as fobj:
            fobj.write(data)
        print("downloaded" + str(count) + ext)
    except Exception as err:
        print(err)

# Page whose <img> tags are crawled.
start_url = "http://weather.com.cn/weather/101280601.shtml"
# Headers sent with every HTTP request (a browser-like User-Agent).
headers={"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
# Running number that download() increments and uses to name saved files.
count = 0
# Start the crawl; headers and count must already be defined at this point.
imageSpider(start_url)

结果：

...？这是什么东西？

仔细检查之后发现是我之前抄书都抄错了，把

ext = url[len(url)-4:]

抄成了

ext = url[len(url)-4]

修正后正常输出：

代码：2.多线程运行：

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading

def imageSpider(start_url):
    """Fetch a page and download each distinct <img> image in its own thread.

    Args:
        start_url: URL of the page to scan. Relative ``src`` values are
            resolved against it.

    For every new image URL the module-level ``count`` is incremented and a
    non-daemon thread running ``download(url, count)`` is started and
    appended to the module-level ``thread`` list, so the caller can join
    the workers. The request uses the module-level ``headers`` dict. Errors
    are printed rather than raised.
    """
    global thread
    global count
    from urllib.parse import urljoin  # public home of urljoin

    try:
        req = urllib.request.Request(start_url, headers=headers)
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(req) as resp:
            raw = resp.read()
        # Let UnicodeDammit pick between UTF-8 and GBK for decoding.
        markup = UnicodeDammit(raw, ["utf-8", "gbk"]).unicode_markup
        soup = BeautifulSoup(markup, "html.parser")
        seen = set()  # absolute URLs that already have a worker thread
        for image in soup.select("img"):
            try:
                src = image.get("src")
                if not src:
                    # <img> without a usable src: nothing to download.
                    continue
                url = urljoin(start_url, src)
                if url in seen:
                    continue
                seen.add(url)
                print(url)
                count = count + 1
                # daemon=False replaces the deprecated Thread.setDaemon(False).
                worker = threading.Thread(
                    target=download, args=(url, count), daemon=False
                )
                worker.start()
                thread.append(worker)
            except Exception as err:
                # Best effort: report the failure and go on with the next image.
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    """Download one image into the ``images2`` directory as ``<count><ext>``.

    Args:
        url: Absolute URL of the image.
        count: Number assigned to this image by the caller. It is used
            as-is for the file name, so file numbers match the caller's
            counter.

    ``ext`` is the extension of the URL's path component, or an empty
    string when the path has none. The request uses the module-level
    ``headers`` dict. Errors are printed rather than raised, so a failing
    download only affects its own thread.
    """
    import os
    from urllib.parse import urlparse

    try:
        # Take the extension from the path only, so query strings and
        # extensions that are not exactly three characters are handled.
        ext = os.path.splitext(urlparse(url).path)[1]
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=100) as resp:
            data = resp.read()
        # exist_ok=True keeps this safe when several threads get here at once.
        os.makedirs("images2", exist_ok=True)
        with open(os.path.join("images2", str(count) + ext), "wb") as fobj:
            fobj.write(data)
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)
# Page whose <img> tags are crawled.
start_url = "http://weather.com.cn/weather/101280601.shtml"
# Headers sent with every HTTP request (a browser-like User-Agent).
headers = {"User-Agent":"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
# Counter that imageSpider() increments for each image it schedules.
count = 0
# Worker threads started by imageSpider(); they are joined below.
thread = []


imageSpider(start_url)
# Wait for every download thread to finish before announcing completion.
for t in thread:
    t.join()
print("the End")

结果：

看起来很正常，很多线程......嗯？

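仔细检查后发现，多线程运行时有小概率会出现代码被跳过的情况，导致甚至有图片没有加上后缀名。（注：线程内的代码其实不会被跳过。download() 只在 URL 倒数第 4 个字符是“.”时才提取后缀，否则 ext 为空字符串，所以后缀不是 3 个字符或末尾带查询参数的图片保存后就没有后缀名；再加上多个线程的 print 输出相互交错，看起来才像是有代码没有执行。）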

总结：再次尝试了多线程运行，虽然没有加上time()来测速，但毫无疑问多线程会更快一些。

只是这个连线程内代码都能被跳过的机制，在弄明白之前我实在有点不敢用......

作业②

要求：使用scrapy框架复现作业①。
代码：
MySpider.py:

import scrapy
from items import WeatherPhotoItem
from scrapy.selector import Selector
class Spider_weatherphoto(scrapy.Spider):
    """Spider that yields one WeatherPhotoItem per <img> tag on the start page."""

    name = "weatherspider"
    start_urls = ["http://www.weather.com.cn/"]

    def parse(self, response):
        """Yield an item for every non-empty ``<img src>`` found in the response.

        Each item stores a one-element list with the absolute image URL in
        its ``pic`` field. Errors are printed rather than raised.
        """
        try:
            # Query the response directly: it decodes the body with the
            # encoding Scrapy detected, not an assumed UTF-8.
            for src in response.xpath("//img/@src").extract():
                if not src:
                    continue
                item = WeatherPhotoItem()
                # Resolve relative and protocol-relative sources so the
                # pipeline receives a URL it can actually fetch.
                item["pic"] = [response.urljoin(src)]
                yield item
        except Exception as err:
            print(err)

pipelines.py:

import os
import urllib

class GetpicturePipeline:
    """Item pipeline that saves the image URL(s) carried by an item to ``images3``.

    Files are named ``<cnt><ext>``, where ``ext`` is taken from the URL's
    path and defaults to ``.jpg`` when the path has no extension.
    """

    cnt = 1          # number used for the next saved file (shared by all instances)
    urlstream = []   # URLs that have already been saved (shared by all instances)

    def process_item(self, item, spider):
        """Download every not-yet-saved image URL of ``item``.

        Args:
            item: Mapping-like item. Its ``pic`` field (a URL or a list of
                URLs) is used; ``url`` is accepted as a fallback key.
            spider: The spider that produced the item (unused).

        Returns:
            The item, unchanged, so later pipelines can process it.

        Download errors are printed rather than raised, and a failure on
        one URL does not prevent the remaining URLs from being saved.
        """
        # `import urllib` alone does not load the request submodule.
        import urllib.request
        from urllib.parse import urlparse

        urls = item.get("pic") or item.get("url") or []
        if isinstance(urls, str):
            urls = [urls]
        for url in urls:
            if url in GetpicturePipeline.urlstream:
                continue  # already saved once
            try:
                os.makedirs('images3', exist_ok=True)
                with urllib.request.urlopen(url) as resp:
                    data = resp.read()
                ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'
                target = os.path.join('images3', str(GetpicturePipeline.cnt) + ext)
                with open(target, "wb") as f:
                    f.write(data)
                # Record the URL and advance the counter only after a
                # successful save, so numbering has no gaps.
                GetpicturePipeline.urlstream.append(url)
                GetpicturePipeline.cnt += 1
            except Exception as err:
                print(err)
        return item

items.py:

import scrapy
class WeatherPhotoItem(scrapy.Item):
    """Item describing the image referenced by one <img> tag."""

    # Image source; the spider stores it as a one-element list.
    pic = scrapy.Field()

结果：

总结：第一次尝试了scrapy框架和xpath，实事求是地讲，xpath是一个更为优秀的查找信息的语言，但在定位上会麻烦一些...

作业③:

要求：使用scrapy框架爬取股票相关信息。
代码：
items.py:

import scrapy
class GetstockItem(scrapy.Item):
    """One row of the stock quote table."""

    index = scrapy.Field()          # serial number of the row
    code = scrapy.Field()           # stock code
    name = scrapy.Field()           # stock name
    latestPrice = scrapy.Field()    # latest quoted price
    seeSawedRange = scrapy.Field()  # price change, as a percentage
    seeSawedPrice = scrapy.Field()  # price change, as an amount
    highest = scrapy.Field()        # highest price
    lowest = scrapy.Field()         # lowest price
    today = scrapy.Field()          # today's opening price
    yesterday = scrapy.Field()      # previous closing price

pipelines.py

import prettytable as pt
class GetstockPipeline:
    """Item pipeline that collects every stock item into one PrettyTable."""

    # Class-level table shared by all instances; code outside the pipeline
    # can read it as GetstockPipeline.tb.
    tb = pt.PrettyTable(["序号", "股票代码", "股票名称", "最新报价", "涨跌幅", "涨跌额", "最高", "最低", "今开", "昨收"])

    def process_item(self, item, spider):
        """Append the item's fields as one table row and pass the item on."""
        # Item fields in the same order as the table's column headers.
        columns = (
            "index",
            "code",
            "name",
            "latestPrice",
            "seeSawedRange",
            "seeSawedPrice",
            "highest",
            "lowest",
            "today",
            "yesterday",
        )
        row = [item[column] for column in columns]
        self.tb.add_row(row)
        return item

MySpider.py

import scrapy
import re
from getstock.pipelines import GetstockPipeline
from getstock.items import GetstockItem


class MySpider(scrapy.Spider):
    """Spider that requests one page of the Eastmoney stock-list API and
    yields a GetstockItem for every stock record in the response."""

    name = "mySpider"

    # (item field, position of its value within one stock record).
    # Positions count the values of a record in the order the API lists them.
    # NOTE(review): presumably ascending field number (f2, f3, f4, f5, f6,
    # f7, f12, f14, f15, f16, f17, f18) -- confirm against a live response.
    _FIELD_POSITIONS = (
        ("code", 6),
        ("name", 7),
        ("latestPrice", 0),
        ("seeSawedRange", 1),
        ("seeSawedPrice", 2),
        ("highest", 8),
        ("lowest", 9),
        ("today", 10),
        ("yesterday", 11),
    )

    def start_requests(self):
        """Issue the single API request; parse() handles the response."""
        url = 'http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112403324490377009397_1602209502288&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18&_=1602209502289'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Yield one GetstockItem per record of the first JSON array in the body.

        The array is decoded with the ``json`` module rather than split on
        ``,`` and ``:``, so item values are real JSON values (no surrounding
        quote characters) and a comma or colon inside a value cannot shift
        the columns. After the last item, the table collected by
        GetstockPipeline is printed. Errors are printed rather than raised.
        """
        import json  # local import: only this callback needs it

        try:
            text = response.body.decode()
            start = text.find("[")
            if start == -1:
                print("no stock list found in response")
                return
            # raw_decode parses exactly one JSON value starting at `start`
            # and ignores whatever follows it (the rest of the wrapper).
            rows, _ = json.JSONDecoder().raw_decode(text, start)
            for index, row in enumerate(rows, start=1):
                values = list(row.values())
                item = GetstockItem()
                item["index"] = index
                for field, position in self._FIELD_POSITIONS:
                    item[field] = values[position]
                yield item
            # NOTE(review): this assumes the pipeline has already received
            # the items yielded above -- confirm, or print from the pipeline.
            print(GetstockPipeline.tb)
        except Exception as err:
            print(err)

总结：

这次实验进一步增强了我对scrapy框架的理解，并尝试使用了一个更为优秀的第三方库来排版输出结果。主体部分还是参考了上一次优秀作业的代码，创新度较低。

posted @ 2020-10-20 22:57 159ggg 阅读(152) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

159ggg

第三次作业

作业①:

要求：爬取中国气象网（http://www.weather.com.cn） 的图片。分别使用单线程和多线程的方式爬取。

代码：1.单线程运行：

公告

要求：爬取中国气象网（http://www.weather.com.cn）的图片。分别使用单线程和多线程的方式爬取。