Python高级应用程序设计任务

一、主题式网络爬虫设计方案（15分）

1.主题式网络爬虫名称

刀塔2热门饰品交易分析
2.主题式网络爬虫爬取的内容与数据特征分析

对热门饰品的成交价格和时间爬取
3.主题式网络爬虫设计方案概述（包括实现思路与技术难点）

用request库从商品列表获取单个商品的url，然后从商品获取名字和价格，然后根据价格做出图表已达到数据可视化持久化。

二、主题页面的结构特征分析（15分）
1.主题页面的结构特征

https://www.c5game.com/dota.html?quality=&hero=&type=&exterior=&tag=%E8%8F%A0%E8%8F%9C&page=[]

通过填写{}中的内容获取页面
2.Htmls页面解析

通过谷歌浏览器（F12）查看页面结构

3.节点（标签）查找方法与遍历方法

Ctrl+F定位信息，然后分析标签，为谷歌浏览器自带的功能
三、网络爬虫程序设计（60分）
爬虫程序主体要包括以下各部分，要附源代码及较详细注释，并在每部分程序后面提供输出结果的截图。
总代码

import requests,  matplotlib, re
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

#url定位
def ahtml(url):
    a = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=a)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except:
        return " "

#获取商品代码
def getcommodity(coms):
    for i in range(1, 3):
        html = ahtml('https://www.c5game.com/dota.html?quality=&hero=&type=&exterior=&tag=%E8%8F%A0%E8%8F%9C&page={}'.format(i))
        soup = BeautifulSoup(html, "html.parser")
        for span in soup.find_all("p", attrs="name"):
            temp = span.a.attrs['href']
            temp1 = temp.split('/')[2]
            temp2 = temp1.split('-')[0]
            coms.append(temp2)
    return coms

#获取价格
def getprice(coms):
    count = []
    html = ahtml('https://www.c5game.com/dota/history/{}.html'.format(coms))
    soup = BeautifulSoup(html, "html.parser")
    span = soup.find_all("span", attrs="ft-gold")
    i = 0
    while i < 8:
        temp = span[i].string
        count.append(eval(temp.split('￥')[1]))
        i += 1
    return count

#获取饰品名称
def getname(coms):
    name = []
    html = ahtml('https://www.c5game.com/dota/history/{}.html'.format(coms))
    soup = BeautifulSoup(html, "html.parser")
    span = soup.find_all("div", attrs="name-ellipsis")
    name.append(span[0].string)
    return name


def ChartBroken(x, y, doc):
    plt.figure()
    plt.plot(x, y)
    plt.ylabel('Price')
    plt.xlabel('Number')
    plt.title(doc)
    plt.savefig(doc, dpi=600)
    plt.show()

def ChartBar(x, y, doc):
    plt.figure()
    plt.bar(left=x, height=y, color='b', width=0.5)
    plt.ylabel('Price')
    plt.xlabel('Number')
    plt.title(doc)
    plt.savefig(doc, dpi=600)
    plt.show()


#修正pandas绘图中文乱码问题
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif'
matplotlib.rcParams['axes.unicode_minus'] = False

#主函数
def main():
    a = []
    comslist = getcommodity(a)[0:3]
    i = 0
    while i < 3:
        name = getname(comslist[i])
        print(name)
        price = getprice(comslist[i])
        print(price)
        ChartBroken([1, 2, 3, 4, 5, 6, 7, 8], price, name[0])
        ChartBar([1, 2, 3, 4, 5, 6, 7, 8], price, name[0])
        i += 1

if __name__ == '__main__':
  main()

1.爬取

#url定位
def ahtml(url):
    a = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'}
    try:
        data = requests.get(url, headers=a)
        data.raise_for_status()
        data.encoding = data.apparent_encoding
        return data.text
    except:
        return " "

#获取商品代码并将无用信息删除
def getcommodity(coms):
    for i in range(1, 3):
        html = ahtml('https://www.c5game.com/dota.html?quality=&hero=&type=&exterior=&tag=%E8%8F%A0%E8%8F%9C&page={}'.format(i))
        soup = BeautifulSoup(html, "html.parser")
        for span in soup.find_all("p", attrs="name"):
            temp = span.a.attrs['href']
            temp1 = temp.split('/')[2]
            temp2 = temp1.split('-')[0]
            coms.append(temp2)
    return coms

#获取价格
def getprice(coms):
    count = []
    html = ahtml('https://www.c5game.com/dota/history/{}.html'.format(coms))
    soup = BeautifulSoup(html, "html.parser")
    span = soup.find_all("span", attrs="ft-gold")
    i = 0
    while i < 8:
        temp = span[i].string
        count.append(eval(temp.split('￥')[1]))
        i += 1
    return count

#获取饰品名称
def getname(coms):
    name = []
    html = ahtml('https://www.c5game.com/dota/history/{}.html'.format(coms))
    soup = BeautifulSoup(html, "html.parser")
    span = soup.find_all("div", attrs="name-ellipsis")
    name.append(span[0].string)
    return name

2.对数据进行清洗和处理

#获取商品代码并将无用信息删除
def getcommodity(coms):
    for i in range(1, 3):
        html = ahtml('https://www.c5game.com/dota.html?quality=&hero=&type=&exterior=&tag=%E8%8F%A0%E8%8F%9C&page={}'.format(i))
        soup = BeautifulSoup(html, "html.parser")
        for span in soup.find_all("p", attrs="name"):
            temp = span.a.attrs['href']
            temp1 = temp.split('/')[2]
            temp2 = temp1.split('-')[0]
            coms.append(temp2)
    return coms

3.数据分析与可视化
（例如：数据柱形图、直方图、散点图、盒图、分布图、数据回归分析等）

4.数据持久化

#数据保存
def ChartBroken(x, y, doc):
    plt.figure()
    plt.plot(x, y)
    plt.ylabel('Price')
    plt.xlabel('Times')
    plt.title(doc)
    plt.savefig(doc, dpi=600)
    plt.show()

def ChartBar(x, y, doc):
    plt.figure()
    plt.bar(left=x, height=y, color='b', width=0.5)
    plt.ylabel('Price')
    plt.xlabel('Times')
    plt.title(doc)
    plt.savefig(doc, dpi=600)
    plt.show()

四、结论（10分）
1.经过对主题数据的分析与可视化，可以得到哪些结论？

对于DOTA2饰品的价格，可以看到一个明显的上升，对比游戏版本更新，可以得出结论。

在游戏版本初，英雄的改动将导致价格和需求的上升。
2.对本次程序设计任务完成的情况做一个简单的小结。

看视频一步一步学，但是还是有点困难，要一直百度错误信息

posted on 2019-12-15 19:02 笑天征阅读(338) 评论(0) 编辑收藏举报