python爬取福布斯富人榜并进行数据可视化

python爬取福布斯富人榜并进行数据可视化

一,选题背景

如今,人类社会已经进入了大数据时代,数据已经成为必不可少的部分,可见数据的获取非常重要,而数据的获取的方式大概有下面几种。

1、企业生产的数据,大型互联网公司有海量的用户,所以他们积累数据有天然的优势

2、数据管理资讯公司

3、政府/机构提供的公开数据

4、第三方数据平台购买数据爬虫爬取数据

二,爬虫设计方案

1、项目名称:python爬取福布斯富人榜并进行数据可视化

2、数据的获取与数据特点的分析

三,结构特征分析

1、页面解析:

 

 

 四,程序设计

1、读取爬取页面链接结构

 

1 def loadalldata():
2    alldata = []
3    for i in range(1,16,1):
4       url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
5       data = loaddata(url)
6       alldata = alldata + data
7    return alldata

 

2、读取一页的数据

 

 

 1 def loaddata(url):
 2    from bs4 import BeautifulSoup
 3    import requests
 4    headers = {
 5        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
 6                     'Chrome/72.0.3626.121 Safari/537.36'
 7    }
 8    f = requests.get(url,headers=headers)
 9    soup = BeautifulSoup(f.content, "lxml")
10    # print(f.content.decode())
11    ranktable = soup.find_all('table',class_="rank-table" )[0]
12    trlist = ranktable.find_all('tr')
13    trlist.pop(0)
14    persionlist = []
15    for tr in trlist:
16       persion = {}
17       persion['num'] = tr.find_all('td')[0].string
18       persion['name'] = tr.find_all('td')[1].p.string
19       persion['money'] = tr.find_all('td')[2].string
20       persion['company'] = tr.find_all('td')[3].string
21       persion['country'] = tr.find_all('td')[4].a.string
22       persionlist.append(persion)
23    print("页面"+url+"爬取成功")
24    return persionlist

 

 

 

3、将爬取到的数据保存到本地Excel文件

 

 

 1 def savedata(path,persionlist):
 2    import xlwt
 3    workbook = xlwt.Workbook()
 4    worksheet = workbook.add_sheet('test')
 5    worksheet.write(0, 0, '排名')
 6    worksheet.write(0, 1, '姓名')
 7    worksheet.write(0, 2, '财富')
 8    worksheet.write(0, 3, '企业')
 9    worksheet.write(0, 4, '国家')
10    for i in range(1,len(persionlist)+1,1):
11       worksheet.write(i, 0, persionlist[i-1]['num'])
12       worksheet.write(i, 1, persionlist[i-1]['name'])
13       worksheet.write(i, 2, persionlist[i-1]['money'])
14       worksheet.write(i, 3, persionlist[i-1]['company'])
15       worksheet.write(i, 4, persionlist[i-1]['country'])
16    workbook.save(path)
17    print("数据保存成功:"+path)

 

 

 

4、取出排行榜前十的姓名和财富数据 以两个list返回

 

 

 1 def loadtop10(path):
 2     import xlrd
 3     book = xlrd.open_workbook(path)
 4     sheet1 = book.sheets()[0]
 5     namelist = sheet1.col_values(1)
 6     moneylist = sheet1.col_values(2)
 7     namelist = namelist[1:11]
 8     moneylist = moneylist[1:11]
 9  
10     moneylist2 = []
11     for a in moneylist:
12         a = int(a[0:-3])
13         moneylist2.append(a)
14     print("取出排行榜前十的姓名和财富数据")
15     print(namelist)
16     print(moneylist2)
17     return namelist,moneylist2
18 def countcountrynum(path):
19    import xlrd
20    book = xlrd.open_workbook(path)
21    sheet1 = book.sheets()[0]
22    countrylist = sheet1.col_values(4)[1:-1]
23    print(countrylist)
24    countryset = list(set(countrylist))
25    dictlist = []
26    for country in countryset:
27       obj = {"name":country,"count":0}
28       dictlist.append(obj)
29    for obj in dictlist:
30       for a in countrylist:
31          if obj['name'] == a:
32             obj['count'] = obj['count'] + 1
33    print(dictlist)
34    for i in range(0,len(dictlist),1):
35       for j in range(0,len(dictlist)-i-1,1):
36           if dictlist[j]['count'] < dictlist[j+1]['count']:
37              temp = dictlist[j]
38              dictlist[j] = dictlist[j+1]
39              dictlist[j+1] = temp
40    dictlist2 = dictlist[0:5]
41    set2 = []
42    for a in dictlist2:
43       set2.append(a['name'])
44    othercount = 0;
45    for a in dictlist:
46       if a['name'] not in set2:
47          othercount = othercount + 1
48    dictlist2.append({"name":"其他","count":othercount})
49    print('获取排行榜中每个国家的上榜人数')
50    print(dictlist2)
51    return dictlist2
52 def drow():
53    import matplotlib.pyplot as plt
54    plt.rcParams['font.sans-serif'] = ['SimHei']
55    plt.figure('福布斯前十榜',figsize=(15,5))
56    listx,listy = loadtop10('rank.xls')
57    plt.title('福布斯前十榜', fontsize=16)
58    plt.xlabel('人物', fontsize=14)
59    plt.ylabel('金额/亿美元', fontsize=14)
60    plt.tick_params(labelsize=10)
61    plt.grid(linestyle=':', axis='y')
62    a = plt.bar(listx, listy, color='dodgerblue', label='Apple', align='center')
63    for i in a:
64       h = i.get_height()
65       plt.text(i.get_x() + i.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')
66    dictlist = countcountrynum("rank.xls")
67    plt.figure('各国家上榜人数所占比例')
68    labels = []
69    sizes = []
70    for a in dictlist:
71       labels.append(a['name'])
72       sizes.append(a['count'])
73    explode = (0.1, 0, 0, 0, 0, 0)
74    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
75    plt.title("各国家上榜人数所占比例", fontsize=16)
76    plt.axis('equal')
77  
78    plt.show()

 

 

 

5、完整代码

 

  1  ## 读取一页的数据
  2 def loaddata(url):
  3    from bs4 import BeautifulSoup
  4    import requests
  5    headers = {
  6        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) '
  7                     'Chrome/72.0.3626.121 Safari/537.36'
  8    }
  9    f = requests.get(url,headers=headers)   #Get该网页从而获取该html内容
 10    soup = BeautifulSoup(f.content, "lxml")  #用lxml解析器解析该网页的内容, 好像f.text也是返回的html
 11    # print(f.content.decode())        #尝试打印出网页内容,看是否获取成功
 12    ranktable = soup.find_all('table',class_="rank-table" )[0]   #获取排行榜表格
 13    trlist = ranktable.find_all('tr') #获取表格中所有tr标签
 14    trlist.pop(0) #去掉第一个元素
 15    persionlist = []
 16    for tr in trlist:
 17       persion = {}
 18       persion['num'] = tr.find_all('td')[0].string  #编号
 19       persion['name'] = tr.find_all('td')[1].p.string #名称
 20       persion['money'] = tr.find_all('td')[2].string #财产
 21       persion['company'] = tr.find_all('td')[3].string #企业
 22       persion['country'] = tr.find_all('td')[4].a.string #国家
 23       persionlist.append(persion)
 24    print("页面"+url+"爬取成功")
 25    return persionlist
 26  
 27  
 28 ## 读取所有福布斯排行榜数据
 29 def loadalldata():
 30    alldata = []
 31    for i in range(1,16,1):
 32       url = "https://www.phb123.com/renwu/fuhao/shishi_"+str(i)+".html"
 33       data = loaddata(url)
 34       alldata = alldata + data
 35    return alldata
 36  
 37 ## 将爬取的数据保存到文件
 38 def savedata(path,persionlist):
 39    import xlwt
 40    workbook = xlwt.Workbook()
 41    worksheet = workbook.add_sheet('test')
 42    worksheet.write(0, 0, '排名')
 43    worksheet.write(0, 1, '姓名')
 44    worksheet.write(0, 2, '财富')
 45    worksheet.write(0, 3, '企业')
 46    worksheet.write(0, 4, '国家')
 47    for i in range(1,len(persionlist)+1,1):
 48       worksheet.write(i, 0, persionlist[i-1]['num'])
 49       worksheet.write(i, 1, persionlist[i-1]['name'])
 50       worksheet.write(i, 2, persionlist[i-1]['money'])
 51       worksheet.write(i, 3, persionlist[i-1]['company'])
 52       worksheet.write(i, 4, persionlist[i-1]['country'])
 53    workbook.save(path)
 54    print("数据保存成功:"+path)
 55  
 56 ## 取出排行榜前十的姓名和财富数据 以两个list返回
 57 def loadtop10(path):
 58     import xlrd
 59     book = xlrd.open_workbook(path)
 60     sheet1 = book.sheets()[0]
 61     namelist = sheet1.col_values(1)
 62     moneylist = sheet1.col_values(2)
 63     namelist = namelist[1:11]
 64     moneylist = moneylist[1:11]
 65  
 66     moneylist2 = []
 67     for a in moneylist:
 68         a = int(a[0:-3])
 69         moneylist2.append(a)
 70     print("取出排行榜前十的姓名和财富数据")
 71     print(namelist)
 72     print(moneylist2)
 73     return namelist,moneylist2
 74  
 75 ## 统计排行榜中每个国家的上榜人数 以字典list返回
 76 def countcountrynum(path):
 77    import xlrd
 78    book = xlrd.open_workbook(path)
 79    sheet1 = book.sheets()[0]
 80    countrylist = sheet1.col_values(4)[1:-1]
 81    print(countrylist)
 82    countryset = list(set(countrylist))
 83    dictlist = []
 84    for country in countryset:
 85       obj = {"name":country,"count":0}
 86       dictlist.append(obj)
 87    ## 统计出每个国家对应的数量
 88    for obj in dictlist:
 89       for a in countrylist:
 90          if obj['name'] == a:
 91             obj['count'] = obj['count'] + 1
 92    print(dictlist)
 93    ## 将dictlist排序 数量多的放前面 8 5 6 9 3 2 4
 94    for i in range(0,len(dictlist),1):
 95       for j in range(0,len(dictlist)-i-1,1):
 96           if dictlist[j]['count'] < dictlist[j+1]['count']:
 97              temp = dictlist[j]
 98              dictlist[j] = dictlist[j+1]
 99              dictlist[j+1] = temp
100    dictlist2 = dictlist[0:5]
101    set2 = []
102    for a in dictlist2:
103       set2.append(a['name'])
104    othercount = 0;
105    for a in dictlist:
106       if a['name'] not in set2:
107          othercount = othercount + 1
108    dictlist2.append({"name":"其他","count":othercount})
109    print('获取排行榜中每个国家的上榜人数')
110    print(dictlist2)
111    return dictlist2
112  
113 ## 绘制条形图和饼状图
114 def drow():
115    import matplotlib.pyplot as plt
116    plt.rcParams['font.sans-serif'] = ['SimHei'] # 设置中文字体
117    plt.figure('福布斯前十榜',figsize=(15,5))
118  
119    ## 读取福布斯排行榜前十的数据
120    listx,listy = loadtop10('rank.xls')
121  
122    plt.title('福布斯前十榜', fontsize=16)
123    plt.xlabel('人物', fontsize=14)
124    plt.ylabel('金额/亿美元', fontsize=14)
125    plt.tick_params(labelsize=10)
126    plt.grid(linestyle=':', axis='y')
127    a = plt.bar(listx, listy, color='dodgerblue', label='Apple', align='center')
128    # 设置标签
129    for i in a:
130       h = i.get_height()
131       plt.text(i.get_x() + i.get_width() / 2, h, '%d' % int(h), ha='center', va='bottom')
132    ## -------------------------------------------------------------------------
133    dictlist = countcountrynum("rank.xls")
134    plt.figure('各国家上榜人数所占比例')
135    labels = []
136    sizes = []
137    for a in dictlist:
138       labels.append(a['name'])
139       sizes.append(a['count'])
140    explode = (0.1, 0, 0, 0, 0, 0)
141    plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
142    plt.title("各国家上榜人数所占比例", fontsize=16)
143    plt.axis('equal')  # 该行代码使饼图长宽相等
144  
145    plt.show()
146  
147 if __name__ == '__main__':
148  
149    ## 爬取数据
150    data = loadalldata()
151    ## 保存数据
152    savedata("rank.xls",data)    # py文件同级目录创建rank.xls文件
153    ## 展示数据
154    drow()

 

五,效果实现

1、获取到数据的本地Excel文件

 

2、福布斯排行榜前十的人物数据可视化效果

 

 

3、各个国家上榜人数与整体相比所占比例的统计与可视化

 

 

六,项目总结

1、得到的结论

根据饼图发现,美国上榜人数最多,然后中国紧随其后,其次是德国、俄罗斯、印度……等等国家。根据柱状图发现,福布斯排行榜的大部分都所属美国,只有第三位与第十位是法国人。

 

 2、收获

这次爬虫项目的设计与实施让我学会了爬取网站的信息与获取的数据分析。因为这次的项目就很不熟练导致到处碰壁,所以需要学会的知识点还有很多,还需要不断的学习与改进。

 

posted @ 2021-06-26 01:47  章浩斌  阅读(768)  评论(0编辑  收藏  举报