Python数据分析
一、爬取部分豆瓣top250
1.1 利用 XPath
import requests
from lxml import etree


def getData(number):
    """Fetch one page of the Douban Top 250 list and print what it finds.

    ``number`` is substituted into the ``start`` query parameter of the
    request URL. The page is parsed with lxml and the ``alt`` attribute of
    every poster image under the ``grid_view`` list is printed as one list.
    Nothing is returned.
    """
    page_url = "https://movie.douban.com/top250?start={0}".format(number)
    reply = requests.get(
        page_url,
        timeout=10,
        headers={"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'},
    )
    reply.encoding = 'utf-8'
    tree = etree.HTML(reply.text, parser=etree.HTMLParser(encoding='utf-8'))
    # The XPath below is tied to the page structure; adapt it if the markup differs.
    print(tree.xpath('//ol[@class="grid_view"]/li/div[@class="item"]/div[@class="pic"]/a/img/@alt'))


if __name__ == '__main__':
    # 25 movies per page; walk the first 10 pages (start = 0, 25, ..., 225).
    for page_index in range(10):
        getData(page_index * 25)
1.2 re 搭配 bs4 爬取并存入 Excel
import re

from bs4 import BeautifulSoup
import requests
import xlwt

# Patterns applied to the HTML of a single list item:
# findLink captures the value of an <img> tag's src attribute,
# findTitle captures the value of its alt attribute.
findLink = re.compile(r'<img.*src="(.*?)"')
findTitle = re.compile(r'<img alt="(.*?)"')

# NOTE(review): `start=` is left empty, so presumably only the first page is
# fetched - confirm whether paging over several offsets was intended.
url = "https://movie.douban.com/top250?start="
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
# Without a timeout, requests waits indefinitely on an unresponsive server.
html = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(html.text, "html.parser")

# Maps the captured alt text (title) to the captured src (link).
dic = {}
for item in soup.find_all('div', class_='item'):
    item_html = str(item)
    title = findTitle.findall(item_html)
    link = findLink.findall(item_html)
    # findall returns a list; skip items where either pattern found nothing
    # instead of failing with IndexError on the [0] lookup.
    if not title or not link:
        continue
    dic[title[0]] = link[0]

workbook = xlwt.Workbook(encoding='utf-8')  # create the workbook object
worksheet = workbook.add_sheet('sheetMovie')  # create the worksheet
# Header row: write(row, column, value), so these fill row 0, columns 0 and 1.
worksheet.write(0, 0, '电影名称')
worksheet.write(0, 1, '链接')
# Data rows start at row 1, directly below the header.
for i, (movie_title, movie_link) in enumerate(dic.items(), start=1):
    worksheet.write(i, 0, movie_title)
    worksheet.write(i, 1, movie_link)
workbook.save('movie_message.xls')
二、Matplotlib
2.1绘制基本图形(折线、散点、条形)
from matplotlib import pyplot as plt
import matplotlib

# Sample data: twelve x positions (2, 4, ..., 24) and one y value for each.
x = range(2, 26, 2)
y = [15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]
# Font loaded from a file so that the Chinese labels below can be rendered.
myfont = matplotlib.font_manager.FontProperties(fname="simkai.ttf")

# Set the figure size.
plt.figure(figsize=(20, 8), dpi=80)

# Adjust the axis ticks: 2.0 through 24.0 in steps of 0.5.
x_labels = [half_step / 2 for half_step in range(4, 49)]
plt.xticks(x_labels)

# Add the descriptive texts (x label, y label, title) using the Chinese font.
for set_text, text in (
    (plt.xlabel, '我是X'),
    (plt.ylabel, '我是Y'),
    (plt.title, '我是标题'),
):
    set_text(text, fontproperties=myfont)

# Draw a line chart.
plt.plot(x, y, label='我是图例')

# Alternative: draw a scatter plot instead.
# plt.scatter(x, y, label='我是图例')

# Alternative: draw a bar chart instead.
# plt.bar(range(len(x)), y, label='我是图例', width=0.3)
# Put the x values on the x axis as tick labels.
# plt.xticks(range(len(x)), x, fontproperties=myfont)

plt.legend(prop=myfont)  # font used for the legend
# Save the figure; the SVG format could be used to avoid jagged edges when zoomed.
plt.savefig("./sig_size.png")
plt.show()
三、Numpy
3.1创建数组
import numpy as np

# Three ways to create a NumPy array: from a list, from a range, and with arange.
t1 = np.array([1, 2, 3])
t2 = np.array(range(10))
t3 = np.arange(12)

# Show each array together with its type.
for created in (t1, t2, t3):
    print(created, type(created))

# View the 12 elements as 3 rows by 4 columns.
print(t3.reshape(3, 4))
3.2读取数据、索引、修改
import numpy as np

us_path = "./US_data_numbers.csv"

# Split each line on commas and parse the fields as int.
t1 = np.loadtxt(us_path, delimiter=",", dtype="int")

# Row selection: a single row, a run of consecutive rows, several separate rows.
for row_selector in (2, slice(2, None), [0, 2]):
    print(t1[row_selector])

# Column selection: a single column, consecutive columns, several separate columns.
for column_selector in (1, slice(1, None), [0, 2]):
    print(t1[:, column_selector])

# Modifying values in place.
t2 = np.arange(12).reshape(3, 4)
print(t2)
# Zero out columns 1 and 2 of row 1.
t2[1, [1, 2]] = 0
print(t2)
# Replace every element greater than 5 with 66 through a boolean mask.
above_five = t2 > 5
t2[above_five] = 66
print(t2)
3.3两组数据拼接
import numpy as np

us_path = "./US_data_numbers.csv"
uk_path = "./UK_data_numbers.csv"

# Split each line on commas and parse the fields as int.
t1 = np.loadtxt(us_path, delimiter=",", dtype="int")
t2 = np.loadtxt(uk_path, delimiter=",", dtype="int")

# Build one marker column per dataset: all zeros for t1, all ones for t2.
# The shape must be passed as a single tuple; np.zeros(n, 1) would treat 1 as
# the dtype argument and raise TypeError. Reusing the data's dtype keeps the
# stacked result integer instead of promoting it to float.
zeros_data = np.zeros((t1.shape[0], 1), dtype=t1.dtype)
ones_data = np.ones((t2.shape[0], 1), dtype=t2.dtype)
# Append the marker column on the right of each dataset.
t1 = np.hstack((t1, zeros_data))
t2 = np.hstack((t2, ones_data))
# Stack the two datasets vertically (t1 rows first, then t2 rows).
final_data = np.vstack((t1, t2))
四、Pandas
4.1Series创建、索引、缺失值
import pandas as pd

# Series constructor signature, for reference:
# pd.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)

# A Series built from a dict uses the dict keys as its index.
temp_dict = {"name": "xiaohong", "age": 30, "tel": 10086}
t1 = pd.Series(temp_dict)
print(t1)

# An index can be supplied explicitly; without one, an implicit index
# starting at 0 is generated.
temp_list = [1, 3, 5, 7, 9, 9]
t2 = pd.Series(temp_list)
print(t2)

print(t2.head(n=3))  # first 3 rows; n defaults to 5 when omitted
print(t2.tail(n=3))  # last 3 rows; n defaults to 5 when omitted

print(t2.unique())  # distinct values

has_value = t2.notnull()
print(t2[has_value])  # only the rows that are not null
is_missing = t2.isnull()
print(t2[is_missing])  # only the rows that are null
4.2DataFrame创建
import pandas as pd
import numpy as np

# DataFrame from a dict of equal-length lists: each key becomes a column.
d1 = {"name": ["xiaohong", "xiaoming"], "age": [18, 20], "sex": ["男", "女"]}
result = pd.DataFrame(d1)
print(result)

# DataFrame from a 3x4 array with explicit row labels and column names.
row_labels = list("ABC")
column_names = list("WXYZ")
d2 = pd.DataFrame(np.arange(12).reshape(3, 4), index=row_labels, columns=column_names)
print(d2)
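结果如图(原图缺失):第一个 DataFrame 为 2 行 3 列,列名为 name、age、sex;第二个 DataFrame 为 3 行 4 列,行索引为 A、B、C,列名为 W、X、Y、Z,取值依次为 0 到 11。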
五、Openpyxl
import openpyxl

filename = "./test_data.xlsx"
inwb = openpyxl.load_workbook(filename)
# Use the `sheetnames` property and workbook indexing; the older
# get_sheet_names() / get_sheet_by_name() methods are deprecated.
sheetnames = inwb.sheetnames
ws = inwb[sheetnames[0]]  # first sheet in the workbook

print(ws.cell(row=4, column=3).value)

# Report the (row, column) position of every cell whose value equals 0.
# NOTE(review): rows 4-327 are hard-coded, presumably the data extent of this
# particular workbook - confirm before reusing with another file.
for i in range(4, 328):
    # max_column is the 1-based index of the last column, so the range needs
    # +1 to include that column in the scan.
    for j in range(3, ws.max_column + 1):
        if ws.cell(row=i, column=j).value == 0:  # value at row i, column j
            print(i, j)

浙公网安备 33010602011771号