Python数据分析

一、爬取豆瓣 Top250 的部分数据

1.1利用xpath

import requests
from lxml import etree


def getData(number):
    """Fetch one page of the Douban Top 250 list and print its movie titles.

    Args:
        number: Offset of the first movie on the page, sent as the
            ``start`` query parameter.

    Returns:
        The list of values read from the ``alt`` attribute of each poster
        image on the page (the same list that is printed).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = "https://movie.douban.com/top250?start={0}".format(number)
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    response = requests.get(url, timeout=10, headers=headers)
    # Fail loudly on an error page instead of silently printing an empty list.
    response.raise_for_status()
    response.encoding = 'utf-8'
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.HTML(response.text, parser=parser)
    # Adjust this XPath if the page structure differs.
    message = html.xpath('//ol[@class="grid_view"]/li/div[@class="item"]/div[@class="pic"]/a/img/@alt')
    print(message)
    return message


if __name__ == '__main__':
    # 10 pages of 25 movies each, i.e. offsets 0, 25, ..., 225.
    for page in range(10):
        getData(page * 25)
View Code

1.2 使用 re 搭配 bs4 爬取并存入 Excel

import re
from bs4 import BeautifulSoup
import requests
import xlwt

# Patterns applied to the HTML text of a single <div class="item"> element.
findLink = re.compile(r'<img.*src="(.*?)"')    # value of the image's src attribute
findTitle = re.compile(r'<img alt="(.*?)"')    # value of the image's alt attribute

url = "https://movie.douban.com/top250?start="
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
# A timeout keeps the script from hanging forever on a stalled connection.
html = requests.get(url, headers=headers, timeout=10)
# Stop on an HTTP error page rather than writing an empty spreadsheet.
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

# Maps each title to its image link.
dic = {}
for item in soup.find_all('div', class_='item'):
    item = str(item)
    title = findTitle.findall(item)
    link = findLink.findall(item)       # findall returns a list; the first hit is used below
    # Skip an entry that matches neither pattern instead of raising IndexError.
    if not title or not link:
        continue
    dic[title[0]] = link[0]

workbook = xlwt.Workbook(encoding='utf-8')  # create the workbook object
worksheet = workbook.add_sheet('sheetMovie')    # create the worksheet
worksheet.write(0, 0, '电影名称')   # write(row, column, value): header cell at row 0, column 0
worksheet.write(0, 1, '链接')
# Data rows start at row 1, directly below the header row.
for row, (name, address) in enumerate(dic.items(), start=1):
    worksheet.write(row, 0, name)
    worksheet.write(row, 1, address)
workbook.save('movie_message.xls')
View Code

 

二、Matplotlib

2.1绘制基本图形(折线、散点、条形)

 1 from matplotlib import pyplot as plt
 2 import matplotlib
 3 x = range(2, 26, 2)
 4 y = [15, 13, 14.5, 17, 20, 25, 26, 26, 24, 22, 18, 15]
 5 myfont = matplotlib.font_manager.FontProperties(fname="simkai.ttf")
 6 
 7 # 设置图片大小
 8 plt.figure(figsize=(20, 8), dpi=80)
 9 
10 # 调整轴刻度
11 x_labels = [i/2 for i in range(4, 49)]
12 plt.xticks(x_labels)
13 
14 # 添加描述信息,且显示中文字体
15 plt.xlabel('我是X', fontproperties=myfont)
16 plt.ylabel('我是Y', fontproperties=myfont)
17 plt.title('我是标题', fontproperties=myfont)
18 
19 # 绘制折线图
20 plt.plot(x, y, label='我是图例')
21 '''
22 # 绘制散点图
23 plt.scatter(x, y, label='我是图例')
24 '''
25 '''
26 # 绘制条形图
27 plt.bar(range(len(x)), y, label='我是图例', width=0.3)
28 # 设置字符串到x轴
29 plt.xticks(range(len(x)), x, fontproperties=myfont)
30 '''
31 
32 plt.legend(prop=myfont)     # 指定图例的字体
33 # 保存图片,可以使用svg格式放大不会有锯齿
34 plt.savefig("./sig_size.png")
35 plt.show()
View Code

 

三、Numpy

3.1创建数组

import numpy as np

# Three ways to create an array with NumPy
t1 = np.array([1, 2, 3])        # from a list
print(t1, type(t1))
t2 = np.array(range(10))        # from a range object
print(t2, type(t2))
t3 = np.arange(12)              # with NumPy's own arange
print(t3, type(t3))
print(t3.reshape(3, 4))     # reshape into 3 rows and 4 columns
View Code

3.2读取数据、索引、修改 

 1 import numpy as np
 2 us_path = "./US_data_numbers.csv"
 3 
 4 t1 = np.loadtxt(us_path, delimiter=",", dtype="int")    # 根据,分割,显示格式为int
 5 print(t1[2])    # 取某一行
 6 print(t1[2:])   # 取连续的行
 7 print(t1[[0, 2]])   # 取不连续的多行
 8 
 9 print(t1[:, 1])     # 取某一列
10 print(t1[:, 1:])    # 取连续的列
11 print(t1[:, [0, 2]])    # 取不连续的多列
12 
13 # 数值修改
14 t2 = np.arange(12).reshape(3, 4)
15 print(t2)
16 t2[1, [1, 2]] = 0
17 print(t2)
18 t2[t2 > 5] = 66
19 print(t2)
View Code

3.3两组数据拼接 

 1 import numpy as np
 2 us_path = "./US_data_numbers.csv"
 3 uk_path = "./UK_data_numbers.csv"
 4 
 5 t1 = np.loadtxt(us_path, delimiter=",", dtype="int")    # 根据,分割,显示格式为int
 6 t2 = np.loadtxt(uk_path, delimiter=",", dtype="int")
 7 
 8 # 构造全为0,1的数据
 9 zeros_data = np.zeros(t1.shape[0], 1)
10 ones_data = np.ones(t2.shape[0], 1)
11 # 分别添加一列全为0,1的数组
12 t1 = np.hstack((t1, zeros_data))
13 t2 = np.hstack((t2, ones_data))
14 # 按照上下拼接两组数据
15 final_data = np.vstack((t1, t2))
View Code

 

四、Pandas

4.1Series创建、索引、缺失值

 1 import pandas as pd
 2 
 3 # Series组成部分:pd.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
 4 temp_dict = {"name": "xiaohong", "age": 30, "tel": 10086}
 5 t1 = pd.Series(temp_dict)
 6 print(t1)
 7 temp_list = [1, 3, 5, 7, 9, 9]
 8 # 可以通过index指定索引,默认自动从0开始生成索引,即隐式索引
 9 t2 = pd.Series(temp_list)
10 print(t2)
11 
12 print(t2.head(3))   # 显示前3行数据,不指定n默认为5
13 print(t2.tail(3))   # 显示后3行数据,不指定n默认为5
14 
15 print(t2.unique())  # 去重
16 print(t2[t2.notnull()])     # 只显示不为空的行
17 print(t2[t2.isnull()])      # 只显示为空的行
View Code

4.2DataFrame创建 

import pandas as pd
import numpy as np

# Build a DataFrame from a dict: each key is a column name, each list a column.
d1 = {"name": ["xiaohong", "xiaoming"], "age": [18, 20], "sex": ["", ""]}
result = pd.DataFrame(d1)
print(result)

# Build a DataFrame from a 3x4 array with explicit row and column labels.
d2 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['A', 'B', 'C'],
    columns=['W', 'X', 'Y', 'Z'],
)
print(d2)

结果如图 

 

五、Openpyxl

 1 import openpyxl
 2 
 3 filename = "./test_data.xlsx"
 4 inwb = openpyxl.load_workbook(filename)
 5 sheetnames = inwb.get_sheet_names()
 6 ws = inwb.get_sheet_by_name(sheetnames[0])      # 获取第一个sheet内容
 7 
 8 print(ws.cell(row=4, column=3).value)
 9 
10 for i in range(4, 328):
11     for j in range(3, ws.max_column):
12         if ws.cell(row=i, column=j).value == 0:     # 代表第i行第j列的数值
13             print(i, j)
View Code

 

posted @ 2021-09-03 16:29  Ocean允许访问  阅读(91)  评论(0)    收藏  举报