Seaborn
数据分析 第八讲 Seaborn

一、Seaborn介绍
Seaborn
什么是Seaborn
1.Python中的一个制图工具库,可以制作出吸引人的、信息量大的统计图
2.在matplotlib上构建,支持numpy和pandas的数据结构可视化
3.Seaborn比Matplotlib更简洁易用
Seaborn网站:http://seaborn.pydata.org/

4.win7系统安装seaborn
pip install seaborn -i https://pypi.douban.com/simple --trusted-host pypi.douban.com

二、Seaborn特点
1.多个颜色主题
2.可视化单变量、二维变量用于比较数据集中各变量的分布情况
3.可视化线性回归模型中的变量
4.可视化矩阵数据,通过聚类算法探究矩阵间的结构
5.可视化时间序列数据及不确定性
6.可在分割区域制图,用于复杂的可视化
三、seaborn的简单使用
1.使用seaborn绘图-散点图
需求:画一个花瓣(petal)和花萼(sepal)长度的散点图,并且点的颜色要区分鸢尾花的种类
sns.lmplot()
# seaborn的简单使用
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# Goal: scatter plot of sepal length against petal length, with the points
# coloured by iris species.
iris = pd.read_csv('./iris.csv')
# Structure of the frame as reported by iris.info():
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SepalLength 150 non-null float64
1 SepalWidth 150 non-null float64
2 PetalLength 150 non-null float64
3 PetalWidth 150 non-null float64
4 Name 150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
'''
# iris.loc[:, 'Name'].unique() -> ['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
# lmplot draws a regression line by default (fit_reg=True); it is turned off
# here so that only the points remain. hue='Name' gives each species its own colour.
scatter_options = dict(hue='Name', fit_reg=False, markers='.')
sns.lmplot(x='SepalLength', y='PetalLength', data=iris, **scatter_options)
plt.show()

2.单变量分布
核密度估计图
1.核密度估计(kernel density estimation)是概率论中用来估计未知概率密度函数的一种非参数方法
2.通过核密度估计图可以比较直观的看出数据样本本身的分布特征
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Draw only 4 values from the standard normal distribution and plot their
# kernel density estimate.
tiny_sample = np.random.normal(loc=0.0, scale=1.0, size=4)
sns.kdeplot(tiny_sample)
plt.show()

import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Same plot as before, but with 1000 standard-normal draws instead of 4.
large_sample = np.random.normal(loc=0.0, scale=1.0, size=1000)
sns.kdeplot(large_sample)
plt.show()

绘制直方图和密度图
distplot集合了直方图与核函数估计的功能
- kde = True 显示核密度估计曲线
- hist = True 显示直方图
- rug = True 显示密度观察条
# seaborn的简单使用
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# 1000 standard-normal draws wrapped in a pandas Series.
values = pd.Series(np.random.randn(1000))
# A plain matplotlib histogram of the same data would be: plt.hist(values)
# distplot overlays three layers in a single call:
#   hist=True -> histogram, kde=True -> kernel density curve, rug=True -> rug marks
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot - confirm the installed version before changing the call.
layers = {'kde': True, 'hist': True, 'rug': True}
sns.distplot(values, **layers)
# Density curve only, filled and red: sns.kdeplot(values, shade=True, color='r')
plt.show()

3.双变量分布
sns.jointplot(x,y,data,kind)
- x,y二维数据,向量或字符串
- data,如果x,y是字符串data应该为DataFrame
- kind='scatter',默认,二维散点图
- kind='hex',二维直方图
- kind='kde',二维核密度估计图
# seaborn的简单使用
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Two independent columns of 500 standard-normal draws each (x is drawn first).
xs = np.random.randn(500)
ys = np.random.randn(500)
points = pd.DataFrame({'x': xs, 'y': ys})
# jointplot variants, selected through `kind`:
#   default        -> 2-D scatter plot:  sns.jointplot(x='x', y='y', data=points)
#   kind='hex'     -> 2-D histogram:     sns.jointplot(x='x', y='y', data=points, kind='hex')
#   kind='kde'     -> 2-D density plot (the one drawn below)
sns.jointplot(data=points, x='x', y='y', kind='kde')
plt.show()



4.绘制热力图和柱状图
# 绘制热力图和柱状图
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
'''
name : str
Name of the dataset (``{name}.csv`` on
https://github.com/mwaskom/seaborn-data).'''
# The excerpt above is the load_dataset parameter description: the data set is
# fetched online by name.
flights_long = sns.load_dataset('flights')
# flights_long.info() printed:
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 144 non-null int64
1 month 144 non-null category
2 passengers 144 non-null int64
dtypes: category(1), int64(2)
memory usage: 2.9 KB
None'''
# flights_long.head() printed:
'''
year month passengers
0 1949 Jan 112
1 1949 Feb 118
2 1949 Mar 132
3 1949 Apr 129
4 1949 May 121'''
# Reshape from long to wide: one row per month, one column per year, with the
# passenger counts as cell values.
df = flights_long.pivot(values='passengers', index='month', columns='year')
# df.head() after the pivot:
'''
year 1949 1950 1951 ... 1958 1959 1960
month ...
Jan 112 115 145 ... 340 360 417
Feb 118 126 150 ... 318 342 391
Mar 132 141 178 ... 362 406 419
Apr 129 135 163 ... 348 396 461
May 121 125 172 ... 363 420 472'''
# Heat map variants of the pivoted table (uncomment one to try it):
# sns.heatmap(df)
# sns.heatmap(df, annot=True, fmt='d')  # annot=True writes the value into each cell, fmt='d' formats it as an integer
# sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu')  # cmap='YlGnBu' changes the colour map
# Column-wise sum of the pivoted table, i.e. the passenger total for each year.
s = df.sum()
# print(s) gives:
'''
year
1949 1520
1950 1676
1951 2042
1952 2364
1953 2700
1954 2867
1955 3408
1956 3939
1957 4421
1958 4572
1959 5140
1960 5714
dtype: int64'''
x = s.index
y = s.values
# Bar chart of the yearly totals.
# x and y are passed by keyword: positional use triggers a FutureWarning on
# seaborn 0.11 and a TypeError on 0.12+, where they are keyword-only. The
# keyword form works on every version.
sns.barplot(x=x, y=y)
plt.show()




5.设置显示效果
style = ['darkgrid', 'dark', 'white', 'whitegrid', 'ticks']
设置风格
sns.set_style(style[0], {'grid.color': 'red'})
当前风格的参数
print(sns.axes_style())
context = ['paper', 'notebook', 'talk', 'poster']
sns.set_context(context[0], rc={'grid.linewidth': 2.0})
print(sns.plotting_context())
# 设置显示效果
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
# Built-in seaborn style names; the first one is applied below.
style_names = ['darkgrid', 'dark', 'white', 'whitegrid', 'ticks']
chosen_style = style_names[0]
# Apply the style and override the grid colour.
sns.set_style(chosen_style, {'grid.color': 'red'})
# Parameters of the current style, as printed by sns.axes_style():
'''
{'axes.facecolor': '#EAEAF2', 'axes.edgecolor': 'white', 'axes.grid': True, 'axes.axisbelow': True, 'axes.labelcolor': '.15', 'figure.facecolor': 'white', 'grid.color': 'red', 'grid.linestyle': '-', 'text.color': '.15', 'xtick.color': '.15', 'ytick.color': '.15', 'xtick.direction': 'out', 'ytick.direction': 'out', 'lines.solid_capstyle': 'round', 'patch.edgecolor': 'w', 'patch.force_edgecolor': True, 'image.cmap': 'rocket', 'font.family': ['sans-serif'], 'font.sans-serif': ['Arial', 'DejaVu Sans', 'Liberation Sans', 'Bitstream Vera Sans', 'sans-serif'], 'xtick.bottom': False, 'xtick.top': False, 'ytick.left': False, 'ytick.right': False, 'axes.spines.left': True, 'axes.spines.bottom': True, 'axes.spines.right': True, 'axes.spines.top': True}'''
# Built-in plotting contexts; again the first one is applied, with a custom
# grid line width.
context_names = ['paper', 'notebook', 'talk', 'poster']
chosen_context = context_names[0]
sns.set_context(chosen_context, rc={'grid.linewidth': 2.0})
# Output of sns.plotting_context().
# NOTE(review): the pasted output below shows 'grid.linewidth': 3.0 although the
# call above passes 2.0 - it was presumably captured from a different run.
'''
{'font.size': 9.600000000000001, 'axes.labelsize': 9.600000000000001, 'axes.titlesize': 9.600000000000001, 'xtick.labelsize': 8.8, 'ytick.labelsize': 8.8, 'legend.fontsize': 8.8, 'axes.linewidth': 1.0, 'grid.linewidth': 3.0, 'lines.linewidth': 1.2000000000000002, 'lines.markersize': 4.800000000000001, 'patch.linewidth': 0.8, 'xtick.major.width': 1.0, 'ytick.major.width': 1.0, 'xtick.minor.width': 0.8, 'ytick.minor.width': 0.8, 'xtick.major.size': 4.800000000000001, 'ytick.major.size': 4.800000000000001, 'xtick.minor.size': 3.2, 'ytick.minor.size': 3.2, 'legend.title_fontsize': 9.600000000000001}'''
'''
name : str
Name of the dataset (``{name}.csv`` on
https://github.com/mwaskom/seaborn-data).'''
# The excerpt above is the load_dataset parameter description: the data set is
# fetched online by name.
flights_long = sns.load_dataset('flights')
# flights_long.info() printed:
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 144 non-null int64
1 month 144 non-null category
2 passengers 144 non-null int64
dtypes: category(1), int64(2)
memory usage: 2.9 KB
None'''
# flights_long.head() printed:
'''
year month passengers
0 1949 Jan 112
1 1949 Feb 118
2 1949 Mar 132
3 1949 Apr 129
4 1949 May 121'''
# Reshape from long to wide: one row per month, one column per year, with the
# passenger counts as cell values.
df = flights_long.pivot(values='passengers', index='month', columns='year')
# df.head() after the pivot:
'''
year 1949 1950 1951 ... 1958 1959 1960
month ...
Jan 112 115 145 ... 340 360 417
Feb 118 126 150 ... 318 342 391
Mar 132 141 178 ... 362 406 419
Apr 129 135 163 ... 348 396 461
May 121 125 172 ... 363 420 472'''
# Heat map variants of the pivoted table (uncomment one to try it):
# sns.heatmap(df)
# sns.heatmap(df, annot=True, fmt='d')  # annot=True writes the value into each cell, fmt='d' formats it as an integer
# sns.heatmap(df, annot=True, fmt='d', cmap='YlGnBu')  # cmap='YlGnBu' changes the colour map
# Column-wise sum of the pivoted table, i.e. the passenger total for each year.
s = df.sum()
# print(s) gives:
'''
year
1949 1520
1950 1676
1951 2042
1952 2364
1953 2700
1954 2867
1955 3408
1956 3939
1957 4421
1958 4572
1959 5140
1960 5714
dtype: int64'''
x = s.index
y = s.values
# Bar chart of the yearly totals, drawn with the style/context set earlier in
# this snippet.
# x and y are passed by keyword: positional use triggers a FutureWarning on
# seaborn 0.11 and a TypeError on 0.12+, where they are keyword-only. The
# keyword form works on every version.
sns.barplot(x=x, y=y)
plt.show()



浙公网安备 33010602011771号