#Seaborn #用matplotlib封装来显示数据显示模板 import seaborn as sns import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline #inline直接把图显示在book上
## Overall layout / style settings
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")


def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current axes.

    Args:
        flip: Multiplier applied to every curve; pass -1 to mirror the
            curves about the x-axis.
    """
    xs = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for k in range(1, 7):  # six curves, amplitude factor (7 - k)
        wave = np.sin(xs + k * 0.5)
        plt.plot(xs, wave * (7 - k) * flip)


sinplot()
# print(help(np.linspace))
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")


def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current axes.

    Args:
        flip: Multiplier applied to every curve; pass -1 to mirror the
            curves about the x-axis.
    """
    xs = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for k in range(1, 7):  # six curves, amplitude factor (7 - k)
        wave = np.sin(xs + k * 0.5)
        plt.plot(xs, wave * (7 - k) * flip)


sns.set()  # switch matplotlib to seaborn's default parameters
sinplot()
## The five built-in themes: darkgrid, whitegrid, dark, white, ticks
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")

# Alternative to try (whitegrid theme, six columns):
#     sns.set_style('whitegrid')
#     data = np.random.normal(size=(20,6)) + np.arange(6) / 2
#     sns.boxplot(data = data)

sns.set_style('ticks')  # swap in each of the five theme names in turn
# sns.despine()

# 20 normal samples per column, each column shifted up by 0.5 more than
# the previous one.
column_shift = np.arange(8) / 2
data1 = np.random.normal(size=(20, 8)) + column_shift
sns.boxplot(data=data1)
## Distance between the plotted data and the axis spines
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")


def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current axes.

    Args:
        flip: Multiplier applied to every curve; pass -1 to mirror the
            curves about the x-axis.
    """
    x = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for i in range(1, 7):  # six curves
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)


# The cell previously used `data` without defining it, which raises
# NameError when the cell is run on its own. Build the sample here:
# 20 normal samples per column, each column shifted by a further 0.5.
data = np.random.normal(size=(20, 6)) + np.arange(6) / 2

# Pass the array by keyword so it is always treated as wide-form data.
sns.violinplot(data=data)

# `offset` moves the spines away from the data (here by 1);
# left=False keeps the left spine.
sns.despine(offset=1, left=False)
# Other variants to try:
# sns.despine(left=True)
# sns.despine(right=False)
# sns.despine(top=False)
## Per-subplot styles
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")


def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current axes.

    Args:
        flip: Multiplier applied to every curve; pass -1 to mirror the
            curves about the x-axis.
    """
    xs = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for k in range(1, 7):  # six curves, amplitude factor (7 - k)
        wave = np.sin(xs + k * 0.5)
        plt.plot(xs, wave * (7 - k) * flip)


# Axes created inside the `with` block use the temporary style; axes
# created after it fall back to the current global style.
# NOTE(review): the original indentation was lost; the extent of the
# `with` body is assumed to be the first subplot only - confirm.
with sns.axes_style('darkgrid'):
    plt.subplot(211)
    sinplot()
plt.subplot(212)
sinplot(-1)
# print(help(plt.subplot))
## Plotting context (overall scaling of plot elements)
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")


def sinplot(flip=1):
    """Draw six phase-shifted sine curves on the current axes.

    Args:
        flip: Multiplier applied to every curve; pass -1 to mirror the
            curves about the x-axis.
    """
    x = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for i in range(1, 7):  # six curves
        plt.plot(x, np.sin(x + i * 0.5) * (7 - i) * flip)


sns.set()  # start from seaborn's defaults

# Contexts to try: 'paper', 'notebook', 'talk', 'poster'.
# sns.set_context('paper')
# sns.set_context('talk')
# sns.set_context('notebook')

# The rc key must be spelled exactly 'lines.linewidth'; the previous key
# 'lines,linewitdh' was misspelled, so the line width was never overridden.
# NOTE(review): 55.5 is kept from the original but gives extremely thick
# lines now that the key is honoured - confirm the intended width.
sns.set_context('poster', font_scale=3.5, rc={'lines.linewidth': 55.5})

plt.figure(figsize=(18, 6))  # width, height
sinplot()
## Colour: discrete (qualitative) vs. continuous palettes
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")

# sns.set()
sns.set(rc={"figure.figsize": (6, 6)})

# --- Palettes ---------------------------------------------------------
# color_palette() accepts any colour matplotlib understands; with no
# arguments it returns the current default palette.
# set_palette() sets the palette for all subsequent plots.
current_palette = sns.color_palette()
sns.palplot(current_palette)

# --- Qualitative palettes ---------------------------------------------
# Six variations of the default colour cycle:
# deep, muted, pastel, bright, dark, colorblind.

# --- Circular colour systems ------------------------------------------
# With many categories to distinguish, the simplest approach is to draw
# evenly spaced colours in a circular colour space (hue changes while
# lightness and saturation stay constant). The most common way to do
# this uses the "hls" colour space, a simple transformation of RGB.
hls_19 = sns.color_palette("hls", 19)  # 19 evenly spaced hues
sns.palplot(hls_19)


def sinplot(flip=1):
    """Draw 17 phase-shifted sine curves on the current axes.

    The amplitude factor is (7 - k), so it reaches zero at k = 7 and is
    negative for larger k.

    Args:
        flip: Multiplier applied to every curve.
    """
    xs = np.linspace(0, 14, 100)  # 100 evenly spaced samples on [0, 14]
    for k in range(1, 18):  # k = 1..17, i.e. 17 curves
        wave = np.sin(xs + k * 0.5)
        plt.plot(xs, wave * (7 - k) * flip)


plt.figure(figsize=(20, 18))  # overall figure size

# plt.subplot(xyz): the grid has x * y cells and z is the subplot index.
# NOTE(review): the original indentation was lost; everything up to the
# boxplot is assumed to run inside the temporary palette - confirm.
with sns.color_palette('hls', 18):
    plt.subplot(221)  # first subplot
    sinplot(15)
    plt.subplot(222)  # second subplot
    data = np.random.normal(size=(20, 8)) + np.arange(8) / 2
    sns.boxplot(data=data, palette=sns.color_palette())

# hls_palette() controls lightness (l) and saturation (s) directly:
# arguments are (number of colours, lightness, saturation).
muted_hls = sns.hls_palette(16, l=.6, s=.5)
sns.palplot(muted_hls)

# "Paired" palette: colours come in light/dark pairs (presumably useful
# for paired categories, e.g. different countries - original note was a
# question).
paired_10 = sns.color_palette("Paired", 10)
sns.palplot(paired_10)
### Named colours from the xkcd colour survey
# seaborn exposes a large set of crowd-sourced colour names through the
# `sns.xkcd_rgb` dictionary, so a specific colour can be requested by name.
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")

# (colour name, y value at x = 1, line width)
named_lines = [
    ("pale red", 1, 10),
    ("medium green", 2, 3),
    ("denim blue", 3, 3),
]
for color_name, y_end, width in named_lines:
    plt.plot([0, 1], [0, y_end], sns.xkcd_rgb[color_name], lw=width)
## Sequential palettes
# Colour varies with the data, e.g. darker colours for more important values.
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")

# A trailing "_r" on a palette name reverses its colour order.
for palette_name in ("Blues", "Blues_r", "BuGn_r"):
    sns.palplot(sns.color_palette(palette_name))
## Linear palettes
# The hue changes linearly across the palette.
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

# Notebook-only: same effect as the `%matplotlib inline` magic.
get_ipython().run_line_magic("matplotlib", "inline")

helix_colors = sns.color_palette("cubehelix", 8)  # eight colours
sns.palplot(helix_colors)
## Univariate analysis | histograms | fitted distribution curve
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(color_codes=True)

# Deterministic seed derived from the characters of the section name.
seed = sum(ord(ch) for ch in "distributions")
np.random.seed(seed)

plt.figure(figsize=(20, 18))  # overall figure size

# Histogram of Gaussian samples with a fitted gamma curve.
# Keep the sample size moderate: very large samples use a lot of memory
# (the author's machine had 8 GB).
plt.subplot(311)
x = np.random.normal(size=100000)
sns.distplot(x, kde=False, fit=stats.gamma)

# Same data, with the number of bins chosen explicitly.
plt.subplot(312)
sns.distplot(x, bins=20, kde=False, fit=stats.gamma)

# Gamma-distributed samples with the same gamma fit.
plt.subplot(313)
x = np.random.gamma(6, size=10000)
sns.distplot(x, kde=False, fit=stats.gamma)
## Bivariate data from a mean and covariance | jointplot | kind | colour density
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

# The covariance matrix must be symmetric positive semi-definite.
# The previous value [(1, 5), (5, 1)] has a negative eigenvalue (-4), so
# it is not a valid covariance; .5 is the intended off-diagonal term.
mean, cov = [0, 1], [(1, .5), (.5, 1)]

# plt.figure(figsize = (9,18))
# plt.subplot(211)
data = np.random.multivariate_normal(mean, cov, 100)
df = pd.DataFrame(data, columns=["x", "y"])  # wrap the samples in a DataFrame
# df

# A scatter plot is the most direct way to see how two variables relate.
sns.jointplot(x='x', y='y', data=df)

# plt.subplot(212)
# With more points, a hexbin plot shows where the data are concentrated
# through colour intensity.
x, y = np.random.multivariate_normal(mean, cov, 1000).T
with sns.axes_style("whitegrid"):  # one of the five built-in themes
    sns.jointplot(x=x, y=y, kind="hex", color="k")
## pairplot
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

iris = sns.load_dataset("iris")  # the iris flower data set

# Four features on each axis:
#   diagonal     - distribution of each single variable
#   off-diagonal - scatter plot of each pair of variables
sns.pairplot(iris)
## Regression plots | regplot() | lmplot()
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(color_codes=True)
np.random.seed(sum(ord(ch) for ch in "regression"))

tips = sns.load_dataset("tips")
tips.head()  # not displayed: only a cell's last expression is shown

# regplot() and lmplot() can both draw a regression relationship.
# The author recommends regplot() because it accepts more input forms.
plt.figure(figsize=(9, 18))

plt.subplot(211)
sns.regplot(x="total_bill", y="tip", data=tips)

plt.subplot(212)
# x_jitter randomly nudges the plotted points along x.
sns.regplot(x="total_bill", y="tip", data=tips, x_jitter=5)

# plt.figure(figsize = (9,18))
# plt.subplot(211)
# plt.subplot(212)
# lmplot creates its own figure: no jitter, small jitter, jitter=True.
for jitter_options in ({}, {"x_jitter": .05}, {"x_jitter": True}):
    sns.lmplot(x="size", y="tip", data=tips, **jitter_options)

# TODO: the plots are drawn, but how can the fitted regression equation
# be retrieved? (open question from the original author)
## Multivariate plots
### Categorical scatter plots / stripplot()
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

plt.figure(figsize=(18, 30))

plt.subplot(511)
sns.stripplot(x="day", y="total_bill", data=tips)

# Overlapping points are common and hide the distribution (more so with
# large data sets); jitter spreads them out.
plt.subplot(512)
plt.title('Data with jitter')
sns.stripplot(x="day", y="total_bill", data=tips, jitter=True)

plt.subplot(513)
sns.swarmplot(x="day", y="total_bill", data=tips)

plt.subplot(514)
sns.swarmplot(x="day", y="total_bill", hue="sex", data=tips)

# Same plot with the axes swapped (horizontal orientation).
plt.subplot(515)
sns.swarmplot(x="total_bill", y="day", hue="sex", data=tips)

tips.head()
我们要洞悉数据背后的含义,这些可视化工具只是工具而已。重点不在于把图做得多么好看、优美,而在于研究数据背后的规律,并利用这些规律为我们创造价值。
示意图
```
  < >       菱形:离群点
———————     最大值
   |
   |
 *****      3/4 分位
 #####
 #####
 *****      2/4 分位
 #####
 #####
 *****      1/4 分位
   |
   |
———————     最小值
```
## Box plot
# IQR is the interquartile range: the distance between the first quartile
# (Q1) and the third quartile (Q3).
# With N = 1.5 * IQR (the usual choice), a value greater than Q3 + N or
# less than Q1 - N is treated as an outlier.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

plt.figure(figsize=(16, 8))
# `hue` names the column used to split each category into sub-groups.
sns.boxplot(x="day", y="total_bill", hue="time", data=tips)

# How to read a box:
#
#      <>        diamond: outlier
#
#   -------      maximum
#      |
#      |
#    *****       3rd quartile
#    #####
#    #####
#    *****       2nd quartile
#    #####
#    #####
#    *****       1st quartile
#      |
#      |
#   -------      minimum
#
tips.head()
## Violin plot (violinplot)
# The wider the violin at a given value, the more often that value occurs.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

plt.figure(figsize=(100, 50))
# font_scale enlarges the text; line width could also be set via `rc`.
sns.set_context('poster', font_scale=3)

plt.subplot(211)
sns.violinplot(x="pclass", y="embarked", hue="sex", data=titanic)

plt.subplot(212)
# split=True draws the two hue levels as halves of the same violin.
sns.violinplot(x="pclass", y="embarked", hue="sex", data=titanic, split=True)

titanic.head()
## Combining categorical plots
# Several plot types are drawn on top of each other on the same axes.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

# font_scale sets the text size; line width could also be set via `rc`.
sns.set_context('poster', font_scale=1)
plt.figure(figsize=(25, 8))

# Violin plot first, then the individual observations on top of it.
sns.violinplot(x="day", y="total_bill", hue="sex", data=tips, split=True)
# alpha controls the transparency of the swarm points.
sns.swarmplot(x="day", y="total_bill", hue="sex", data=tips, color="w", alpha=1)

titanic.head()
## Bar plot
# Bar plots are a good way to show the central tendency of each group.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

plt.figure(figsize=(10, 9))
# "survived" by sex, with one bar per passenger class.
sns.barplot(x="sex", y="survived", hue="class", data=titanic)
## Point plot
# Point plots make differences between groups easier to compare.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

plt.figure(figsize=(18, 9))

plt.subplot(211)
sns.pointplot(x="sex", y="survived", hue="class", data=titanic)

plt.subplot(212)
# palette    - colour per hue level
# markers    - marker shape per hue level
# linestyles - line style per hue level
sns.pointplot(
    x="class",
    y="survived",
    hue="sex",
    data=titanic,
    palette={"male": "g", "female": "m"},
    markers=["*", "o"],
    linestyles=["-", "--"],
)
# print(help(sns.pointplot))
可以在里面画各种图,用kind指出来即可
## Multi-panel categorical plots / factorplot
## Any categorical plot type can be drawn; select it with `kind`.
# NOTE(review): newer seaborn releases renamed factorplot to catplot and
# `size` to `height` - confirm which seaborn version this notebook targets.
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

# plt.figure(figsize = (18,9)) # has no effect: factorplot makes its own figure

# Default kind.
sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips)

# kind="bar": bar plot.
sns.factorplot(x="day", y="total_bill", hue="smoker", data=tips, kind="bar")

# col: split into one panel per value of another column (here "time").
sns.factorplot(x="day", y="total_bill", hue="smoker", col="time", data=tips, kind="swarm")

# size: panel size; aspect: width-to-height ratio of each panel.
sns.factorplot(
    x="time",
    y="total_bill",
    hue="smoker",
    col="day",
    data=tips,
    kind="box",
    size=5,
    aspect=.5,
)
参数 | 含义 | 值 |
---|---|---|
x,y,hue | 数据集变量 | 变量名 |
data | 数据集 | 数据集名 |
row,col | 更多分类变量进行平铺显示 | 变量名 |
col_wrap | 每行的最高平铺数 | 整数 |
estimator | 在每个分类中进行矢量到标量的映射 | 矢量 |
ci | 置信区间 | 浮点数或None |
n_boot | 计算置信区间时使用的引导迭代次数 | 整数 |
units | 采用单元的标识符,用于执行多级引导和重复测量设计 | 数据变量或向量数据 |
order,hue_order | 对应排序列表 | 字符串列表 |
row_order,col_order | 对应排序列表 | 字符串列表 |
kind(可选) | point 默认,bar 柱形图,count 频次,box 箱形图,violin 提琴图,strip 散点,swarm 分散点 | 字符串 |
size | 每个面的高度(英寸) | 标量 |
aspect | 纵横比 | 标量 |
orient | 方向 | "v" / "h" |
color | 颜色 | matplotlib颜色 |
palette | 调色板 | seaborn颜色色板或字典 |
legend | hue的信息面板 | True/False |
legend_out | 是否拓展图像,并将信息框绘制在中心右边 | True/False |
share{x,y} | 共享轴线 | True/False |
## FacetGrid | grids of subplots
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, integrate
from pandas import Categorical

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(map(ord, "categorical")))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

# Create the grid first, then map a plotting function onto it.
g = sns.FacetGrid(tips, col="time")
g.map(plt.hist, "tip")  # plt.hist draws a histogram in each facet

g1 = sns.FacetGrid(tips, col="sex", hue="smoker")
g1.map(plt.scatter, "total_bill", "tip", alpha=.7)
g1.add_legend()  # add the hue legend

g2 = sns.FacetGrid(tips, row="smoker", col="time", margin_titles=True)
# fit_reg toggles the regression line.
g2.map(sns.regplot, "size", "total_bill", color=".1", fit_reg=True, y_jitter=True)

# Layout: facet size and aspect ratio.
g3 = sns.FacetGrid(tips, col="day", size=5, aspect=.5)
g3.map(sns.barplot, "sex", "total_bill")

# Facet order: show the order derived from the data, then replace it
# with an explicit one.
ordered_days = tips.day.value_counts().index
print(ordered_days)
ordered_days = Categorical(['Thur', 'Fri', 'Sun', 'Sat'])
g4 = sns.FacetGrid(tips, row="day", row_order=ordered_days, size=1.7, aspect=4)
g4.map(sns.boxplot, "total_bill")

# Explicit colour for each hue level.
pal = dict(Lunch="seagreen", Dinner="gray")
g5 = sns.FacetGrid(tips, hue="time", palette=pal, size=5)
# s is the marker size.
g5.map(plt.scatter, "total_bill", "tip", s=50, alpha=.7, linewidth=.5, edgecolor='white')
g5.add_legend()

# Different marker shape for each hue level.
g6 = sns.FacetGrid(tips, hue="sex", size=5, hue_kws={"marker": ["^", "v"]})
g6.map(plt.scatter, "total_bill", "tip", s=100, linewidth=.5, edgecolor="white")
g6.add_legend()

# Axes tweaks.
# NOTE(review): the original indentation was lost; only the grid creation
# is assumed to sit inside the `with` block - confirm.
with sns.axes_style("white"):
    g7 = sns.FacetGrid(tips, row="sex", col="smoker", margin_titles=True, size=5)
g7.map(plt.scatter, "total_bill", "tip", color="#334488", edgecolor="white", lw=.5)
# Axis labels (fixed the "Tota bill(US Dollars)" typo in the x label).
g7.set_axis_labels("Total bill (US Dollars)", "Tip")
g7.set(xticks=[10, 30, 50], yticks=[2, 6, 10])  # tick positions
g7.fig.subplots_adjust(wspace=.52, hspace=.52)  # spacing between subplots
# Overall placement of the grid; this call also replaces the wspace/hspace
# values set on the previous line.
g7.fig.subplots_adjust(left=0.125, right=0.5, bottom=0.1, top=0.9, wspace=.02, hspace=.02)

tips.head()
## PairGrid: choose the plot type, add categories, select variables, set colours
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, integrate
from pandas import Categorical

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

# The same scatter plot in every cell of the grid.
g = sns.PairGrid(iris)
g.map(plt.scatter)
# print(help(sns.PairGrid))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, integrate
from pandas import Categorical

sns.set(style="whitegrid", color_codes=True)
np.random.seed(sum(ord(ch) for ch in "categorical"))

titanic = sns.load_dataset("titanic")
tips = sns.load_dataset("tips")
iris = sns.load_dataset("iris")

# Different plot types on and off the diagonal.
g = sns.PairGrid(iris)
g.map_diag(plt.hist)        # diagonal cells
g.map_offdiag(plt.scatter)  # off-diagonal cells
# titanic.head()

# Colour the observations by category.
g1 = sns.PairGrid(iris, hue="species")
g1.map_diag(plt.hist)
g1.map_offdiag(plt.scatter)
g1.add_legend()

# `vars` restricts the grid to the listed features.
g2 = sns.PairGrid(iris, vars=["sepal_length", "sepal_width"], hue="species")
g2.map_diag(plt.hist)
g2.map_offdiag(plt.scatter)
g2.add_legend()

# Explicit palette for the hue levels.
g3 = sns.PairGrid(tips, hue="size", palette="GnBu_d")
g3.map(plt.scatter, s=50, edgecolor="white")
g3.add_legend()
## Heat map | heatmap
# Each value is rendered as a colour; useful for showing degrees of
# correlation.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(0)
sns.set()

plt.figure(figsize=(10, 20))

plt.subplot(311)
uniform_data = np.random.rand(5, 5)  # random 5x5 matrix
print(uniform_data)
# Each matrix entry becomes one coloured cell; the colour depth encodes
# the magnitude of the value.
heatmap = sns.heatmap(uniform_data)

plt.subplot(312)
# vmin / vmax set the value range covered by the colour scale.
heatmap1 = sns.heatmap(uniform_data, vmin=0.2, vmax=0.5)

plt.subplot(313)
normal_data = np.random.randn(5, 5)
print(normal_data)
# center=0: values above and below 0 get different colours.
ax = sns.heatmap(normal_data, center=0)
### Heat map example: the flights data set
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
plt.figure(figsize=(20, 50))

flights = sns.load_dataset("flights")
flights.head()

# Reshape the long-form table into a month x year matrix of passengers.
# Keyword arguments are used because recent pandas versions no longer
# accept positional arguments to DataFrame.pivot; the keyword form works
# on old and new versions alike.
flights = flights.pivot(index="month", columns="year", values="passengers")
print(flights)

plt.subplot(511)
ax0 = sns.heatmap(flights)

plt.subplot(512)
plt.title('test')
# annot=True writes each value into its cell; fmt="d" formats the
# annotations as integers.
ax1 = sns.heatmap(flights, annot=True, fmt="d")

plt.subplot(513)
# plt.title('test')
# linewidths sets the gap between cells.
ax2 = sns.heatmap(flights, linewidths=.5, annot=True, fmt="d")

plt.subplot(514)
ax3 = sns.heatmap(flights, cmap="YlGn")  # explicit colour map

plt.subplot(515)
# cbar=False hides the colour bar (generally not recommended).
ax4 = sns.heatmap(flights, cmap="YlGn", cbar=False)
# print(help(sns.heatmap))