数据探索性分析

EDA(Exploratory Data Analysis,探索性数据分析)是指对数据进行探索性的分析,为后续的数据预处理和特征工程提供必要的结论。
通常使用 pandas 库以及 matplotlib、seaborn 等可视化工具即可完成。
主要的步骤是:

  1. 理解问题
  2. 读取数据
  3. 单变量探索
  4. 多变量探索
  5. 数据预处理
  6. 建立假设,并检验

代码示例

载入库和数据

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import missingno as msno

## Load the training set and the test set (both files are space-delimited).
Train_data = pd.read_csv("train.csv", delimiter=" ")
Test_data = pd.read_csv("testA.csv", delimiter=" ")

观察数据

# Preview the leading and trailing rows of each data set.
# `DataFrame.append` was removed in pandas 2.0, so the two slices are
# stacked with `pd.concat` instead.
pd.concat([Train_data.head(), Train_data.tail()])
pd.concat([Test_data.head(), Test_data.tail()])

数字特征分析

## 1) Correlation analysis
# NOTE(review): `numeric_features` is not defined in this snippet; it is
# presumably a list of numeric column names that includes 'price'.
price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
# Rank every numeric feature by its correlation with the target column.
print(correlation['price'].sort_values(ascending=False), '\n')

f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)
# Remove the target column so only feature columns remain in price_numeric.
del price_numeric['price']


## 2) Skewness and kurtosis of each numeric feature
for feature in numeric_features:
    series = Train_data[feature]
    # One line per feature: padded name, then zero-padded skew and kurtosis.
    print(
        f'{feature:15}',
        f'Skewness: {series.skew():05.2f}',
        '   ',
        f'Kurtosis: {series.kurt():06.2f}',
    )


## 3) Distribution of each numeric feature
# Reshape to long form ("variable" / "value" columns) so that FacetGrid can
# draw one panel per feature.
f = pd.melt(Train_data, value_vars=numeric_features)
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False)
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is the replacement for the same histogram-plus-curve view.
g = g.map(sns.histplot, "value", kde=True, stat="density")

## 4) Pairwise relationships between numeric features
sns.set()
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
# seaborn renamed the `size` keyword to `height`; the old name is no longer
# accepted by current releases.
sns.pairplot(Train_data[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()

类别特征分析

## 1) Number of distinct values per categorical feature
for column in categorical_features:
    distinct_count = Train_data[column].nunique()
    print(distinct_count)
# Bare expression: echoes the feature list when run as a notebook cell.
categorical_features


## 2) Box plots of the categorical features
# `name` and `regionCode` have too many sparse categories, so only the
# denser categorical columns are plotted.
categorical_features = [
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'notRepairedDamage',
]
for c in categorical_features:
    Train_data[c] = Train_data[c].astype('category')
    # Missing values get an explicit 'MISSING' category. The category has to
    # be registered before it can be used as a fill value.
    if Train_data[c].isnull().any():
        Train_data[c] = Train_data[c].cat.add_categories(['MISSING'])
        Train_data[c] = Train_data[c].fillna('MISSING')
                
def boxplot(x, y, **kwargs):
    """Draw a seaborn box plot of ``y`` grouped by ``x``.

    Used as the plotting callback for ``FacetGrid.map``. Extra keyword
    arguments are accepted but not forwarded. The x tick labels are rotated
    by 90 degrees.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)
       
# Long-form frame: one row per (price, feature name, feature value).
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# seaborn renamed FacetGrid's `size` keyword to `height`; the old name is no
# longer accepted by current releases.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")
Train_data.columns

## 3) Violin plots of the categorical features
catg_list = categorical_features
target = 'price'
for catg in catg_list:
    sns.violinplot(x=catg, y=target, data=Train_data)
    # Show inside the loop so each feature is rendered as its own figure.
    plt.show()

# Categorical columns used by the plots below.
categorical_features = [
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'notRepairedDamage',
]

## 4) Bar plots of the categorical features
def bar_plot(x, y, **kwargs):
    """Draw a seaborn bar plot of ``y`` against ``x``.

    Used as the plotting callback for ``FacetGrid.map``. Extra keyword
    arguments are accepted but not forwarded. The x tick labels are rotated
    by 90 degrees.
    """
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=90)
        
# Long-form frame: one row per (price, feature name, feature value).
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# seaborn renamed FacetGrid's `size` keyword to `height`; the old name is no
# longer accepted by current releases.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")

## 5) Frequency of each category per categorical feature (count plot)
def count_plot(x, **kwargs):
    """Draw a seaborn count plot of the values in ``x``.

    Used as the plotting callback for ``FacetGrid.map``. Extra keyword
    arguments are accepted but not forwarded. The x tick labels are rotated
    by 90 degrees.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)
# Long-form frame: one row per (feature name, feature value).
f = pd.melt(Train_data, value_vars=categorical_features)
# seaborn renamed FacetGrid's `size` keyword to `height`; the old name is no
# longer accepted by current releases.
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")
Logo

永洪科技,致力于打造全球领先的数据技术厂商,具备从数据应用方案咨询、BI、AIGC智能分析、数字孪生、数据资产、数据治理、数据实施的端到端大数据价值服务能力。

更多推荐