【python+miniconda+jupyter】数据分析全过程：数据处理、数据分布及相关性分析

摘要：一、读入excel。`import pandas as pd`；`import numpy as np`；设置画图风格与图片中文字体：`from matplotlib import pyplot as plt`，`plt.style.use("ggplot")`，`plt.rcParams['font.sans-serif'] = ['SimHei']`，`plt.rcParams['axes.unicode_minus'] = False` ……

如果我是温帅帅

2059人浏览 · 2022-04-02 14:52:37

如果我是温帅帅 · 2022-04-02 14:52:37 发布

一、读入excel

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Plot styling: apply the ggplot theme first, then override the font
# settings so Chinese text renders in figures (SimHei) and the Unicode
# minus sign is disabled on the axes.
plt.style.use("ggplot")
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# pandas display options: show all columns, show all rows, and raise the
# displayed value width to 100 characters (the default is 50).
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

# Load the weekly-report base data, one worksheet per DataFrame.
df_huoyue = pd.read_excel('C:\\Users\\XXX.xlsx', sheet_name='XX')
df_hexin = pd.read_excel('C:\\Users\\XXX.xlsx', sheet_name='XX')
df_chanpin = pd.read_excel('C:\\Users\\XXX', sheet_name='XX ')

如果Excel文件只有一个工作表（或只需读取第一个工作表），可以省略 sheet_name 参数，此时默认读取第一个工作表。

二、数据处理

# Replace every missing value (NaN) with 0 in each of the three frames.
df_huoyue, df_hexin, df_chanpin = [
    frame.fillna(0) for frame in (df_huoyue, df_hexin, df_chanpin)
]
print(1)

把缺失值（NaN）填充为0


# Old header -> new header for the df_hexin columns.
hexin_column_names = {
    'Unnamed: 0': '周',
    '核心指标': '销量',
    'Unnamed: 2': '保有量',
    'Unnamed: 3': '非货销量',
    'Unnamed: 5': '非货保有量',
}
# inplace=True modifies df_hexin itself instead of returning a new frame.
df_hexin.rename(columns=hexin_column_names, inplace=True)
df_hexin.head()

重命名列名；inplace=True 表示直接在原 DataFrame 上修改，而不是返回一个新的对象。

三、多个dataframe合并

import pandas as pd
from functools import reduce


def _left_join_on_week(left, right):
    """Left-join *right* onto *left* using the shared '周' column."""
    return pd.merge(left, right, on='周', how='left')


# Two-frame equivalent of one step:
#   pd.merge(df_hexin, df_huoyue, on='周', how='left')
# NOTE(review): df_yonghu is not created in the loading step shown earlier;
# it has to be read in before this cell runs.
dfs = [df_huoyue, df_hexin, df_chanpin, df_yonghu]
# Fold the list left to right, so df_huoyue is the base of every join.
df_final = reduce(_left_join_on_week, dfs)

四、筛选列

# Columns kept for the analysis, in the order they should appear.
selected_columns = [
    '周',
    '活跃客户数',
    '非货保有量',
    '非货销量',
    '销量',
    '保有量',
    '交易用户数',
    '人均交易额',
    '现金宝销量',
    '货币基金销量',
    '基金组合销量',
    '投顾产品销量',
    '股票基金销量',
    '债券基金销量',
    '定投金额',
    '新增定投数',
    '注册用户',
    '开户用户',
    '首购用户',
]
df_all_new = df_final[selected_columns]

五、数据拆分

# Split df_all_new into three row ranges by index label. For each range the
# '周' column is saved separately and then removed from the frame.
#
# drop() is called without inplace=True and the result is re-bound: each
# range is a slice of df_all_new, and dropping in place on a slice raises
# SettingWithCopyWarning and leaves it unclear whether a view or a copy
# was modified.
#
# NOTE(review): .loc slicing includes both end labels, so row 57 appears in
# both df_zuhe and df_tougu_start, and row 61 appears in both
# df_tougu_start and df_tougu. Confirm that this overlap is intended.
df_zuhe = df_all_new.loc[0:57]
df_zuhetime = df_zuhe['周']
df_zuhe = df_zuhe.drop(columns='周')

df_tougu_start = df_all_new.loc[57:61]
df_tougutime_start = df_tougu_start['周']
df_tougu_start = df_tougu_start.drop(columns='周')

df_tougu = df_all_new.loc[61:]
df_tougutime = df_tougu['周']
df_tougu = df_tougu.drop(columns='周')
print(1)

六、数据分布

# Summary statistics from describe() for the two frames being compared.
# NOTE(review): df_tougu_new and df_zuhe_new are not created in the steps
# shown earlier (only df_tougu and df_zuhe are); confirm where they come
# from.
tougu_fenbu, zuhe_fenbu = df_tougu_new.describe(), df_zuhe_new.describe()
tougu_fenbu
zuhe_fenbu

数据分布分析

print("均值上升的特征")
# Walk the columns by position and report those whose mean is higher in
# tougu_fenbu than in zuhe_fenbu. Row position 1 of the describe() output
# is read as the mean.
for col_pos in range(zuhe_fenbu.shape[1]):
    new_mean = tougu_fenbu.iloc[1, col_pos]
    old_mean = zuhe_fenbu.iloc[1, col_pos]
    if not new_mean > old_mean:
        continue

    print(col_pos, tougu_fenbu.columns[col_pos])
    print('原均值', round(old_mean, 2), '现均值', round(new_mean, 2))
    print('增加了', round(new_mean - old_mean, 2))

根据均值±1个标准差确定数据的正常范围

print("数值范围")
# Features to report. Named key_features rather than `list` so the builtin
# list type is not shadowed for the rest of the notebook.
key_features = ["活跃客户数", "销量", "注册用户", "开户用户", "首购用户"]

# Fetch the statistic rows by label rather than by position: the position
# of 'std' in describe() output depends on the column dtypes, and a missing
# label fails with KeyError instead of silently reading another row.
mean_row = tougu_fenbu.loc['mean']
std_row = tougu_fenbu.loc['std']

for col_pos, feature in enumerate(tougu_fenbu.columns):
    if feature not in key_features:
        continue
    mean = mean_row.iloc[col_pos]
    std = std_row.iloc[col_pos]
    # Print the range as mean - 1 std to mean + 1 std.
    print(feature, round(mean - std, 2), round(mean + std, 2))

七、判断特征分布

from scipy import stats

# Shapiro-Wilk normality test on the '首购用户' column.
first_purchase_users = df_zuhe_new['首购用户']
stats.shapiro(first_purchase_users)

八、数据相关性分析

import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap of df_zuhe_new on a 12x12 figure, with the
# coefficient written in each cell.
# (The original used `//` here, which is not a Python comment and made the
# cell a SyntaxError.)
figure, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df_zuhe_new.corr(), square=True, annot=True, ax=ax)

# Line plot of '活跃客户数' over time. A new figure is opened first so the
# line is not drawn onto the heatmap axes when both run in the same cell.
plt.figure()
plt.plot(df_tougu_new['Unnamed: 0'], df_tougu_new['活跃客户数'])