1. 统一变量统计格式

1.1 读入数据

# 导入程序包
import pandas as pd
import numpy as np

import re
import sys
import os
project_path = os.getcwd()

df_model=pd.read_excel(project_path+'/data/processed_data/df_11.4_人工删除无意义列.xlsx')
if 'Unnamed: 0' in df_model.columns:
    df_model = df_model.drop(['Unnamed: 0'], axis=1)

# 删除id、时间变量
df_model=df_model.drop(['patient_id','case_no','drug_name','start_datetime','end_datetime','bmd_time'],axis=1)

df_model.columns

# 选取变量
# discrete_col = ['日剂量','gender', '基础疾病','情感稳定剂','抗精神病药','抗焦虑药','安眠药','行为干预治疗']
discrete_col = ['日剂量','gender','抗精神病药','抗焦虑药','行为干预治疗']
continuous_col=[x for x in df_model.columns if x not in discrete_col]

1.2 分类变量统计

# 求分类变量比例
df_discrete_stat=pd.DataFrame(columns=['变量名称','所有事件(%d)' % df_model.shape[0],'缺失率(%)'])
for i in discrete_col:
    print(i)
    # 缺失率
    if df_model[i].isnull().sum()==0:
        miss_rate='0%'
    else:
        miss_rate=df_model[i].isnull().sum()/df_model.shape[0]
        miss_rate="%.2f%%" % (miss_rate * 100)      # 百分数输出
    df_discrete_stat.loc[df_discrete_stat.shape[0],['变量名称','缺失率(%)']]=[i+',n(%)',miss_rate]
    
    # 分类变量单独统计
    name_list=[]
    num_perc_list=[]
    df_model_stat=df_model[df_model[i].notnull()].sort_values([i],ascending=True)
    
    # 二分类还是多分类变量
    if df_model_stat[i].nunique() <=2:
        if re.match('gender|性别',i):
            name_list=['男','女']
        else:
            name_list=['是','否']
        for name,value in zip(name_list,[1,0]):
            print(name)
            num=df_model_stat[df_model_stat[i]==value].shape[0]
            percent=num/df_model.shape[0]
            percent="%.2f%%" % (percent * 100)
            num_percent=str(num)+'('+percent+')'
            num_perc_list.append(num_percent)
    else:
        for value in sorted(df_model_stat[i].unique()):
            print(value)
            name_list.append(value)
            num=df_model_stat[df_model_stat[i]==value].shape[0]
            percent=num/df_model.shape[0]
            percent="%.2f%%" % (percent * 100)
            num_percent=str(num)+'('+percent+')'
            num_perc_list.append(num_percent)
        
        
    df_temp = pd.DataFrame(data={'变量名称':name_list,
                                 '所有事件(%d)' % df_model.shape[0]:num_perc_list})

    df_discrete_stat=pd.concat([df_discrete_stat,df_temp],axis=0)
    df_discrete_stat=df_discrete_stat.reset_index(drop=True)

1.3 连续变量统计

# 统计全变量体系各变量的平均数、上下四分位数、缺失率
feature_quarter_list=[]
mean_quarter_list=[]
feature_std_list=[]
mean_std_list=[]
miss_list=[]
for i in continuous_col:
    # 计算上下四分位、均值、标准差
    try:
        data = df_model[i].astype('float')
        stat_result = pd.DataFrame(data.describe())
        mean_value=stat_result.loc['mean',i]
        up_quarter=stat_result.loc['25%',i]
        down_quarter=stat_result.loc['75%',i]
        std_value=stat_result.loc['std',i]
    except:
        mean_value=np.nan
        up_quarter=np.nan
        down_quarter=np.nan
    # 计算缺失率
    if df_model[i].isnull().sum()==0:
        miss_rate='0%'
    else:
        miss_rate=df_model[i].isnull().sum()/df_model.shape[0]
        miss_rate="%.2f%%" % (miss_rate * 100)      # 百分数输出
    miss_list.append(miss_rate)
    # mean(quarter)
    feature_quarter_list.append(i+',mean(IQR)')
    mean_quarter_list.append('%.2f(%.2f-%.2f)' % (mean_value,up_quarter,down_quarter))
    # mean(std)
    feature_std_list.append(i+',mean±std')
    mean_std_list.append('%.2f±%.2f' % (mean_value,std_value))
df_continuous_quarter=pd.DataFrame(data={'特征':feature_quarter_list,
                        'mean_quarter_list':mean_quarter_list,
                        'miss_list':miss_list})
df_continuous_std=pd.DataFrame(data={'特征':feature_std_list,
                        'mean_std_list':mean_std_list,
                        'miss_list':miss_list})
df_continous_stat=pd.concat([df_continuous_quarter,df_continuous_std],axis=1)
# 保存连续变量统计
df_continous_stat.to_excel(project_path+'/data/stat/df_2.2_连续变量统计_importance.xlsx')

于鹏-提取上下四分位代码

## 输出上下四分位
def desc_IQR(data): 
    # 提取describe 
    df_desc = data.describe().round(2).copy() 
    df_desc = df_desc[df_desc.index.isin(['25%','50%','75%'])] 
    df_desc = df_desc.astype('str') 
    # 拼凑格式 
    df_result = df_desc.loc['50%',df_desc.columns] + 
    '(' + df_desc.loc['25%',df_desc.columns] + '-' +
    df_desc.loc['75%',df_desc.columns] + ')' 
    result = df_result.to_frame(name='IQR').reset_index().rename(columns={'index':'variables'}) 
    return result 

Logo

永洪科技,致力于打造全球领先的数据技术厂商,具备从数据应用方案咨询、BI、AIGC智能分析、数字孪生、数据资产、数据治理、数据实施的端到端大数据价值服务能力。

更多推荐