天池数据挖掘比赛-心跳信号分类03-特征工程
摘要:本文为特征工程部分。学习目标:学习时间序列数据的特征预处理方法;学习时间序列特征处理工具 Tsfresh(TimeSeries Fresh)的使用。内容包括:数据预处理(时间序列数据格式处理、加入时间步特征 time)与特征工程(时间序列特征构造、特征筛选、使用 tsfresh 进行时间序列特征处理)。完整代码(含库函数导入)见下文。
·
特征工程
学习目标
学习时间序列数据的特征预处理方法
学习时间序列特征处理工具Tsfresh(TimeSeries Fresh)的使用
数据预处理
时间序列数据格式处理、加入时间步特征time
特征工程
时间序列特征构造、特征筛选、使用tsfresh进行时间序列特征处理
# 库函数导入
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tsfresh as tsf
from tsfresh import extract_features,select_features
from tsfresh.utilities.dataframe_functions import impute
# Load the training set and the test set (testA) from the working directory
data_train = pd.read_csv("train.csv")
data_test = pd.read_csv("testA.csv")
# Report (rows, columns) of each table, one per line
print(data_train.shape, data_test.shape, sep="\n")
(100000, 3)
(20000, 2)
# Preview the first rows of the raw training table
data_train.head()
id | heartbeat_signals | label | |
---|---|---|---|
0 | 0 | 0.9912297987616655,0.9435330436439665,0.764677... | 0.0 |
1 | 1 | 0.9714822034884503,0.9289687459588268,0.572932... | 0.0 |
2 | 2 | 1.0,0.9591487564065292,0.7013782792997189,0.23... | 2.0 |
3 | 3 | 0.9757952826275774,0.9340884687738161,0.659636... | 0.0 |
4 | 4 | 0.0,0.055816398940721094,0.26129357194994196,0... | 2.0 |
# Preview the first rows of the raw test table
data_test.head()
id | heartbeat_signals | |
---|---|---|
0 | 100000 | 0.9915713654170097,1.0,0.6318163407681274,0.13... |
1 | 100001 | 0.6075533139615096,0.5417083883163654,0.340694... |
2 | 100002 | 0.9752726292239277,0.6710965234906665,0.686758... |
3 | 100003 | 0.9956348033996116,0.9170249621481004,0.521096... |
4 | 100004 | 1.0,0.8879490481178918,0.745564725322326,0.531... |
数据预处理
# Reshape the heartbeat signals from wide to long form: each comma-separated
# string becomes one row per sample point, and the position of the point
# inside the string is kept as the time-step column "time".
signal_strings = data_train["heartbeat_signals"]
long_signals = signal_strings.str.split(",", expand=True).stack()
train_heartbeat_df = (
    long_signals.reset_index()  # columns: level_0 (source row), level_1 (position), 0 (value)
    .set_index("level_0")  # index by the source row so it can be joined back later
    .rename_axis(None)  # drop the leftover "level_0" index name
    .rename(columns={"level_1": "time", 0: "heartbeat_signals"})
    .astype({"heartbeat_signals": float})  # split() yields strings; convert to numbers
)
train_heartbeat_df
time | heartbeat_signals | |
---|---|---|
0 | 0 | 0.991230 |
0 | 1 | 0.943533 |
0 | 2 | 0.764677 |
0 | 3 | 0.618571 |
0 | 4 | 0.379632 |
... | ... | ... |
99999 | 200 | 0.000000 |
99999 | 201 | 0.000000 |
99999 | 202 | 0.000000 |
99999 | 203 | 0.000000 |
99999 | 204 | 0.000000 |
20500000 rows × 2 columns
# Keep the label column on its own, then replace the raw signal strings with
# the long-format (time, heartbeat_signals) rows by joining on the row index.
# NOTE(review): data_train_label keeps the row index, not the "id" column;
# the two coincide in the previews shown here — confirm before reusing this
# on data where id differs from the row position.
data_train_label = data_train["label"]
data_train = data_train.drop(columns=["label", "heartbeat_signals"]).join(
    train_heartbeat_df
)
data_train
id | time | heartbeat_signals | |
---|---|---|---|
0 | 0 | 0 | 0.991230 |
0 | 0 | 1 | 0.943533 |
0 | 0 | 2 | 0.764677 |
0 | 0 | 3 | 0.618571 |
0 | 0 | 4 | 0.379632 |
... | ... | ... | ... |
99999 | 99999 | 200 | 0.000000 |
99999 | 99999 | 201 | 0.000000 |
99999 | 99999 | 202 | 0.000000 |
99999 | 99999 | 203 | 0.000000 |
99999 | 99999 | 204 | 0.000000 |
20500000 rows × 3 columns
# Show every long-format row that belongs to the sample with id 1
data_train[data_train["id"]==1]
id | time | heartbeat_signals | |
---|---|---|---|
1 | 1 | 0 | 0.971482 |
1 | 1 | 1 | 0.928969 |
1 | 1 | 2 | 0.572933 |
1 | 1 | 3 | 0.178457 |
1 | 1 | 4 | 0.122962 |
... | ... | ... | ... |
1 | 1 | 200 | 0.000000 |
1 | 1 | 201 | 0.000000 |
1 | 1 | 202 | 0.000000 |
1 | 1 | 203 | 0.000000 |
1 | 1 | 204 | 0.000000 |
205 rows × 3 columns
使用tsfresh进行时间序列特征处理
特征抽取:Tsfresh 可以自动计算大量的时间序列特征。
from tsfresh import extract_features
# Feature extraction: rows are grouped by "id" and ordered by "time";
# the result has one row of features per id.
train_features = extract_features(
    data_train,
    column_id="id",
    column_sort="time",
)
train_features
Feature Extraction: 100%|██████████| 40/40 [1:12:34<00:00, 108.85s/it]
heartbeat_signals__variance_larger_than_standard_deviation | heartbeat_signals__has_duplicate_max | heartbeat_signals__has_duplicate_min | heartbeat_signals__has_duplicate | heartbeat_signals__sum_values | heartbeat_signals__abs_energy | heartbeat_signals__mean_abs_change | heartbeat_signals__mean_change | heartbeat_signals__mean_second_derivative_central | heartbeat_signals__median | ... | heartbeat_signals__permutation_entropy__dimension_5__tau_1 | heartbeat_signals__permutation_entropy__dimension_6__tau_1 | heartbeat_signals__permutation_entropy__dimension_7__tau_1 | heartbeat_signals__query_similarity_count__query_None__threshold_0.0 | heartbeat_signals__matrix_profile__feature_"min"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"max"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"mean"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"median"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"25"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"75"__threshold_0.98 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 1.0 | 1.0 | 38.927945 | 18.216197 | 0.019894 | -0.004859 | 0.000117 | 0.125531 | ... | 2.184420 | 2.500658 | 2.722686 | NaN | 6.445546 | 12.165525 | 10.246524 | 10.746992 | 8.388625 | 11.484910 |
1 | 0.0 | 0.0 | 1.0 | 1.0 | 19.445634 | 7.705092 | 0.019952 | -0.004762 | 0.000105 | 0.030481 | ... | 2.710933 | 3.065802 | 3.224835 | NaN | 3.209140 | 12.649111 | 9.031069 | 9.437545 | 6.723180 | 12.094899 |
2 | 0.0 | 0.0 | 1.0 | 1.0 | 21.192974 | 9.140423 | 0.009863 | -0.004902 | 0.000101 | 0.000000 | ... | 1.263370 | 1.406001 | 1.509478 | NaN | 3.054539 | 8.246211 | 7.370478 | 8.246211 | 5.966122 | 8.246211 |
3 | 0.0 | 0.0 | 1.0 | 1.0 | 42.113066 | 15.757623 | 0.018743 | -0.004783 | 0.000103 | 0.241397 | ... | 2.986728 | 3.534354 | 3.854177 | NaN | 3.010557 | 9.797959 | 6.331360 | 6.406440 | 5.266743 | 7.091706 |
4 | 0.0 | 0.0 | 1.0 | 1.0 | 69.756786 | 51.229616 | 0.014514 | 0.000000 | -0.000137 | 0.000000 | ... | 1.914511 | 2.165627 | 2.323993 | NaN | 9.181236 | 13.429784 | 9.959913 | 9.516290 | 9.286013 | 10.270925 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 0.0 | 0.0 | 1.0 | 1.0 | 63.323449 | 28.742238 | 0.023588 | -0.004902 | 0.000794 | 0.388402 | ... | 2.873602 | 3.391830 | 3.679969 | NaN | 2.436377 | 9.591663 | 5.635231 | 6.366205 | 3.596982 | 7.033638 |
99996 | 0.0 | 0.0 | 1.0 | 1.0 | 69.657534 | 31.866323 | 0.017373 | -0.004543 | 0.000051 | 0.421138 | ... | 3.085504 | 3.728881 | 4.095457 | NaN | 1.415410 | 7.483315 | 2.893592 | 2.684349 | 2.049241 | 3.334109 |
99997 | 0.0 | 0.0 | 1.0 | 1.0 | 40.897057 | 16.412857 | 0.019470 | -0.004538 | 0.000834 | 0.213306 | ... | 2.601062 | 2.996962 | 3.293562 | NaN | 5.748652 | 12.165525 | 8.524637 | 7.983410 | 7.062217 | 10.081756 |
99998 | 0.0 | 0.0 | 1.0 | 1.0 | 42.333303 | 14.281281 | 0.017032 | -0.004902 | 0.000013 | 0.264974 | ... | 3.236950 | 3.793512 | 4.018302 | NaN | 2.346822 | 8.246211 | 4.951374 | 4.727535 | 4.069786 | 5.615282 |
99999 | 0.0 | 0.0 | 1.0 | 1.0 | 53.290117 | 21.637471 | 0.021870 | -0.004539 | 0.000023 | 0.320124 | ... | 2.949266 | 3.462549 | 3.688612 | NaN | 1.959139 | 9.380832 | 4.573691 | 3.908621 | 3.094614 | 5.916164 |
100000 rows × 787 columns
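特征选择:train_features 中包含了 heartbeat_signals 的 787 个常见时间序列特征,其中部分特征值可能为 NaN(例如无法计算的特征)。可使用以下方式对 NaN 进行填充;注意 impute 并不会删除行或列,而是原地将 NaN 替换为该列的中位数(整列均无有效值时填 0),并将 ±inf 替换为该列的最大/最小值。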
from tsfresh.utilities.dataframe_functions import impute
# Fill NaN/inf values in the extracted features. impute() modifies
# train_features in place and does not drop rows or columns; as the last
# expression of the cell its return value is what gets displayed.
impute(train_features)
heartbeat_signals__variance_larger_than_standard_deviation | heartbeat_signals__has_duplicate_max | heartbeat_signals__has_duplicate_min | heartbeat_signals__has_duplicate | heartbeat_signals__sum_values | heartbeat_signals__abs_energy | heartbeat_signals__mean_abs_change | heartbeat_signals__mean_change | heartbeat_signals__mean_second_derivative_central | heartbeat_signals__median | ... | heartbeat_signals__permutation_entropy__dimension_5__tau_1 | heartbeat_signals__permutation_entropy__dimension_6__tau_1 | heartbeat_signals__permutation_entropy__dimension_7__tau_1 | heartbeat_signals__query_similarity_count__query_None__threshold_0.0 | heartbeat_signals__matrix_profile__feature_"min"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"max"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"mean"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"median"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"25"__threshold_0.98 | heartbeat_signals__matrix_profile__feature_"75"__threshold_0.98 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 1.0 | 1.0 | 38.927945 | 18.216197 | 0.019894 | -0.004859 | 0.000117 | 0.125531 | ... | 2.184420 | 2.500658 | 2.722686 | 0.0 | 6.445546 | 12.165525 | 10.246524 | 10.746992 | 8.388625 | 11.484910 |
1 | 0.0 | 0.0 | 1.0 | 1.0 | 19.445634 | 7.705092 | 0.019952 | -0.004762 | 0.000105 | 0.030481 | ... | 2.710933 | 3.065802 | 3.224835 | 0.0 | 3.209140 | 12.649111 | 9.031069 | 9.437545 | 6.723180 | 12.094899 |
2 | 0.0 | 0.0 | 1.0 | 1.0 | 21.192974 | 9.140423 | 0.009863 | -0.004902 | 0.000101 | 0.000000 | ... | 1.263370 | 1.406001 | 1.509478 | 0.0 | 3.054539 | 8.246211 | 7.370478 | 8.246211 | 5.966122 | 8.246211 |
3 | 0.0 | 0.0 | 1.0 | 1.0 | 42.113066 | 15.757623 | 0.018743 | -0.004783 | 0.000103 | 0.241397 | ... | 2.986728 | 3.534354 | 3.854177 | 0.0 | 3.010557 | 9.797959 | 6.331360 | 6.406440 | 5.266743 | 7.091706 |
4 | 0.0 | 0.0 | 1.0 | 1.0 | 69.756786 | 51.229616 | 0.014514 | 0.000000 | -0.000137 | 0.000000 | ... | 1.914511 | 2.165627 | 2.323993 | 0.0 | 9.181236 | 13.429784 | 9.959913 | 9.516290 | 9.286013 | 10.270925 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 0.0 | 0.0 | 1.0 | 1.0 | 63.323449 | 28.742238 | 0.023588 | -0.004902 | 0.000794 | 0.388402 | ... | 2.873602 | 3.391830 | 3.679969 | 0.0 | 2.436377 | 9.591663 | 5.635231 | 6.366205 | 3.596982 | 7.033638 |
99996 | 0.0 | 0.0 | 1.0 | 1.0 | 69.657534 | 31.866323 | 0.017373 | -0.004543 | 0.000051 | 0.421138 | ... | 3.085504 | 3.728881 | 4.095457 | 0.0 | 1.415410 | 7.483315 | 2.893592 | 2.684349 | 2.049241 | 3.334109 |
99997 | 0.0 | 0.0 | 1.0 | 1.0 | 40.897057 | 16.412857 | 0.019470 | -0.004538 | 0.000834 | 0.213306 | ... | 2.601062 | 2.996962 | 3.293562 | 0.0 | 5.748652 | 12.165525 | 8.524637 | 7.983410 | 7.062217 | 10.081756 |
99998 | 0.0 | 0.0 | 1.0 | 1.0 | 42.333303 | 14.281281 | 0.017032 | -0.004902 | 0.000013 | 0.264974 | ... | 3.236950 | 3.793512 | 4.018302 | 0.0 | 2.346822 | 8.246211 | 4.951374 | 4.727535 | 4.069786 | 5.615282 |
99999 | 0.0 | 0.0 | 1.0 | 1.0 | 53.290117 | 21.637471 | 0.021870 | -0.004539 | 0.000023 | 0.320124 | ... | 2.949266 | 3.462549 | 3.688612 | 0.0 | 1.959139 | 9.380832 | 4.573691 | 3.908621 | 3.094614 | 5.916164 |
100000 rows × 787 columns
按照特征和响应变量之间的相关性进行特征选择:首先对每个特征单独进行假设检验,计算其与响应变量相关性的显著性(p 值),然后利用 Benjamini-Yekutieli 过程(用于控制错误发现率)决定保留哪些特征。
from tsfresh import select_features
# Keep only the features that are statistically relevant to the label.
# NOTE(review): the labels are shown as floats (0.0, 2.0, ...); with the
# default ml_task="auto" tsfresh presumably treats a non-integer target as
# regression — confirm whether classification with multiclass=True was meant.
train_features_filtered = select_features(
    train_features,
    data_train_label,
)
train_features_filtered
heartbeat_signals__sum_values | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_38 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_37 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_36 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_35 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_34 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_33 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_32 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_31 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_30 | ... | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_84 | heartbeat_signals__fft_coefficient__attr_"imag"__coeff_97 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_90 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_94 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_92 | heartbeat_signals__fft_coefficient__attr_"real"__coeff_97 | heartbeat_signals__fft_coefficient__attr_"abs"__coeff_75 | heartbeat_signals__fft_coefficient__attr_"real"__coeff_88 | heartbeat_signals__fft_coefficient__attr_"real"__coeff_92 | heartbeat_signals__fft_coefficient__attr_"real"__coeff_83 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 38.927945 | 0.660949 | 1.090709 | 0.848728 | 1.168685 | 0.982133 | 1.223496 | 1.236300 | 1.104172 | 1.497129 | ... | 0.531883 | -0.047438 | 0.554370 | 0.307586 | 0.564596 | 0.562960 | 0.591859 | 0.504124 | 0.528450 | 0.473568 |
1 | 19.445634 | 1.718217 | 1.280923 | 1.850706 | 1.460752 | 1.924501 | 1.925485 | 1.715938 | 2.079957 | 1.818636 | ... | 0.563590 | -0.109579 | 0.697446 | 0.398073 | 0.640969 | 0.270192 | 0.224925 | 0.645082 | 0.635135 | 0.297325 |
2 | 21.192974 | 1.814281 | 1.619051 | 1.215343 | 1.787166 | 2.146987 | 1.686190 | 1.540137 | 2.291031 | 2.403422 | ... | 0.712487 | -0.074042 | 0.321703 | 0.390386 | 0.716929 | 0.316524 | 0.422077 | 0.722742 | 0.680590 | 0.383754 |
3 | 42.113066 | 2.109550 | 0.619634 | 2.366413 | 2.071539 | 1.000340 | 2.728281 | 1.391727 | 2.017176 | 2.610492 | ... | 0.601499 | -0.184248 | 0.564669 | 0.623353 | 0.466980 | 0.651774 | 0.308915 | 0.550097 | 0.466904 | 0.494024 |
4 | 69.756786 | 0.194549 | 0.348882 | 0.092119 | 0.653924 | 0.231422 | 1.080003 | 0.711244 | 1.357904 | 1.237998 | ... | 0.015292 | 0.070505 | 0.065835 | 0.051780 | 0.092940 | 0.103773 | 0.179405 | -0.089611 | 0.091841 | 0.056867 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
99995 | 63.323449 | 0.840651 | 1.186210 | 1.396236 | 0.417221 | 2.036034 | 1.659054 | 0.500584 | 1.693545 | 0.859932 | ... | 0.779955 | 0.005525 | 0.486013 | 0.273372 | 0.705386 | 0.602898 | 0.447929 | 0.474844 | 0.564266 | 0.133969 |
99996 | 69.657534 | 1.557787 | 1.393960 | 0.989147 | 1.611333 | 1.793044 | 1.092325 | 0.507138 | 1.763940 | 2.677643 | ... | 0.539489 | 0.114670 | 0.579498 | 0.417226 | 0.270110 | 0.556596 | 0.703258 | 0.462312 | 0.269719 | 0.539236 |
99997 | 40.897057 | 0.469758 | 1.000355 | 0.706395 | 1.190514 | 0.674603 | 1.632769 | 0.229008 | 2.027802 | 0.302457 | ... | 0.282597 | -0.474629 | 0.460647 | 0.478341 | 0.527891 | 0.904111 | 0.728529 | 0.178410 | 0.500813 | 0.773985 |
99998 | 42.333303 | 0.992948 | 1.354894 | 2.238589 | 1.237608 | 1.325212 | 2.785515 | 1.918571 | 0.814167 | 2.613950 | ... | 0.594252 | -0.162106 | 0.694276 | 0.681025 | 0.357196 | 0.498088 | 0.433297 | 0.406154 | 0.324771 | 0.340727 |
99999 | 53.290117 | 1.624625 | 1.739088 | 2.936555 | 0.154759 | 2.921164 | 2.183932 | 1.485150 | 2.685922 | 0.583443 | ... | 0.463697 | 0.289364 | 0.285321 | 0.422103 | 0.692009 | 0.276236 | 0.245780 | 0.269519 | 0.681719 | -0.053993 |
100000 rows × 707 columns
更多推荐
所有评论(0)