一、数据降维了解
二、数据降维作用
三、数据降维方法
## 代码示例(缺失值比率)
def na_count(data):
    """Per-column missing-value statistics.

    Returns a DataFrame indexed by column name with three unnamed columns:
    0 = non-missing count, 1 = missing count, 2 = missing ratio.
    """
    data_count = data.count()
    missing = len(data) - data_count
    missing_rate = missing / len(data)
    return pd.concat([data_count, missing, missing_rate], axis=1)

def miss_data_handle(data, threshold=0.8):
    """Drop columns whose missing ratio is >= ``threshold`` (default 0.8).

    Fix: the original assigned ``data[col].dropna(axis=1)`` back to the
    column — a Series has no axis 1, and a column must be removed from the
    frame, not from itself. The columns are now dropped in one pass.
    """
    row_length = len(data)
    drop_cols = [col for col in data.columns
                 if data[col].isnull().sum() / row_length >= threshold]
    return data.drop(columns=drop_cols)
## 代码示例(随机森林/组合树)
target_col = 'IS_SUCCESS'      # response variable
ipt_col = list(data.columns)   # feature columns (rebuilt inside feture_extracted)

def data_sample(data, col=target_col, smp=3):
    """Down-sample the majority (0) class to ``smp`` times the minority (1) class.

    Raises ValueError (from ``sample``) when the majority class has fewer
    rows than ``len(minority) * smp``.
    """
    data_1 = data[data[col] == 1].sample(frac=1)
    data_0 = data[data[col] == 0].sample(n=len(data_1) * smp)
    # drop=True: otherwise reset_index adds the old index as an 'index' column
    return pd.concat([data_1, data_0]).reset_index(drop=True)

def train_test_spl(data):
    """70/30 train-test split on the module-level feature/target columns."""
    X_train, X_test, y_train, y_test = train_test_split(
        data[ipt_col], data[target_col], test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

def feture_extracted(data):
    """Pick the best 1:N down-sampling ratio by F1 score, then return the
    features whose cumulative importance under that model reaches alpha."""
    global ipt_col
    ipt_col = list(data.columns)
    ipt_col.remove(target_col)
    sample_present = [1, 2, 4, 6, 8, 10, 12, 15]  # candidate 0:1 ratios
    alpha = 0.9                                   # cumulative-importance cutoff
    f1_score_list = []
    model_dict = {}
    for i in sample_present:
        try:
            # Fix: sample from the ORIGINAL data each round; the original code
            # resampled its own shrinking sample, so larger ratios always failed.
            sampled = data_sample(data, col=target_col, smp=i)
        except ValueError:
            break  # not enough majority rows for this ratio
        X_train, X_test, y_train, y_test = train_test_spl(sampled)
        model = RandomForestClassifier().fit(X_train, y_train)
        f1_score_list.append(metrics.f1_score(y_test, model.predict(X_test)))
        model_dict[i] = model
    max_f1_index = f1_score_list.index(max(f1_score_list))
    best_ratio = sample_present[max_f1_index]
    print('最优的抽样比例是:1:', best_ratio)
    d = dict(zip(ipt_col,
                 [float('%.3f' % v)
                  for v in model_dict[best_ratio].feature_importances_]))
    f = zip(d.values(), d.keys())
    importance_df = pd.DataFrame(sorted(f, reverse=True),
                                 columns=['importance', 'feture_name'])
    list_imp = np.cumsum(importance_df['importance']).tolist()
    for i, j in enumerate(list_imp):
        if j >= alpha:
            break
    print('大于alpha的特征及重要性如下:\n', importance_df.iloc[0:i + 1, :])
    print('其特征如下:')
    feture_selected = importance_df.iloc[0:i + 1, 1].tolist()
    print(feture_selected)
    return feture_selected
## 代码示例(低方差滤波)
var = data.var()
numeric = data.columns
# Keep the NAMES of columns whose variance meets the threshold.
# Fix: the original appended data[i+1] — a positional label shifted by one,
# which selects the wrong column (or raises KeyError) instead of recording
# which columns survive the filter.
variable = []
for col in var.index:
    if var[col] >= 10:  # variance threshold
        variable.append(col)
## 代码示例(高相关滤波)
k = 0.8  # 配置参数

def data_corr_analysis(data, sigmod=k):
    """Report variable pairs whose absolute correlation exceeds ``sigmod``.

    Builds the correlation matrix, zeroes its diagonal so self-correlation is
    never reported, and returns a DataFrame with both orderings of each
    high-correlation pair: columns VAR1, VAR2, CORR_XISHU.
    """
    corr_data = data.corr()
    # remove the trivial corr(x, x) == 1 entries
    for pos in range(len(corr_data)):
        corr_data.iloc[pos, pos] = 0
    records = []
    for row in corr_data.index:
        for col in corr_data.columns:
            coef = corr_data.loc[row, col]
            if abs(coef) > sigmod:
                records.append([row, col, coef])
    return pd.DataFrame(records, columns=['VAR1', 'VAR2', 'CORR_XISHU'])
主成分变换对变量的尺度敏感:不同量纲的变量会主导方差方向,因此数据在变换前需要进行标准化(归一化)处理。
新的主成分并不是由实际系统产生的,因此在进行 PCA 变换后会丧失数据的解释性。如果说,数据的解释能力对你的分析来说很重要,那么 PCA 对你来说可能就不适用了。
## 代码示例(PCA)
from sklearn.decomposition import PCA

# Fit a full PCA first (all components kept) to inspect the variance ratios.
# Fix: the original created PCA() and immediately overwrote it — dead code.
pca = PCA(n_components=None, copy=True, whiten=False)
pca.fit(data)
print(pca.components_)                # principal axes
print(pca.explained_variance_ratio_)  # variance share per component

# After inspecting the cumulative contribution, refit keeping 3 components.
pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data)  # data projected onto the 3 components
## 代码示例(FA)
import pandas as pd
import numpy as np
import math

# NOTE(review): `mydata` is assumed to be a numeric DataFrame defined
# elsewhere in the file — confirm against the caller.
p = mydata.shape[1]  # number of variables (was hard-coded as 14)

# Sample dispersion (scatter) matrix E = sum_i (x_i - mean)(x_i - mean)^T
# Fix: pandas Series has no .reshape — go through .values first.
mydata_mean = mydata.mean()
mean_col = mydata_mean.values.reshape(p, 1)
E = np.mat(np.zeros((p, p)))
for i in range(len(mydata)):
    obs_col = mydata.iloc[i, :].values.reshape(p, 1)
    E += (obs_col - mean_col) * (obs_col - mean_col).T

# Sample correlation matrix R[i, j] = E[i, j] / sqrt(E[i, i] * E[j, j])
R = np.mat(np.zeros((p, p)))
for i in range(p):
    for j in range(p):
        R[i, j] = E[i, j] / math.sqrt(E[i, i] * E[j, j])

import numpy.linalg as nlg
eig_value, eigvector = nlg.eig(R)
eig = pd.DataFrame()
eig['names'] = mydata.columns
eig['eig_value'] = eig_value
eig.sort_values('eig_value', ascending=False, inplace=True)

# Number of common factors m: the smallest m whose (sorted) eigenvalues
# explain at least 80% of the total variance.
for m in range(1, p):
    if eig['eig_value'][:m].sum() / eig['eig_value'].sum() >= 0.8:
        print(m)
        break

# Factor loading matrix: column i is sqrt(lambda_(i)) * v_(i) for the i-th
# LARGEST eigenvalue. Fixes vs. original: loadings now use the sorted
# eigenvalues (np.linalg.eig gives no order guarantee), exactly m columns
# are allocated/filled/named (the original allocated 6, filled 5, named 6,
# which raises on the column assignment).
order = np.argsort(eig_value)[::-1]
A = np.mat(np.zeros((p, m)))
for i in range(m):
    A[:, i] = math.sqrt(eig_value[order[i]]) * eigvector[:, order[i]]
a = pd.DataFrame(A)
a.columns = ['factor' + str(i + 1) for i in range(m)]
## 代码示例(反向消除特征)
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Fixes: positional axis in drop() and positional n_features_to_select in
# RFE() are removed/deprecated in modern pandas/sklearn — use keywords.
# (The unused `from sklearn import datasets` import was dropped.)
df = data.drop(columns=['IS_SUCCESS'])  # feature matrix without the target
lreg = LinearRegression()
rfe = RFE(lreg, n_features_to_select=10)
# transformed array containing only the 10 selected features
rfe = rfe.fit_transform(df, data.IS_SUCCESS)
## 代码示例(前向特征选择)
from sklearn.feature_selection import f_regression

# ffs[0] holds the per-feature F statistics, ffs[1] the p-values
ffs = f_regression(df, data.IS_SUCCESS)
variable = []
# Fix: the original looped over range(len(df.columns) - 1), silently
# skipping the last column of df.
for i in range(len(df.columns)):
    if ffs[0][i] >= 10:  # F-statistic threshold
        variable.append(df.columns[i])