Last time we mentioned that data analysts usually clean the data while extracting business data, and also do some feature processing based on business logic or data logic. Since feature engineering is a key step in data modeling, this post puts together a brief summary of it. Hopefully it helps a little~
First, an overview diagram of feature engineering (below):
[Feature engineering overview diagram]
import pandas as pd
# Build sample data
df = pd.DataFrame({'客户编号': [1, 2, 3], '性别': ['男', '女', '男']})
print(df)
# Dummy encoding
df1 = pd.get_dummies(df, columns=['性别'])
print('-'*30)
print(df1)
# Keep only n-1 dummies to avoid collinearity from the dummy variable trap
df2 = pd.get_dummies(df, columns=['性别'], drop_first=True)
print('-'*30)
print(df2)
客户编号 性别
0 1 男
1 2 女
2 3 男
------------------------------
客户编号 性别_女 性别_男
0 1 0 1
1 2 1 0
2 3 0 1
------------------------------
客户编号 性别_男
0 1 1
1 2 0
2 3 1
# Build sample data
df = pd.DataFrame({'编号': [1, 2, 3, 4, 5], '城市': ['北京', '上海', '广州', '深圳', '北京']})
print(df)
# Map categories via replace
df1 = df.copy()
df1['城市'] = df1['城市'].replace({'北京': 0, '上海': 1, '广州': 2, '深圳': 3})
print('-'*30)
print(df1)
# Map categories via map
df2 = df.copy()
city_dic = {'北京': 0, '上海': 1, '广州': 2, '深圳': 3}
df2['城市'] = df2['城市'].map(city_dic)
print('-'*30)
print(df2)
编号 城市
0 1 北京
1 2 上海
2 3 广州
3 4 深圳
4 5 北京
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
------------------------------
编号 城市
0 1 0
1 2 1
2 3 2
3 4 3
4 5 0
# Via LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df3 = df.copy()
df3['城市'] = le.fit_transform(df3['城市'])
print(df3)
编号 城市
0 1 1
1 2 0
2 3 2
3 4 3
4 5 1
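Note that LabelEncoder assigns codes in sorted (here Unicode) order, which is why 上海 becomes 0, and it is really meant for target labels. For feature columns, scikit-learn's OrdinalEncoder is the analogous tool; a minimal sketch:
from sklearn.preprocessing import OrdinalEncoder
df4 = df.copy()
# OrdinalEncoder works on 2D feature columns, unlike LabelEncoder
df4[['城市']] = OrdinalEncoder().fit_transform(df4[['城市']])
print(df4)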
# Via OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit_transform(df[['城市']]).toarray()
array([[0., 1., 0., 0.],
[1., 0., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.]])
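The encoded array has lost its column labels. A small sketch to recover readable names, assuming scikit-learn 1.0+ where get_feature_names_out is available:
# Wrap the encoded array in a DataFrame with readable column names
cols = ohe.get_feature_names_out(['城市'])
pd.DataFrame(ohe.transform(df[['城市']]).toarray(), columns=cols)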
import pandas as pd
# Build sample data
df = pd.DataFrame({'age': range(1,5)})
print(df)
# Custom binarization
df1 = df.copy()
df1['age_b'] = df1['age'].map(lambda x: 0 if x <= 2 else 1)
print('-'*30)
print(df1)
age
0 1
1 2
2 3
3 4
------------------------------
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
# Binarizer
from sklearn.preprocessing import Binarizer
df2 = df.copy()
df2['age_b'] = Binarizer(threshold=2).fit_transform(df2[['age']])
print(df2)
age age_b
0 1 0
1 2 0
2 3 1
3 4 1
import pandas as pd
df = pd.DataFrame([[22,1],[25,1],[20,0],[35,0],[32,1],[38,0],[50,0],[46,1]], columns=['age', 'churn'])
# Custom bin edges
print(pd.cut(df['age'], bins=[-1,20,50,99]))
# Custom labels
print('-'*30)
print(pd.cut(df['age'], bins=[-1,20,50,99], labels=[0,1,2]))
0 (20, 50]
1 (20, 50]
2 (-1, 20]
3 (20, 50]
4 (20, 50]
5 (20, 50]
6 (20, 50]
7 (20, 50]
Name: age, dtype: category
Categories (3, interval[int64]): [(-1, 20] < (20, 50] < (50, 99]]
------------------------------
0 1
1 1
2 0
3 1
4 1
5 1
6 1
7 1
Name: age, dtype: category
Categories (3, int64): [0 < 1 < 2]
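pd.cut bins by fixed edges; its sibling pd.qcut bins by quantiles so that each bin receives roughly the same number of rows. A quick sketch on the same column:
# Equal-frequency (quantile) binning into 4 bins
print(pd.qcut(df['age'], q=4, labels=[0, 1, 2, 3]))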
import toad
c = toad.transform.Combiner()
# Equal-frequency binning
print(c.fit(df['age'], method='quantile').export())
# Equal-width binning
print('-'*30)
print(c.fit(df['age'], method='step').export())
# Chi-square binning
print('-'*30)
print(c.fit(df, y='churn', method='chi').export())
# Decision tree binning
print('-'*30)
print(c.fit(df, y='churn', method='dt').export())
# KMeans binning
print('-'*30)
print(c.fit(df['age'], method='kmeans', n_bins=3).export())
{'age': [21.4, 23.2, 25.700000000000003, 30.6, 33.5, 35.6, 37.7, 42.800000000000004, 47.2]}
------------------------------
{'age': [23.0, 26.0, 29.0, 32.0, 35.0, 38.0, 41.0, 44.0, 47.0]}
------------------------------
{'age': [22, 25, 32, 35, 38, 46, 50]}
------------------------------
{'age': [21.0, 33.5, 42.0, 48.0]}
------------------------------
{'age': [28.666666666666668, 41.5]}
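export() only reports the learned split points; to actually bin the data you apply the fitted Combiner with its transform method. A minimal sketch using the chi-square bins from above:
# Re-fit with chi-square binning, then map each age to its bin index
c.fit(df, y='churn', method='chi')
print(c.transform(df[['age']]))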
import pandas as pd
# Build sample data
df = pd.DataFrame({'sales': [3,7,8,2,6,3,6]})
# Square root transform
import numpy as np
df.insert(len(df.columns), 'sqrt', np.sqrt(df['sales']))
# Log transform
df.insert(len(df.columns), 'log', np.log(df['sales']))
# Box-Cox transform
from scipy.stats import boxcox
df.insert(len(df.columns), 'boxcox', boxcox(df['sales'])[0])
# Custom function transform
from sklearn.preprocessing import FunctionTransformer
def my_func(x):
    return x / 2
transformer = FunctionTransformer(my_func)
df.insert(len(df.columns), 'myfunc', transformer.transform(df['sales']))
print(df)
sales sqrt log boxcox myfunc
0 3 1.732051 1.098612 1.639046 1.5
1 7 2.645751 1.945910 4.078599 3.5
2 8 2.828427 2.079442 4.609387 4.0
3 2 1.414214 0.693147 0.887320 1.0
4 6 2.449490 1.791759 3.523320 3.0
5 3 1.732051 1.098612 1.639046 1.5
6 6 2.449490 1.791759 3.523320 3.0
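One caveat: np.log and boxcox require strictly positive inputs (sales here happens to be positive). When zeros are possible, np.log1p, i.e. log(1+x), is a common workaround; a sketch:
# log1p tolerates zeros: log1p(0) == 0
df.insert(len(df.columns), 'log1p', np.log1p(df['sales']))
print(df[['sales', 'log1p']])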
import pandas as pd
# Build sample data
df = pd.DataFrame({'sales': [3,7,8], 'rand': [-1,3,5]})
# Custom min-max scaling
df[['sales', 'rand']].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
   sales      rand
0    0.0  0.000000
1    0.8  0.666667
2    1.0  1.000000
# MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
min_max_scaler.fit_transform(df)
# Custom standardization (z-score)
df.apply(lambda x: (x - x.mean()) / x.std(ddof=0))
     sales      rand
0 -1.38873 -1.336306
1  0.46291  0.267261
2  0.92582  1.069045
# scale
from sklearn.preprocessing import scale
scale(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
# StandardScaler
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
std_scaler.fit_transform(df)
array([[-1.38873015, -1.33630621],
[ 0.46291005, 0.26726124],
[ 0.9258201 , 1.06904497]])
# Normalizer
from sklearn.preprocessing import Normalizer
norm = Normalizer()
norm.fit_transform(df)
array([[ 0.9486833 , -0.31622777],
[ 0.91914503, 0.3939193 ],
[ 0.8479983 , 0.52999894]])
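Unlike MinMaxScaler and StandardScaler, which rescale column by column, Normalizer rescales each row (each sample) to unit norm, so despite the similar name it is a different operation. A quick check that every output row has L2 norm 1:
# Each sample (row) is divided by its own L2 norm
print(np.linalg.norm(norm.fit_transform(df), axis=1))  # -> [1. 1. 1.]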
df = pd.DataFrame([[1, np.nan, 3], [np.nan, np.nan, np.nan], [np.nan, np.nan, 0]], columns=['c1', 'c2', 'c3'])
print(df)
# Drop rows containing any NaN
print('-'*30)
print(df.dropna())
# Drop rows where all values are NaN
print('-'*30)
print(df.dropna(how='all'))
# thresh=n: drop a row if it has fewer than n non-NaN values
print('-'*30)
print(df.dropna(thresh=2))
c1 c2 c3
0 1.0 NaN 3.0
1 NaN NaN NaN
2 NaN NaN 0.0
------------------------------
Empty DataFrame
Columns: [c1, c2, c3]
Index: []
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
2 NaN NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
# Fill with a random integer (a single draw shared by all NaNs)
import random
print(df.fillna(int(random.random()*10)))
# Fill with a constant
print('-'*30)
print(df.fillna(0))
# Fill with the column mean
print('-'*30)
print(df.fillna(df.mean()))
# Fill with the column median
print('-'*30)
print(df.fillna(df.median()))
# Forward fill (propagate the last valid value; note this is ffill, not interpolation)
print('-'*30)
print(df.ffill())  # fillna(method='ffill') is deprecated in newer pandas
c1 c2 c3
0 1.0 8.0 3.0
1 8.0 8.0 8.0
2 8.0 8.0 0.0
------------------------------
c1 c2 c3
0 1.0 0.0 3.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 1.5
2 1.0 NaN 0.0
------------------------------
c1 c2 c3
0 1.0 NaN 3.0
1 1.0 NaN 3.0
2 1.0 NaN 0.0
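scikit-learn wraps the same fill strategies in SimpleImputer, which is convenient inside a Pipeline because the fill values are learned from the training set only. A minimal sketch:
from sklearn.impute import SimpleImputer
# strategy can be 'mean', 'median', 'most_frequent' or 'constant'
# note: SimpleImputer discards columns that were entirely NaN at fit (c2 here)
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(df))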
# Build sample data
df = pd.DataFrame([[1, 2, 3], [1, 2, 3], [4, 5, 6]], columns=['c1', 'c2', 'c3'])
print(df)
# Drop duplicate rows
print('-'*30)
print(df.drop_duplicates())
c1 c2 c3
0 1 2 3
1 1 2 3
2 4 5 6
------------------------------
c1 c2 c3
0 1 2 3
2 4 5 6
# Build sample data
df = pd.DataFrame({'c1': [3, 10, 5, 7, 1, 9, 93],
'c2': [15, 16, 14, 78, 19, 11, 8],
'c3': [20, 15, 18, 21, 101, 27, 29]},
columns=['c1', 'c2', 'c3'])
print(df)
# Boxplot
import matplotlib.pyplot as plt
print('-'*30)
df.boxplot()
plt.show()
# Sigma rule (here flagging values whose z-score exceeds 2)
z = lambda x: (x - x.mean()) / x.std(ddof=0)
print('-'*30)
print(df[df.apply(z) > 2].dropna(how='all'))
c1 c2 c3
0 3 15 20
1 10 16 15
2 5 14 18
3 7 78 21
4 1 19 101
5 9 11 27
6 93 8 29
------------------------------
[boxplot of c1, c2, c3]
------------------------------
c1 c2 c3
3 NaN 78.0 NaN
4 NaN NaN 101.0
6 93.0 NaN NaN
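The boxplot whiskers above follow the IQR rule: points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are flagged as outliers. The same rule in code:
# IQR rule, the criterion behind the boxplot whiskers
q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1
mask = (df < q1 - 1.5 * iqr) | (df > q3 + 1.5 * iqr)
print(df[mask].dropna(how='all'))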
from sklearn.datasets import load_iris
iris = load_iris()
X = pd.DataFrame(iris.data)
X.columns=['sl', 'sw', 'pl', 'pw']
y = pd.DataFrame(iris.target)
y.columns = ['y']
df = pd.concat([X, y], axis=1)
print(df.head())
sl sw pl pw y
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
# Correlation matrix
df.corr()
          sl        sw        pl        pw         y
sl  1.000000 -0.117570  0.871754  0.817941  0.782561
sw -0.117570  1.000000 -0.428440 -0.366126 -0.426658
pl  0.871754 -0.428440  1.000000  0.962865  0.949035
pw  0.817941 -0.366126  0.962865  1.000000  0.956547
y   0.782561 -0.426658  0.949035  0.956547  1.000000
# Variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = [variance_inflation_factor(X.values, X.columns.get_loc(i)) for i in X.columns]
print(vif)
# All VIFs exceed 10; drop sl, which has the highest
X1 = df.drop(columns=['sl', 'y'])
vif = [variance_inflation_factor(X1.values, X1.columns.get_loc(i)) for i in X1.columns]
print('-'*30)
print(vif)
# VIFs above 10 remain; drop pl, now the highest
X2 = df.drop(columns=['sl', 'pl', 'y'])
vif = [variance_inflation_factor(X2.values, X2.columns.get_loc(i)) for i in X2.columns]
print('-'*30)
print(vif)
[262.9693482414677, 96.35329172369063, 172.96096155387588, 55.50205979323753]
------------------------------
[5.856964572603174, 62.071308334041554, 43.2925737234071]
------------------------------
[2.891774016941542, 2.8917740169415427]
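A commonly noted caveat: the textbook VIF is defined for a regression that includes an intercept, and variance_inflation_factor uses the design matrix as given, so without a constant column the values come out inflated, as above. A sketch with an explicit constant:
import statsmodels.api as sm
# Add an intercept column, then compute VIF for the real features only
Xc = sm.add_constant(X)
vif = [variance_inflation_factor(Xc.values, i) for i in range(1, Xc.shape[1])]
print(vif)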
# PCA
from sklearn.decomposition import PCA
PCA(n_components=2).fit_transform(iris.data)[0:3]
array([[-2.68412563, 0.31939725],
[-2.71414169, -0.17700123],
[-2.88899057, -0.14494943]])
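To choose n_components, look at how much variance each component explains. A quick sketch:
pca = PCA(n_components=2).fit(iris.data)
# Fraction of total variance captured by each component (~0.92 and ~0.05 on iris)
print(pca.explained_variance_ratio_)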
# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
LDA(n_components=2).fit_transform(iris.data, iris.target)[0:3]
array([[ 8.06179978, 0.30042062],
[ 7.12868772, -0.78666043],
[ 7.48982797, -0.26538449]])
# PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
pf = PolynomialFeatures().fit_transform(iris.data)
print(pf[0:3])
# Shape after polynomial expansion
print('-'*30)
print(pf.shape)
[[ 1. 5.1 3.5 1.4 0.2 26.01 17.85 7.14 1.02 12.25 4.9 0.7
1.96 0.28 0.04]
[ 1. 4.9 3. 1.4 0.2 24.01 14.7 6.86 0.98 9. 4.2 0.6
1.96 0.28 0.04]
[ 1. 4.7 3.2 1.3 0.2 22.09 15.04 6.11 0.94 10.24 4.16 0.64
1.69 0.26 0.04]]
------------------------------
(150, 15)
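The 15 columns are the bias term, the 4 original features, and all degree-2 products. With scikit-learn 1.0+ you can list them by name; a sketch (the feature names passed in are just the abbreviations used above):
poly = PolynomialFeatures().fit(iris.data)
print(poly.get_feature_names_out(['sl', 'sw', 'pl', 'pw']))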
# toad
import toad
# Select by missing rate, IV (below 0.1 suggests weak predictive power) and correlation
toad.selection.select(df, df.y,
empty=0.7, iv=0.1,
corr=0.95,
return_drop=True)
( sl sw pl y
0 5.1 3.5 1.4 0
1 4.9 3.0 1.4 0
2 4.7 3.2 1.3 0
3 4.6 3.1 1.5 0
4 5.0 3.6 1.4 0
.. ... ... ... ..
145 6.7 3.0 5.2 2
146 6.3 2.5 5.0 2
147 6.5 3.0 5.2 2
148 6.2 3.4 5.4 2
149 5.9 3.0 5.1 2
[150 rows x 4 columns],
{'empty': array([], dtype=float64),
'iv': array([], dtype=object),
'corr': array(['pw'], dtype=object)})
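To see the per-feature statistics behind this selection, toad also exposes a quality report (IV, gini and entropy per column); a sketch assuming toad's documented quality function:
# IV, gini and entropy of each feature against the target
print(toad.quality(df, target='y'))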
from sklearn.model_selection import train_test_split
# Train/test split
train, test = train_test_split(
df, test_size=.3, random_state=0)
# Compute PSI (values above roughly 0.2 are commonly considered unstable)
np.seterr(divide='ignore', invalid='ignore')  # suppress invalid-value warnings from 0/0
toad.metrics.PSI(train, test).sort_values(0)
y 0.081994
pl 0.316619
pw 0.418114
sw 0.425005
sl 0.762664
dtype: float64
# Stepwise regression
toad.selection.stepwise(df,
df.y,
direction='both',
criterion='aic',
estimator='ols',
intercept=False).head()
    sl  y
0  5.1  0
1  4.9  0
2  4.7  0
3  4.6  0
4  5.0  0
# Variance threshold
from sklearn.feature_selection import VarianceThreshold
VarianceThreshold(threshold=3).fit_transform(iris.data)[0:3]
array([[1.4],
[1.4],
[1.3]])
# Correlation coefficient method
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
r = lambda X, Y: np.array(list(map(lambda x:pearsonr(x, Y)[0], X.T))).T
SelectKBest(r, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
# Chi-square test
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
# Mutual information method
from sklearn.feature_selection import SelectKBest
from sklearn import metrics
mic = metrics.mutual_info_score
g = lambda X, Y: np.array(list(map(lambda x:mic(x, Y), X.T))).T
SelectKBest(g, k=2).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
# Recursive feature elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
RFE(estimator=LogisticRegression(solver='liblinear'),
n_features_to_select=2).fit_transform(iris.data, iris.target)[0:3]
array([[3.5, 0.2],
[3. , 0.2],
[3.2, 0.2]])
# Penalty-based (L1) feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
SelectFromModel(LogisticRegression(penalty="l1", C=0.1,
solver='liblinear')).fit_transform(iris.data, iris.target)[0:3]
array([[5.1, 3.5, 1.4],
[4.9, 3. , 1.4],
[4.7, 3.2, 1.3]])
# Tree-based feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)[0:3]
array([[1.4, 0.2],
[1.4, 0.2],
[1.3, 0.2]])
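To see why these two columns survive, you can inspect the fitted estimator inside the selector; a sketch:
sel = SelectFromModel(GradientBoostingClassifier()).fit(iris.data, iris.target)
print(sel.estimator_.feature_importances_)  # importance of each original feature
print(sel.get_support())                    # boolean mask of selected columns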
# Oversampling
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)
X_smotesampled, y_smotesampled = smote.fit_resample(iris.data, iris.target)
# Undersampling
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_undersampled, y_undersampled = rus.fit_resample(iris.data, iris.target)
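iris is balanced, so the resampling here is purely illustrative; on a genuinely imbalanced dataset you can verify the effect by counting classes before and after:
from collections import Counter
print(Counter(iris.target))      # original class counts
print(Counter(y_smotesampled))   # after SMOTE oversampling
print(Counter(y_undersampled))   # after random undersampling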