# 使用python+sklearn的决策树方法预测是否有信用风险
# 推荐回答
"""Predict credit risk on the German Credit data with a decision tree.

Script form of a notebook walkthrough: load the data, print a few summary
statistics, draw two exploratory plots, integer-encode the categorical
columns, then compare scikit-learn decision trees of different depths using
cross-validated ROC AUC and learning curves.

Numbers quoted in comments as "recorded" come from the original notebook run
and are kept for reference only; nothing here recomputes or checks them.
"""
import sys

import numpy as np
import pandas as pd

# matplotlib and scikit-learn are imported inside the functions that use them
# so the data-preparation helpers stay importable with only numpy and pandas.

# Column names for the header-less data file. The last column keeps the
# original spelling "lable" (sic) because it is used as the column key.
COLUMN_NAMES = ("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,lable").split(',')
TARGET_COLUMN = 'lable'
DEFAULT_DATA_PATH = "Desktop/sunshengyun/data/german/german.data"


def load_data(path=DEFAULT_DATA_PATH):
    """Read the whitespace-separated data file into a DataFrame.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        DataFrame whose columns are named by ``COLUMN_NAMES``.
        The recorded run showed 1000 rows and 21 columns.
    """
    # Raw string: '\s' is not a valid escape sequence in a normal literal.
    return pd.read_csv(path, sep=r'\s+', names=COLUMN_NAMES)


def explore(data):
    """Print the summary tables the notebook displayed.

    Recorded highlights: every column had 1000 non-null values; ``Balance``
    counts were A14 394, A11 274, A12 269, A13 63; ``Apartment`` counts were
    A152 713, A151 179, A153 108; ``Age`` ranged from 19 to 75 (mean 35.5);
    the mean of ``lable`` was 1.3.
    """
    print(data.head())
    print(data.Balance.unique())
    print(data.count())
    # Descriptive statistics (numeric columns only).
    print(data.describe())
    print(data.Duration.unique())
    print(data.History.unique())
    for column in ('Balance', 'Purpose', 'Apartment'):
        # Series.order() was removed from pandas; sort_values() replaces it.
        print(data.groupby(column).size().sort_values(ascending=False))


def plot_distributions(data):
    """Draw the two exploratory plots: label vs. age scatter, age histogram."""
    data.plot(x='lable', y='Age', kind='scatter', alpha=0.02, s=50)
    data.hist('Age', bins=15)


def split_feature_types(features_data):
    """Split a feature frame into numeric and categorical parts.

    A column counts as numeric when its dtype kind is signed integer ('i')
    or float ('f'); every other column is treated as categorical.

    Returns:
        Tuple ``(numeric_data, categorical_data)`` of DataFrames.
    """
    numeric_features = [
        c for c in features_data
        if features_data[c].dtype.kind in ('i', 'f')
    ]
    numeric_data = features_data[numeric_features]
    categorical_data = features_data.drop(numeric_features, axis=1)
    return numeric_data, categorical_data


def encode_features(features_data):
    """Return the features with categorical columns replaced by integer codes.

    Each categorical column is encoded with ``pd.factorize`` (codes follow
    order of first appearance). Numeric columns come first in the result,
    followed by the encoded categorical columns.

    One-hot encoding via ``pd.get_dummies(features_data)`` is an alternative
    representation mentioned by the original walkthrough.
    """
    numeric_data, categorical_data = split_feature_types(features_data)
    # Build the encoded frame explicitly with the original index: factorize
    # returns bare arrays, and the concat below aligns rows by index.
    categorical_data_encoded = pd.DataFrame(
        {c: pd.factorize(categorical_data[c])[0] for c in categorical_data.columns},
        index=categorical_data.index,
        columns=categorical_data.columns,
    )
    return pd.concat([numeric_data, categorical_data_encoded], axis=1)


def build_design_matrix(data):
    """Turn the raw frame into model inputs.

    Args:
        data: DataFrame containing the feature columns and ``lable``.

    Returns:
        Tuple ``(X, y, columns)``: ``X`` is a float32 array of encoded
        features, ``y`` is an int32 array that is 1 where ``lable == 1``
        (good credit) and 0 otherwise (``lable == 2``, bad credit), and
        ``columns`` lists the feature names in the column order of ``X``.
    """
    target = data[TARGET_COLUMN]
    features = encode_features(data.drop(TARGET_COLUMN, axis=1))
    X = features.values.astype(np.float32)
    y = (target.values == 1).astype(np.int32)  # 1: good, 2: bad
    return X, y, list(features.columns)


def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """Plot training and cross-validation scores against training-set size.

    A large gap between the two curves suggests over-fitting; two low curves
    suggest under-fitting.

    Args:
        estimator: scikit-learn estimator to evaluate.
        X: Feature matrix.
        y: Target vector.
        ylim: ``(low, high)`` limits of the score axis.
        cv: Cross-validation setting passed to ``learning_curve``.
        n_jobs: Number of parallel jobs passed to ``learning_curve``.
        train_sizes: Training-set sizes (fractions or counts) to evaluate.
        scoring: Scoring name or callable passed to ``learning_curve``.

    Prints the mean validation score at the largest training size. The
    message calls it "Best validation score", but no maximum is taken.
    """
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve

    # A notebook starts a new figure per cell; a script has to ask for one,
    # otherwise successive calls would draw on top of each other.
    plt.figure()
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim)
    plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring)
    # Average over the cross-validation folds for each training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))


def main(data_path=DEFAULT_DATA_PATH):
    """Run the whole walkthrough on the data file at ``data_path``."""
    import matplotlib.pyplot as plt
    # sklearn.cross_validation was removed; model_selection is its successor.
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.tree import DecisionTreeClassifier

    data = load_data(data_path)
    explore(data)
    plot_distributions(data)

    X, y, _ = build_design_matrix(data)

    # test_size is the share held out for TESTING (20% here), not training.
    # The held-out part is not evaluated anywhere in this walkthrough.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)

    # Cross-validated ROC AUC (higher is better) for a depth-8 tree.
    # Recorded: "ROC AUC Decision Tree: 0.6866 +/-0.0105".
    clf = DecisionTreeClassifier(max_depth=8)
    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
    print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(np.mean(scores), np.std(scores)))

    # Learning curves for trees of decreasing depth. The unrestricted tree
    # shows a large gap between training and validation scores, which
    # suggests it over-fits the training data.
    # Recorded validation scores: None 0.6310, 10 0.6565, 8 0.6762,
    # 5 0.7219, 4 0.7226. No random_state is set on the trees, so a rerun
    # may give slightly different numbers.
    for max_depth in (None, 10, 8, 5, 4):
        clf = DecisionTreeClassifier(max_depth=max_depth)
        plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

    plt.show()


if __name__ == "__main__":
    # Optional first command-line argument overrides the default data path.
    if len(sys.argv) > 1:
        main(sys.argv[1])
    else:
        main()