Feature processing for the personal loan default prediction task of the CCF Big Data and Computing Intelligence Contest

Competition page:

CCF big data and computing intelligence competition

First read the data

import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import StratifiedKFold
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import OneHotEncoder

# Load the training set and the public test set.
train_data = pd.read_csv(r'data/data117603/train_public.csv')
test_public = pd.read_csv(r'data/data117603/test_public.csv')
# Stack train on top of test so that feature processing is applied to both at
# once. pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; like append, it keeps each frame's own row index.
df_features = pd.concat([train_data, test_public])

First, try feeding the data into the model with almost no processing, to see what baseline score that gives

LabelEncoder the text features first

from sklearn.preprocessing import LabelEncoder

# Text-valued columns that are replaced by integer codes for the baseline run.
cat_cols = [
    'class',
    'employer_type',
    'industry',
    'work_year',
    'issue_date',
    'earlies_credit_mon',
]

for feat in cat_cols:
    # A fresh encoder per column, so every column gets its own code table.
    lbl = LabelEncoder()
    codes = lbl.fit_transform(df_features[feat])
    # The raw text in df_features is overwritten in place by the codes.
    df_features[feat] = codes

lgb classification model is used for prediction

# Rows with a known target form the training set; rows whose target is
# missing (NaN) are the ones to predict.
has_label = df_features['isDefault'].notnull()
df_train = df_features[has_label].reset_index(drop=True)
df_test = df_features[~has_label]

# Columns that are never used as model inputs (two id columns and the target).
no_features = ['user_id', 'loan_id', 'isDefault']

# Input feature columns, in their original column order
features = [name for name in df_train.columns if name not in no_features]

X = df_train[features]        # Training set input
y = df_train['isDefault']     # Training set label
X_test = df_test[features]    # Test set input

# 5-fold cross-validation of a default LGBMClassifier on the baseline features.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)

# oof_preds: each training row's prediction from the fold in which it was held out.
# sub_preds: test-set prediction averaged over the fold models.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    # X already holds exactly the `features` columns, so no re-selection is needed.
    trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    clf = LGBMClassifier()

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
    # removed in LightGBM 4.0.
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=40),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict with the best iteration found by early stopping.
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y

print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

It can be seen here that when I only use LabelEncoder for text features, the AUC score is only 0.878533

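Next, try to process the data. Note that the baseline above overwrote the text columns of df_features with label codes, so re-read the raw CSV files into df_features before running the steps below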

Because the 'work_year' column contains non-numeric values such as '< 1 year' and '10+ years', I clean up these values first

def workYearDIc(x):
    """Convert a raw ``work_year`` value into an integer number of years.

    Args:
        x: The raw cell value. Normally a string such as ``'3 years'``,
            ``'< 1 year'`` or ``'10+ years'``; may also be NaN/None.

    Returns:
        int: The first run of digits found in the value, after rewriting
        ``'< 1'`` to ``'0'``. Returns ``-1`` when the value is missing
        (NaN or None) or contains no digits at all.
    """
    # NaN stringifies to 'nan'; None means the same thing: no value recorded.
    if x is None or str(x) == 'nan':
        return -1
    # str() lets non-string values through instead of failing on .replace().
    text = str(x).replace('< 1', '0').replace('10+ ', '10')
    found = re.search(r'\d+', text)
    if found is None:
        # No digits to extract: treat as missing rather than raising.
        return -1
    return int(found.group())

# Replace every raw work_year value by its integer year count (-1 = missing).
df_features['work_year'] = df_features['work_year'].apply(workYearDIc)

Next, I will deal with the characteristics of time according to the characteristics themselves

def _parse_credit_month(raw):
    """Parse one raw ``earlies_credit_mon`` value into a timestamp.

    The value is first parsed as-is. If that fails it is retried with ``'9'``
    prepended, and finally with ``'20'`` prepended (presumably to repair
    values whose year lost its leading digits -- TODO confirm against the
    raw data). An error from the last attempt is not swallowed.
    """
    parse_errors = (ValueError, TypeError, OverflowError)
    try:
        return pd.to_datetime(raw)
    except parse_errors:
        pass
    try:
        return pd.to_datetime('9' + raw)
    except parse_errors:
        return pd.to_datetime('20' + raw)


# Parse every row, whatever the frame's length (the row count is no longer
# hard-coded to 15000).
a = [_parse_credit_month(raw) for raw in df_features['earlies_credit_mon'].values]

df_features['earlies_credit_mon'] = a

df_features['earlies_credit_mon'] = pd.to_datetime(df_features['earlies_credit_mon'])
df_features['issue_date'] = pd.to_datetime(df_features['issue_date'])

# Calendar features derived from the two date columns.
df_features['issue_date_month'] = df_features['issue_date'].dt.month
df_features['issue_date_dayofweek'] = df_features['issue_date'].dt.dayofweek
df_features['earliesCreditMon'] = df_features['earlies_credit_mon'].dt.month
df_features['earliesCreditYear'] = df_features['earlies_credit_mon'].dt.year

Map the letter-valued 'class' feature (A–G) to integers

# 'A' -> 0, 'B' -> 1, ..., 'G' -> 6; any other value becomes NaN.
class_to_rank = {letter: rank for rank, letter in enumerate('ABCDEFG')}
df_features['class'] = df_features['class'].map(class_to_rank)

LabelEncoder the two features

from sklearn.preprocessing import LabelEncoder

# The two remaining text columns are replaced by integer codes.
cat_cols = ['employer_type', 'industry']

for col in cat_cols:
    # A fresh encoder per column, so every column gets its own code table.
    lbl = LabelEncoder()
    codes = lbl.fit_transform(df_features[col])
    df_features[col] = codes

Then drop the two date columns that have already been processed, and also drop the feature 'policy_code', because its values are all zero

# The raw date columns have been replaced by derived features, and
# 'policy_code' is dropped as well.
col_to_drop = ['issue_date', 'earlies_credit_mon', 'policy_code']
df_features = df_features.drop(columns=col_to_drop)

Then fill the missing values of some features with **-1**

# Fill missing values in these columns with the sentinel -1.
# Each column is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# is chained assignment, which operates on a temporary and leaves the frame
# unchanged once pandas Copy-on-Write is active.
fill_cols = ['pub_dero_bankrup', 'f0', 'f1', 'f2', 'f3', 'f4']
for fill_col in fill_cols:
    df_features[fill_col] = df_features[fill_col].fillna(-1)

Then divide the training and test sets

# Rows with a known target form the training set; rows whose target is
# missing (NaN) are the ones to predict.
has_label = df_features['isDefault'].notnull()
df_train = df_features[has_label].reset_index(drop=True)
df_test = df_features[~has_label]

# Columns that are never used as model inputs (two id columns and the target).
no_features = ['user_id', 'loan_id', 'isDefault']

# Input feature columns, in their original column order
features = [name for name in df_train.columns if name not in no_features]

X = df_train[features]        # Training set input
y = df_train['isDefault']     # Training set label
X_test = df_test[features]    # Test set input

After that, the training set and test set are standardized. I tried both min-max normalization and mean-variance (z-score) standardization, found that the latter works better, and therefore used it

class StandardScaler:
    """Column-wise mean / standard-deviation scaler for 2-D numeric data.

    Attributes:
        mean_: Per-column mean learned by ``fit`` (``None`` before fitting).
        scale_: Per-column population standard deviation learned by ``fit``
            (``None`` before fitting). Columns whose standard deviation is
            exactly zero get a scale of 1.0, so constant columns transform
            to 0 instead of NaN/inf.
    """

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Learn the per-column mean and standard deviation of ``X``.

        Args:
            X: 2-D array-like of numbers, one column per feature.

        Returns:
            The scaler itself, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        self.mean_ = X.mean(axis=0)
        scale = X.std(axis=0)
        # Guard against division by zero for constant columns.
        scale[scale == 0.0] = 1.0
        self.scale_ = scale
        return self

    def transform(self, X):
        """Return ``(X - mean_) / scale_`` as a new float array.

        Args:
            X: 2-D array-like with the same number of columns as the data
                passed to ``fit``.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.mean_ is None or self.scale_ is None:
            raise RuntimeError('StandardScaler.fit must be called before transform')
        X = np.asarray(X, dtype=float)
        # Broadcasting applies each column's mean and scale to every row.
        return (X - self.mean_) / self.scale_

# Remember the column labels: the scaler works on plain arrays and returns
# arrays, so the DataFrames are rebuilt afterwards.
X_col = X.columns
X_test_col = X_test.columns

# Use a separate name for the instance. Rebinding `StandardScaler` itself would
# shadow the class, and the next `StandardScaler()` call would then fail.
scaler = StandardScaler()
# Statistics are learned from the training rows only and reused for the test rows.
scaler.fit(X.values)
X = scaler.transform(X.values)
X_test = scaler.transform(X_test.values)

X = pd.DataFrame(X, columns=X_col)
X_test = pd.DataFrame(X_test, columns=X_test_col)

Because the classes in this training set are imbalanced, I use SMOTE oversampling to address the imbalance. Under-sampling or other methods for handling class imbalance might work better here, but I haven't tried them; feel free to experiment

from imblearn.over_sampling import SMOTE

# Oversample the training data with a fixed seed so the run is repeatable;
# X and y are replaced by their resampled versions.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

Next, input model training samples to see the effect

# 5-fold cross-validation of a default LGBMClassifier on the processed features.
# NOTE(review): X/y were oversampled with SMOTE before this split, so the
# validation folds contain synthetic rows generated from the whole training
# set. The AUC printed below is therefore optimistic; resample inside each
# fold for an honest estimate.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)

# oof_preds: each training row's prediction from the fold in which it was held out.
# sub_preds: test-set prediction averaged over the fold models.
oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    trn_x, trn_y = X[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X[features].iloc[val_idx], y.iloc[val_idx]
    clf = LGBMClassifier()

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
    # removed in LightGBM 4.0.
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        eval_metric='auc',
        callbacks=[
            lgb.early_stopping(stopping_rounds=40),
            lgb.log_evaluation(period=100),
        ],
    )

    # Predict with the best iteration found by early stopping.
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(X_test[features], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y

print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

In the final results, it can be seen that the AUC score after feature processing has been greatly improved

Then we look at the correlation between features to construct some new features according to these correlations

The correlation between features is computed and displayed as a heat map

from pylab import mpl

mpl.rcParams.update({
    # Default sans-serif font used for the figure text
    'font.sans-serif': ['FangSong'],
    # Render minus signs with the ASCII hyphen so they display correctly
    # with this font instead of as a missing-glyph box
    'axes.unicode_minus': False,
})

# Pairwise correlation of all columns, drawn as an annotated heat map.
corr_matrix = df_features.corr()
plt.figure(figsize=(30, 30))
ax = sns.heatmap(
    corr_matrix,
    linewidths=5,
    vmax=1.0,
    square=True,
    linecolor='white',
    annot=True,
)
ax.tick_params(labelsize=10)
plt.show()

Looking at this heat map, it is easy to see that some features are strongly correlated with each other

I combined the highly correlated features into new features. After many attempts, I found that the following ones contribute the most to improving the score.
When I tried them one at a time, each new feature gave an obvious AUC improvement. When I added all of them to the model together, however, the overall AUC improvement was not large; I still list them here and add them all to the training

# New features built by combining pairs of existing columns.
# NOTE(review): the two ratios yield inf/NaN wherever the divisor is 0 --
# confirm that downstream steps tolerate that.
df_features['new_1'] = df_features['total_loan'].div(df_features['monthly_payment'])
df_features['new_3'] = df_features['known_dero'].sub(df_features['pub_dero_bankrup'])
df_features['new_6'] = df_features['known_outstanding_loan'].mul(df_features['pub_dero_bankrup'])
df_features['new_11'] = df_features['f0'].div(df_features['f3'])
df_features['new_12'] = df_features['f0'].add(df_features['f4'])
df_features['new_18'] = df_features['f3'].mul(df_features['f4'])

The final training scores are as follows:

Finally, we perform pseudo label processing on the data to further improve our AUC

Here is an introduction to pseudo-labels

Here, I take all test rows whose predicted value is below 0.05, set their label to 0, and add them to the training set as pseudo-labels, because a prediction below 0.05 can generally be regarded as 0

# Attach the averaged test predictions, then turn the confident negatives
# (predicted probability below 0.05) into hard 0 labels.
test_public['isDefault'] = sub_preds
confident_negative = test_public['isDefault'] < 0.05
test_public.loc[confident_negative, 'isDefault'] = 0

# Loan ids of the pseudo-labelled rows. The rows just set to 0 are exactly
# the ones still below 0.05, so the same mask selects them.
InteId = test_public.loc[confident_negative, 'loan_id'].tolist()
# Independent copy of every test row whose loan_id was selected.
use_te = test_public[test_public['loan_id'].isin(InteId)].copy()

Then re read the data of the training set and prediction set, and then combine the two data with the data we want to use as a pseudo label to form a new data

# Start again from the raw files and append the pseudo-labelled test rows.
train = pd.read_csv(r'data/data117603/train_public.csv')
test = pd.read_csv(r'data/data117603/test_public.csv')
# ignore_index=True renumbers the stacked rows 0..n-1.
df_features = pd.concat([train, test, use_te], ignore_index=True)

After that, exactly the same feature processing as before is applied, and the result is then put into the model for training.
The training results are as follows:

From the results, we can see that the improvement is still very large,
In fact, many more iterations can be carried out here to add more pseudo-labels, but I won't repeat it that many times here
Moreover, it would be better to rely on the business background for feature processing, but I didn't do data processing based on the business background because I didn't understand the business background

Keywords: Big Data Machine Learning Deep Learning data visualization

Added by genesysmedia on Sun, 28 Nov 2021 08:15:17 +0200

Programming VIP