8 Credit Card Fraud
8.1 Load: Credit Card Fraud
- Source: Kaggle
Python
# Load the Credit Card Fraud data: build the frame from the CSV on the first
# run and cache it as Feather; later runs read the Feather file directly.
loc = 'data/Y_04_CCR'
loc_csv = loc + '.csv'
loc_bin = loc + '.feather'
if(not os.path.exists(loc_bin)):
    #Read | Relocate | Rename | Factor | Levels | ID | Save |
    pp = pd.read_csv(loc_csv)
    if(False): pp.drop(columns = ['Time'], inplace = True)   #Drop Columns
    qq = list(pp.columns)
    qq.insert(0, qq.pop(qq.index('Amount')))      #Relocate list element
    qq.insert(0, qq.pop(qq.index('Class')))       #Order: Class, Amount, rest
    pp = pp.reindex(columns = qq)
    # Rename by label, not by position: writing into 'pp.columns.values'
    # edits the Index buffer behind pandas' back, and fixed slots 3..11 point
    # at the wrong columns as soon as 'Time' is dropped by the toggle above.
    pp.rename(columns = {'V' + str(i): 'V0' + str(i) for i in range(1, 1+9)},
              inplace = True)                     #Rename V1 to V01
    pp.rename(columns ={'Class': 'is_Found'}, inplace = True)
    pp['is_Found'] = pp['is_Found'].astype('category')
    # Labels are applied to the categories in their existing order;
    # assumes 'Class' holds exactly the two codes 0 and 1 - TODO confirm
    pp['is_Found'] = pp['is_Found'].cat.rename_categories(['No', 'Yes'])
    pp.insert(0, 'ID', range(1, 1+len(pp)))       #Row ID starting at 1
    pyarrow.feather.write_feather(pp, loc_bin)    #Data: Credit Card Fraud
else:
    pp = pyarrow.feather.read_feather(loc_bin)
y_ccr = pp.copy()                                 #Keep an independent copy
# Quick look at the loaded frame. The '##' lines appear to be the output
# recorded when the statement just above them was run.
pp.shape
## (284807, 32)
qq = list(pp.columns)                             #Column Names
if(False): list(qq[0:5]) + list(qq[-3:len(qq)])
# Print the column names, eight per row
print('\n'.join([', '.join(qq[i:i+8]) for i in range(0,len(qq),8)]))
## ID, is_Found, Amount, Time, V01, V02, V03, V04
## V05, V06, V07, V08, V09, V10, V11, V12
## V13, V14, V15, V16, V17, V18, V19, V20
## V21, V22, V23, V24, V25, V26, V27, V28
# Class balance of the response: count per level, then share of the total
qq = pp['is_Found'].value_counts()
# Name the count column after the index name, then clear the index name
qq = qq.to_frame(qq.index.name)
qq.index.name = None
qq['PCT'] = 100 * qq['is_Found'] / qq['is_Found'].sum()          #Count & PCT
print(qq)
##      is_Found        PCT
## No     284315  99.827251
## Yes       492   0.172749
# Optional checks, switched off
if(False): pp.info(memory_usage = False)
if(False): pp['Amount'].describe()
if(False): pp['Time'].max()                       #(2 Days) 2*24*60*60 - 8
8.3 Train & Test
Python
Python
# Hold out 20% of the rows as Test, seeded for repeatability.
# NOTE: no 'stratify' argument is passed, so the Response proportions of the
# two parts are only as similar as the random shuffle makes them.
(y_trn_x_id, y_tst_x_id,
 y_trn_y_id, y_tst_y_id) = sklearn.model_selection.train_test_split(
    y_ccr_X, y_ccr_Y, test_size = 0.2, random_state = 3)
# Response as a Series for each part
y_trn_y, y_tst_y = y_trn_y_id['is_Found'], y_tst_y_id['is_Found']
# Predictors: every column except the row identifier
pp = [col for col in y_trn_x_id.columns if col != 'ID']
y_trn_x, y_tst_x = y_trn_x_id[pp], y_tst_x_id[pp]
if(True): y_trn_x.shape, y_tst_x.shape, y_trn_y.shape, y_tst_y.shape
## ((227845, 30), (56962, 30), (227845,), (56962,))
# Count & PCT of the Response in Train
pp = y_trn_y.value_counts().to_frame()
pp['PCT'] = (100 * pp).div(pp.sum())
pp
##            count        PCT
## is_Found                   
## No        227453  99.827953
## Yes          392   0.172047
# Count & PCT of the Response in Test
qq = y_tst_y.value_counts().to_frame()
qq['PCT'] = (100 * qq).div(qq.sum())
qq
##           count        PCT
## is_Found                  
## No        56862  99.824444
## Yes         100   0.175556
8.4 Random Forest
Python
# Random Forest on the Train set; the fitted model is cached on disk so the
# fit is skipped whenever the saved file already exists.
loc = 'data/Y_04_RFC.sav'
if(os.path.exists(loc)):
    y_rfc = joblib.load(loc)                      #Reuse the saved model
else:
    # Fit, then save for the next run
    y_rfc = sklearn.ensemble.RandomForestClassifier(
        n_estimators = 100, criterion = 'gini',
        random_state = 3, n_jobs = -1, verbose = 0)
    y_rfc.fit(y_trn_x, y_trn_y)
    joblib.dump(y_rfc, loc)
list(y_rfc.classes_)                              #Classes Labels
## ['No', 'Yes']
y_rfc.n_features_in_                              #Count Features
## 30
y_rfc_xs = list(y_rfc.feature_names_in_)          #Features
# Feature names, eight per row
print('\n'.join(', '.join(y_rfc_xs[i:i+8])
                for i in range(0, len(y_rfc_xs), 8)))
## Amount, Time, V01, V02, V03, V04, V05, V06
## V07, V08, V09, V10, V11, V12, V13, V14
## V15, V16, V17, V18, V19, V20, V21, V22
## V23, V24, V25, V26, V27, V28
Python
# Importance of each feature, rounded to 3 decimals, listed largest first
pp = [round(imp, 3) for imp in y_rfc.feature_importances_]
#print('\n'.join([', '.join(map(str, pp[i:i+8])) for i in range(0,len(pp),8)]))
qq = pd.DataFrame({'Features': y_rfc_xs, 'Importance': pp})
qq.sort_values(by = 'Importance', ascending = False).head(5)
##    Features  Importance
## 18      V17       0.152
## 15      V14       0.150
## 13      V12       0.125
## 11      V10       0.094
## 17      V16       0.071
Python
# Evaluate the fitted forest. 'score' is rounded to 5 decimals for Train and
# Test; the Test value matches the accuracy_score printed further down.
# NOTE(review): 'y_rfc_pred' is not assigned in this part of the file;
# presumably it is y_rfc.predict(y_tst_x) - confirm where it is created.
print(round(y_rfc.score(y_trn_x, y_trn_y), 5))    #Train
## 1.0
print(round(y_rfc.score(y_tst_x, y_tst_y), 5))    #Test
## 0.99942
# Confusion Matrix
# Cross-tabulation of actual (rows) against predicted (columns) with totals
pd.crosstab(y_tst_y, y_rfc_pred, 
            rownames = ['Actual'], colnames = ['Predicted'], margins = True)
## Predicted     No  Yes    All
## Actual                      
## No         56855    7  56862
## Yes           26   74    100
## All        56881   81  56962
# Same counts as an array, with rows/columns ordered by the model's classes
pp = sklearn.metrics.confusion_matrix(y_tst_y, y_rfc_pred, 
                      labels = list(y_rfc.classes_))
if(False): print(pp)
# Each cell as a percentage of all Test rows
with np.printoptions(precision = 5, suppress = True):
    print(100 * pp/np.sum(pp))                    #Percent
## [[99.81216  0.01229]
##  [ 0.04564  0.12991]]
# Flatten row by row; with 'Yes' as the second label this gives tn, fp, fn, tp
tn, fp, fn, tp = pp.ravel()
(tn, fp, fn, tp)
## (56855, 7, 26, 74)
# Level counts of the actual Test response
pp = y_tst_y.values.astype('object')
pp.size
## 56962
y_unique, y_counts = np.unique(pp, return_counts = True)
print(np.asarray((y_unique, y_counts)).T)
## [['No' 56862]
##  ['Yes' 100]]
# Level counts of the predictions
qq = y_rfc_pred
qq.size
## 56962
y_unique, y_counts = np.unique(qq, return_counts = True)
print(np.asarray((y_unique, y_counts)).T)
## [['No' 56881]
##  ['Yes' 81]]
# Summary metrics; 'Yes' is treated as the positive class where one is needed
print('Accuracy (Avoid): ', sklearn.metrics.accuracy_score(
       y_tst_y, y_rfc_pred)) 
## Accuracy (Avoid):  0.999420666409185
print('Precision: ', sklearn.metrics.precision_score(
       y_tst_y, y_rfc_pred, pos_label = 'Yes')) #tp/(tp+fp)
## Precision:  0.9135802469135802
print('Recall (Sensitivity) (More Important): ', sklearn.metrics.recall_score(
       y_tst_y, y_rfc_pred, pos_label = 'Yes')) #tp/(tp+fn)
## Recall (Sensitivity) (More Important):  0.74
print('F1 (Avoid): ', sklearn.metrics.f1_score(
       y_tst_y, y_rfc_pred, pos_label = 'Yes')) #2*tp/(2*tp+fn+fp)
## F1 (Avoid):  0.8176795580110497
8.5 Confusion Matrix
- In this problem, high Precision means that few good transactions are flagged as fraudulent.
- High Recall means that few fraudulent transactions are missed, i.e. predicted as good.
R
# Confusion Matrix 
#     Actual Positive       Negative
#  Predicted
#  Positive  TP             FP (I)          Precision = TP /(TP+FP)
#  Negative  FN (II)        TN              NPV = TN /(TN+FN)
#            Sensitivity    Specificity     Accuracy
#            = TP /(TP+FN)  = TN /(TN+FP)   = (TP+TN) /N
8.6 Undersampling
- To be done only on the Training Data, not on the Test Data
- Near Miss refers to a group of undersampling strategies that pick samples based on the distance between majority and minority class instances.
Python
#def q_get_best_model(estimator, params, kf = kf, label = 'Yes', 
#      train_X = y_trn_x, train_Y = y_trn_y, test_X = y_tst_x, test_Y = y_tst_y, 
#      is_grid_search = True, sampling = NearMiss(), scoring='f1', n_jobs = -1):
#    if sampling is None:
#        # make the pipeline of only the estimator
#        pipeline = make_pipeline(estimator)
#    else:
#        # make the pipeline of over or undersampling and estimator
#        pipeline = make_pipeline(sampling, estimator)
#    # get the estimator name
#    estimator_name = estimator.__class__.__name__.lower()
#    # construct the parameters for grid/random search cv
#    new_params = {f'{estimator_name}__{key}': params[key] for key in params}
#    if is_grid_search:
#        # grid search instead of randomized search
#        search = sklearn.model_selection.GridSearchCV(pipeline, 
#            param_grid = new_params, cv = kf, 
#            return_train_score = True, n_jobs = n_jobs, verbose = 0)
#    else:
#        # randomized search
#        search = sklearn.model_selection.RandomizedSearchCV(pipeline, 
#            param_distributions = new_params, cv = kf, scoring = scoring, 
#            return_train_score = True, n_jobs = n_jobs, verbose = 0)
#    # fit the model
#    search.fit(train_X, train_Y)
#    cv_score = cross_val_score(search, train_X, train_Y, 
#                               scoring = scoring, cv = kf)
#    # make predictions on the test data
#    y_pred = search.best_estimator_.named_steps[estimator_name].predict(test_X)
#    # calculate the metrics: recall, accuracy, F1 score, etc.
#    recall = sklearn.metrics.recall_score(test_Y, y_pred, pos_label = label)
#    precision = sklearn.metrics.precision_score(test_Y, y_pred, pos_label = label)
#    
#    print('Hi') #xxxx
#    
#    accuracy = sklearn.metrics.accuracy_score(test_Y, y_pred)
#    f1 = sklearn.metrics.f1_score(test_Y, y_pred, pos_label = label)
#    y_proba = search.best_estimator_.named_steps[estimator_name].predict_proba(test_X)[::, 1]
#    fpr, tpr, _ = roc_curve(test_Y, y_proba)
#    auc = roc_auc_score(test_Y, y_proba)
#    # return the best estimator along with the metrics
#    return({
#        'best_estimator': search.best_estimator_,
#        'estimator_name': estimator_name,
#        'cv_score': cv_score,
#        'recall': recall,
#        'precision': precision,
#        'accuracy': accuracy,
#        'f1_score': f1,
#        'fpr': fpr,
#        'tpr': tpr,
#        'auc': auc,
#    })
#
8.7 Model
Python
if(False):
    # Cross Validation Framework 
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import GridSearchCV, cross_val_score, RandomizedSearchCV
    #kf = StratifiedKFold(n_splits = 5, random_state = None, shuffle = False)
    # Imbalance
    from imblearn.pipeline import make_pipeline
    from imblearn.under_sampling import NearMiss
    from imblearn.over_sampling import SMOTE
    # Metrics
    from sklearn.metrics import roc_curve, roc_auc_score
    # Classifiers
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
8.8 Tuning
Python
# Candidate values for each Random Forest hyper-parameter. The wider
# alternatives that are currently switched off are noted next to each one.

# Trees in the forest
#   alt: np.linspace(100, 3000, int((3000-100)/200) + 1, dtype=int)
n_estimators = np.array([100])

# Features considered at every split | alt: ['auto', 'sqrt']
max_features = ['sqrt']

# Levels allowed in a tree
max_depth = list(range(1, 4))

# Samples needed before a node may be split
#   alt: [int(x) for x in np.linspace(start = 2, stop = 10, num = 9)]
min_samples_split = list(range(2, 4))

# Samples needed at each leaf node
min_samples_leaf = list(range(1, 3))

# Sampling method for the rows that train each tree | alt: [True, False]
bootstrap = [True]

# Split quality criterion | alt: ['gini', 'entropy']
criterion = ['gini']
random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}
8.9 Small Test Case
Python
# Base estimator for the search. It is seeded like the other models in this
# file so that repeated searches grow the same forests.
y_rfc_base = sklearn.ensemble.RandomForestClassifier(random_state = 3)
if(False):
    # Small trial of the randomized search: 3 sampled settings, 2 folds each
    y_rfc_cv_pre = sklearn.model_selection.RandomizedSearchCV(
                estimator = y_rfc_base, param_distributions = random_grid,
                n_iter = 3, cv = 2, verbose = 0, random_state = 3, n_jobs = -1)
    y_rfc_cv_pre.fit(y_trn_x, y_trn_y)
    print(y_rfc_cv_pre.best_params_)
R
# Report the Python installation used by this page
# (py_config() is presumably reticulate's - confirm against the setup chunk)
if(TRUE) py_config()         #Python Configuration
## python:         C:/Softwares/Python/Python312/python.exe
## libpython:      C:/Softwares/Python/Python312/python312.dll
## pythonhome:     C:/Softwares/Python/Python312
## version:        3.12.1 (tags/v3.12.1:2305ca5, Dec  7 2023, 22:03:25) [MSC v.1937 64 bit (AMD64)]
## Architecture:   64bit
## numpy:          C:/Softwares/Python/Python312/Lib/site-packages/numpy
## numpy_version:  1.26.3
## 
## NOTE: Python version was forced by use_python() function
# Page housekeeping helpers, switched off; q_url and q_() are defined elsewhere
if(FALSE) q_url[ , 'URL']     #List of URL of this Page
if(FALSE) q_()                #R Objects of this Page excluding 'q_*'