The objective of this analysis is to develop a robust and accurate machine learning model for detecting credit card fraud: predicting whether a given credit card transaction is fraudulent from a set of features extracted from transaction data. This is a supervised (binary) classification task.
"""!mkdir {the project directory on your pc}/kaggle
!cp kaggle.json {the project directory on your pc}/kaggle/
!chmod 600 {the project directory on your pc}/kaggle/kaggle.json
!kaggle datasets download -d mlg-ulb/creditcardfraud
!unzip creditcardfraud.zip"""
# create the project directory on your pc
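# note (assumption): by default the kaggle CLI looks for its API token in ~/.kaggle/kaggle.json; if you
# keep kaggle.json in a custom directory as above, point the CLI there before running it, e.g.:
# import os
# os.environ["KAGGLE_CONFIG_DIR"] = "{the project directory on your pc}/kaggle"  # same placeholder path as above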
import warnings
warnings.filterwarnings("ignore") # silence library warnings to keep the notebook output clean
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
transactions = pd.read_csv("creditcard.csv")
transactions.head()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
transactions.describe()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.000000 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | ... | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 2.848070e+05 | 284807.000000 | 284807.000000 |
mean | 94813.859575 | 1.168375e-15 | 3.416908e-16 | -1.379537e-15 | 2.074095e-15 | 9.604066e-16 | 1.487313e-15 | -5.556467e-16 | 1.213481e-16 | -2.406331e-15 | ... | 1.654067e-16 | -3.568593e-16 | 2.578648e-16 | 4.473266e-15 | 5.340915e-16 | 1.683437e-15 | -3.660091e-16 | -1.227390e-16 | 88.349619 | 0.001727 |
std | 47488.145955 | 1.958696e+00 | 1.651309e+00 | 1.516255e+00 | 1.415869e+00 | 1.380247e+00 | 1.332271e+00 | 1.237094e+00 | 1.194353e+00 | 1.098632e+00 | ... | 7.345240e-01 | 7.257016e-01 | 6.244603e-01 | 6.056471e-01 | 5.212781e-01 | 4.822270e-01 | 4.036325e-01 | 3.300833e-01 | 250.120109 | 0.041527 |
min | 0.000000 | -5.640751e+01 | -7.271573e+01 | -4.832559e+01 | -5.683171e+00 | -1.137433e+02 | -2.616051e+01 | -4.355724e+01 | -7.321672e+01 | -1.343407e+01 | ... | -3.483038e+01 | -1.093314e+01 | -4.480774e+01 | -2.836627e+00 | -1.029540e+01 | -2.604551e+00 | -2.256568e+01 | -1.543008e+01 | 0.000000 | 0.000000 |
25% | 54201.500000 | -9.203734e-01 | -5.985499e-01 | -8.903648e-01 | -8.486401e-01 | -6.915971e-01 | -7.682956e-01 | -5.540759e-01 | -2.086297e-01 | -6.430976e-01 | ... | -2.283949e-01 | -5.423504e-01 | -1.618463e-01 | -3.545861e-01 | -3.171451e-01 | -3.269839e-01 | -7.083953e-02 | -5.295979e-02 | 5.600000 | 0.000000 |
50% | 84692.000000 | 1.810880e-02 | 6.548556e-02 | 1.798463e-01 | -1.984653e-02 | -5.433583e-02 | -2.741871e-01 | 4.010308e-02 | 2.235804e-02 | -5.142873e-02 | ... | -2.945017e-02 | 6.781943e-03 | -1.119293e-02 | 4.097606e-02 | 1.659350e-02 | -5.213911e-02 | 1.342146e-03 | 1.124383e-02 | 22.000000 | 0.000000 |
75% | 139320.500000 | 1.315642e+00 | 8.037239e-01 | 1.027196e+00 | 7.433413e-01 | 6.119264e-01 | 3.985649e-01 | 5.704361e-01 | 3.273459e-01 | 5.971390e-01 | ... | 1.863772e-01 | 5.285536e-01 | 1.476421e-01 | 4.395266e-01 | 3.507156e-01 | 2.409522e-01 | 9.104512e-02 | 7.827995e-02 | 77.165000 | 0.000000 |
max | 172792.000000 | 2.454930e+00 | 2.205773e+01 | 9.382558e+00 | 1.687534e+01 | 3.480167e+01 | 7.330163e+01 | 1.205895e+02 | 2.000721e+01 | 1.559499e+01 | ... | 2.720284e+01 | 1.050309e+01 | 2.252841e+01 | 4.584549e+00 | 7.519589e+00 | 3.517346e+00 | 3.161220e+01 | 3.384781e+01 | 25691.160000 | 1.000000 |
8 rows × 31 columns
transactions.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64
dtypes: float64(30), int64(1)
memory usage: 67.4 MB
balance = transactions['Class'].value_counts()
num_samp = len(transactions.index)
print(f"normal transactions, {balance.values[0]}, are {balance.values[0]/num_samp*100:.4f}% of the total samples which is {num_samp}.")
print(f"fraud transactions, {balance.values[1]}, are {balance.values[1]/num_samp*100:.4f}% of the total samples which is {num_samp}.")
normal transactions, 284315, are 99.8273% of the total samples which is 284807. fraud transactions, 492, are 0.1727% of the total samples which is 284807.
class_counts = transactions["Class"].value_counts()
plt.figure(figsize=(8, 6))
sns.barplot(x=class_counts.index, y=class_counts.values, palette='viridis')
plt.title("Class Distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()
transactions.iloc[:, :-1].hist(bins=50, figsize=(24, 16)) # all columns except Class, whose distribution was shown above
plt.show()
from sklearn.model_selection import train_test_split
unused_transactions_train, test = train_test_split(
transactions, test_size=0.2, stratify=transactions["Class"], random_state=42)
# this is not a purely random split: stratifying on "Class" keeps the fraud/normal ratio identical in both splits (a quick check is sketched below)
# unused_transactions_train won't itself be used for training; see the undersampling comments below.
# this test set will be used to evaluate the final models on realistically imbalanced data; see the undersampling comments below.
# to deal with a highly imbalanced dataset, we build a training set with an equal number of samples from both classes.
# note that the test set is split off before undersampling, because in the end we want to measure performance on the original distribution.
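# quick sanity check (illustrative): stratification should reproduce the ~0.17% fraud rate in both splits
print("full :", transactions["Class"].mean())
print("train:", unused_transactions_train["Class"].mean())
print("test :", test["Class"].mean())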
# step 1. isolate the fraud transactions (Class == 1)
fraud_transactions = transactions[transactions["Class"] == 1]
# step 2. we have 492 fraud transactions, so we randomly sample 492 normal transactions to form a balanced dataset
# (caveat: sampling from the full dataset means some test-set rows can reappear in training; a stricter setup would sample from the training split only)
normal_transactions = transactions[transactions["Class"] == 0]
normal_transactions = normal_transactions.sample(n=492, replace=False, random_state=42)
print(f"fraud_transactions shape = {fraud_transactions.shape}, normal_transactions shape = {normal_transactions.shape}")
fraud_transactions shape = (492, 31), normal_transactions shape = (492, 31)
# Good!
# we just need to concatenate these two datasets to get a balanced dataset
transactions = pd.concat([fraud_transactions, normal_transactions])
transactions.shape
(984, 31)
# Great!
# now we have a balanced dataset (an equivalent one-step alternative is sketched below)
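# the same undersampling can be done in one step with the imbalanced-learn package (a sketch, assuming
# imbalanced-learn is installed; not run here because `transactions` has already been reassigned):
# from imblearn.under_sampling import RandomUnderSampler
# X_bal, y_bal = RandomUnderSampler(random_state=42).fit_resample(X, y)  # X, y = original features/labels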
# we need a train set
transactions_train, unused_test = train_test_split(
transactions, test_size=0.2, random_state=42)
transactions = transactions_train.copy() # to make naming clear, transactions is the train set
transactions.shape
(787, 31)
transactions.info()
# no missing values
# no categorical variables
<class 'pandas.core.frame.DataFrame'>
Index: 787 entries, 148628 to 33276
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Time    787 non-null    float64
 1   V1      787 non-null    float64
 2   V2      787 non-null    float64
 3   V3      787 non-null    float64
 4   V4      787 non-null    float64
 5   V5      787 non-null    float64
 6   V6      787 non-null    float64
 7   V7      787 non-null    float64
 8   V8      787 non-null    float64
 9   V9      787 non-null    float64
 10  V10     787 non-null    float64
 11  V11     787 non-null    float64
 12  V12     787 non-null    float64
 13  V13     787 non-null    float64
 14  V14     787 non-null    float64
 15  V15     787 non-null    float64
 16  V16     787 non-null    float64
 17  V17     787 non-null    float64
 18  V18     787 non-null    float64
 19  V19     787 non-null    float64
 20  V20     787 non-null    float64
 21  V21     787 non-null    float64
 22  V22     787 non-null    float64
 23  V23     787 non-null    float64
 24  V24     787 non-null    float64
 25  V25     787 non-null    float64
 26  V26     787 non-null    float64
 27  V27     787 non-null    float64
 28  V28     787 non-null    float64
 29  Amount  787 non-null    float64
 30  Class   787 non-null    int64
dtypes: float64(30), int64(1)
memory usage: 196.8 KB
transactions.duplicated().sum()
# 12 samples are duplicated
12
transactions.drop_duplicates(inplace = True)
transactions.shape
(775, 31)
transactions.head()
| | Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
148628 | 90086.0 | -2.178245 | -0.911402 | 0.843247 | -0.354214 | 3.269231 | -1.920507 | -0.295878 | 0.030576 | 0.952497 | ... | -0.357689 | -1.501182 | 0.076067 | 0.294249 | 0.684123 | 0.183607 | -0.160125 | 0.099012 | 1.98 | 0 |
84939 | 60530.0 | 1.496612 | -1.450776 | 0.511173 | -1.612566 | -1.265303 | 0.956041 | -1.693659 | 0.287020 | -1.150133 | ... | -0.295541 | -0.355211 | -0.172903 | -1.402114 | 0.436249 | -0.107190 | 0.068504 | 0.007694 | 30.97 | 0 |
154286 | 101051.0 | -1.465316 | -1.093377 | -0.059768 | 1.064785 | 11.095089 | -5.430971 | -9.378025 | -0.446456 | 1.992110 | ... | 1.160623 | -1.259697 | -15.981649 | -0.883670 | -3.536716 | -0.592965 | 0.675525 | 0.424849 | 0.92 | 1 |
203702 | 134928.0 | 0.873930 | -2.379076 | -1.465831 | 0.527825 | -0.778962 | 0.288092 | 0.189899 | -0.143854 | 0.998534 | ... | 0.502168 | 0.157962 | -0.415068 | 0.123369 | -0.451026 | 0.594766 | -0.157082 | 0.059854 | 628.50 | 0 |
178971 | 123866.0 | 0.958878 | -3.119481 | -2.144711 | -0.363285 | -0.990840 | 0.348700 | 0.028338 | -0.120619 | 0.017107 | ... | 0.842737 | 1.200739 | -0.750967 | 0.317934 | 0.269758 | 0.207906 | -0.149787 | 0.029467 | 650.00 | 0 |
5 rows × 31 columns
# Time feature -> StandardScaler -> zero mean, unit variance
# Amount feature -> RobustScaler -> centers on the median and scales by the IQR, so it deals better with outliers
# (a small outlier demo follows the histograms below)
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
scaler1 = StandardScaler()
scaler2 = RobustScaler()
transactions["Time"] = scaler1.fit_transform(transactions[["Time"]])
transactions["Amount"] = scaler2.fit_transform(transactions[["Amount"]])
axes = transactions.loc[:,["Time", "Amount"]].hist(bins=50, figsize=(30, 4))
for ax, column in zip(axes.flatten(), ["Time", "Amount"]):
    ax.set_xlabel("Value")
    ax.set_ylabel("Frequency")
    ax.set_title(f"{column} Distribution")
plt.show()
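# a tiny illustration (made-up numbers) of why RobustScaler handles outliers better: it centers on the
# median and scales by the IQR, so a single extreme value barely shifts the scaled bulk of the data
demo = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])  # 1000.0 is an outlier
print("standard:", StandardScaler().fit_transform(demo).ravel())
print("robust  :", RobustScaler().fit_transform(demo).ravel())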
# let's measure correlations
corr_matrix = transactions.corr()
# Let's visualize correlations
plt.figure(figsize=(30, 6))
corr_matrix["Class"].sort_values(ascending=False).plot(kind="bar", color="green")
plt.title("Correlation of fraud/normal transactions with other attributes", fontsize=16)
plt.xlabel("Features")
plt.ylabel("Correlation")
plt.xticks(rotation=45, ha="right")
plt.show()
# V25 has almost no correlation with the transaction being fraud or normal, so it can be dropped.
transactions.drop(columns="V25", inplace=True)
transactions.shape
# now we have 30 columns left (29 features plus the Class target)
(775, 30)
# check for highly correlated feature pairs, so that only one feature per correlated group is kept
corr_matrix = transactions.corr()
features_corr = corr_matrix.iloc[:-1, :-1]
target_corr = corr_matrix["Class"]
high_corrs = set()
for feature in features_corr.columns: # find the features highly correlated with the current feature
    cols = features_corr.index[abs(features_corr[feature]) > 0.8]
    for col in cols:
        if col != feature and col not in high_corrs:
            high_corrs.add(col)
high_corrs = list(high_corrs)
high_corrs.sort()
high_corrs_dict = {} # maps each highly correlated feature to its correlation with the target (Class)
for feature in high_corrs:
    high_corrs_dict[feature] = target_corr[feature]
high_corrs_dict
{'V1': -0.4288524345143442, 'V10': -0.6301350744752954, 'V11': 0.6821940358206778, 'V12': -0.6716752429836114, 'V14': -0.7512824551766185, 'V16': -0.5840199349640787, 'V17': -0.5405735207869032, 'V18': -0.43556691503776856, 'V2': 0.4975663716987832, 'V3': -0.5688970132071082, 'V4': 0.712824013610517, 'V5': -0.35643919928264417, 'V7': -0.46906298705274524, 'V9': -0.5547230271748819}
# within each group of highly correlated features, keep only the one with the greatest absolute correlation with the target
keep = []
seen = set() # tracks features already assigned to a correlated group, so each group is processed only once
for feature in high_corrs_dict.keys():
    if feature in seen: # this feature already belongs to a group processed earlier; skip it
        continue
    greatest = feature
    corr_features = features_corr.index[abs(features_corr[feature]) > 0.8].tolist()
    corr_features.remove(feature) # the other members of this feature's correlated group
    for f in corr_features:
        seen.add(f)
        if abs(high_corrs_dict[f]) > abs(high_corrs_dict[greatest]): # stronger absolute target correlation wins
            greatest = f
    if greatest not in keep:
        keep.append(greatest)
len(keep)
# great! among the 14 highly correlated features, 4 carry the most influence on the target (Class: fraud vs. normal); the rest are redundant (a compact pair-listing sketch follows the output below)
4
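# an equivalent, more compact way (a sketch) to list the highly correlated pairs, using the upper triangle of the correlation matrix
mask = np.triu(np.ones(features_corr.shape, dtype=bool), k=1)
pairs = features_corr.where(mask).stack()  # Series indexed by (feature, feature) pairs
print(pairs[pairs.abs() > 0.8].sort_values())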
# the columns of interest are the kept columns (selected from the correlation groups above) plus the columns that were not highly correlated with any other feature
unique_cols = set(transactions.columns) - set(high_corrs_dict.keys())
cols = keep + list(unique_cols)
len(cols)
# we've reduced the dataset to 20 columns (19 features plus the Class target)
20
transactions = transactions[cols]
transactions.shape
(775, 20)
# let's visualize correlations again
corr_matrix = transactions.corr()
plt.figure(figsize=(30, 6))
corr_matrix["Class"].sort_values(ascending=False).plot(kind="bar", color="green")
plt.title("Correlation of fraud/normal transactions with non-redundant and influential attributes", fontsize=16)
plt.xlabel("Features")
plt.ylabel("Correlation")
plt.xticks(rotation=45, ha="right")
plt.show()
# datasets to feed the models
transactions_train.drop_duplicates(inplace = True) # drop duplicate rows
# y
transactions_target = transactions_train.pop("Class") # separate the target feature
# Xs
cols.remove("Class") # Class is the target, so it is dropped from the feature list
transactions = transactions_train[cols] # keep only the columns of interest selected above
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
standard = ["Time"]
robust = ["Amount"]
preprocess_cols = standard+robust
no_process = list(transactions.columns)
for col in preprocess_cols:
    no_process.remove(col)
preprocessing = ColumnTransformer([
("standard", StandardScaler(), standard),
("robust", RobustScaler(), robust),
("passthrough", "passthrough", no_process),
])
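# quick shape check (illustrative): the transformer emits one column per listed feature
print(preprocessing.fit_transform(transactions).shape)  # expect (775, 19): 2 scaled columns + 17 passthrough columns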
# building the model using pipeline
from sklearn.linear_model import LogisticRegression
log_reg = Pipeline([
("preprocessing", preprocessing),
("logistic_regression", LogisticRegression()),
])
# train the model
log_reg.fit(transactions, transactions_target)
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('standard', StandardScaler(), ['Time']),
                                                 ('robust', RobustScaler(), ['Amount']),
                                                 ('passthrough', 'passthrough',
                                                  ['V3', 'V4', 'V14', 'V16', 'V19', 'V22', 'V24', 'V21', 'V27', 'V6', 'V13', 'V15', 'V23', 'V28', 'V20', 'V26', 'V8'])])),
                ('logistic_regression', LogisticRegression())])
# calculate evaluation metric using precision and recall
from sklearn.metrics import classification_report
log_predictions = log_reg.predict(transactions)
print(classification_report(transactions_target, log_predictions, target_names = ["Not Fraud", "Fraud"]))
              precision    recall  f1-score   support

   Not Fraud       0.92      0.97      0.94       394
       Fraud       0.97      0.91      0.94       381

    accuracy                           0.94       775
   macro avg       0.94      0.94      0.94       775
weighted avg       0.94      0.94      0.94       775
# (note: these scores are measured on the training set itself; the cross-validation below gives a fairer estimate)
# not fraud precision: of all transactions predicted normal, 92% are actually normal and 8% are fraud (incorrectly labeled as normal)
# not fraud recall: of all actually normal transactions, 97% are predicted correctly and 3% are missed (normal but predicted as fraud)
# fraud precision: of all transactions predicted fraud, 97% are actually fraud and 3% are normal (incorrectly labeled as fraud)
# fraud recall: of all actual fraud transactions, 91% are caught and 9% are missed (fraud but predicted as normal)
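# the raw counts behind those rates (a sketch): the confusion matrix of the training predictions
from sklearn.metrics import confusion_matrix
print(confusion_matrix(transactions_target, log_predictions))  # rows = actual [normal, fraud], columns = predicted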
# cross-validation (with an integer cv and a classifier, cross_val_score uses stratified folds by default)
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(log_reg, transactions, transactions_target, cv=5, scoring='f1')
# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
# Print mean and standard deviation of cross-validation scores
print("Mean CV F1-Score of logistic regression:", mean_cv_score)
print("Standard Deviation of CV F1-Scores of logistic regression:", std_cv_score)
Mean CV F1-Score of logistic regression: 0.9377893960123185
Standard Deviation of CV F1-Scores of logistic regression: 0.015833081116849963
# building the model using pipeline
from sklearn.neighbors import KNeighborsClassifier
knn = Pipeline([
("preprocessing", preprocessing),
("KNN", KNeighborsClassifier()),
])
# train the model
knn.fit(transactions, transactions_target)
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('standard', StandardScaler(), ['Time']),
                                                 ('robust', RobustScaler(), ['Amount']),
                                                 ('passthrough', 'passthrough',
                                                  ['V3', 'V4', 'V14', 'V16', 'V19', 'V22', 'V24', 'V21', 'V27', 'V6', 'V13', 'V15', 'V23', 'V28', 'V20', 'V26', 'V8'])])),
                ('KNN', KNeighborsClassifier())])
# calculate evaluation metric using precision and recall
knn_predictions = knn.predict(transactions)
print(classification_report(transactions_target, knn_predictions, target_names = ["Not Fraud", "Fraud"]))
              precision    recall  f1-score   support

   Not Fraud       0.91      0.98      0.95       394
       Fraud       0.98      0.90      0.94       381

    accuracy                           0.94       775
   macro avg       0.95      0.94      0.94       775
weighted avg       0.95      0.94      0.94       775
# not fraud precision: of all transactions predicted normal, 91% are actually normal and 9% are fraud (incorrectly labeled as normal)
# not fraud recall: of all actually normal transactions, 98% are predicted correctly and 2% are missed (normal but predicted as fraud)
# fraud precision: of all transactions predicted fraud, 98% are actually fraud and 2% are normal (incorrectly labeled as fraud)
# fraud recall: of all actual fraud transactions, 90% are caught and 10% are missed (fraud but predicted as normal)
# cross-validation
cv_scores = cross_val_score(knn, transactions, transactions_target, cv=5, scoring='f1')
# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
# Print mean and standard deviation of cross-validation scores
print("Mean CV F1-Score of KNN:", mean_cv_score)
print("Standard Deviation of CV F1-Scores of KNN:", std_cv_score)
Mean CV F1-Score of KNN: 0.9107122762295177
Standard Deviation of CV F1-Scores of KNN: 0.017907125931469123
# building the model using pipeline
from sklearn.svm import SVC
svm = Pipeline([
("preprocessing", preprocessing),
("svm", SVC(kernel='linear')),
])
# train the model
svm.fit(transactions, transactions_target)
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('standard', StandardScaler(), ['Time']),
                                                 ('robust', RobustScaler(), ['Amount']),
                                                 ('passthrough', 'passthrough',
                                                  ['V3', 'V4', 'V14', 'V16', 'V19', 'V22', 'V24', 'V21', 'V27', 'V6', 'V13', 'V15', 'V23', 'V28', 'V20', 'V26', 'V8'])])),
                ('svm', SVC(kernel='linear'))])
# calculate evaluation metric using precision and recall
svm_predictions = svm.predict(transactions)
print(classification_report(transactions_target, svm_predictions, target_names = ["Not Fraud", "Fraud"]))
              precision    recall  f1-score   support

   Not Fraud       0.92      0.98      0.95       394
       Fraud       0.98      0.91      0.94       381

    accuracy                           0.95       775
   macro avg       0.95      0.95      0.95       775
weighted avg       0.95      0.95      0.95       775
# not fraud precision: of all transactions predicted normal, 92% are actually normal and 8% are fraud (incorrectly labeled as normal)
# not fraud recall: of all actually normal transactions, 98% are predicted correctly and 2% are missed (normal but predicted as fraud)
# fraud precision: of all transactions predicted fraud, 98% are actually fraud and 2% are normal (incorrectly labeled as fraud)
# fraud recall: of all actual fraud transactions, 91% are caught and 9% are missed (fraud but predicted as normal)
# cross-validation
cv_scores = cross_val_score(svm, transactions, transactions_target, cv=5, scoring='f1')
# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
# Print mean and standard deviation of cross-validation scores
print("Mean CV F1-Score of SVM:", mean_cv_score)
print("Standard Deviation of CV F1-Scores of SVM:", std_cv_score)
Mean CV F1-Score of SVM: 0.9344406220615431
Standard Deviation of CV F1-Scores of SVM: 0.018078289179674918
# building the model using pipeline
from sklearn.tree import DecisionTreeClassifier
tree_cls = Pipeline([
("preprocessing", preprocessing),
("tree_classification", DecisionTreeClassifier()),
])
# train the model
tree_cls.fit(transactions, transactions_target)
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('standard', StandardScaler(), ['Time']),
                                                 ('robust', RobustScaler(), ['Amount']),
                                                 ('passthrough', 'passthrough',
                                                  ['V3', 'V4', 'V14', 'V16', 'V19', 'V22', 'V24', 'V21', 'V27', 'V6', 'V13', 'V15', 'V23', 'V28', 'V20', 'V26', 'V8'])])),
                ('tree_classification', DecisionTreeClassifier())])
# calculate evaluation metric using precision and recall
tree_predictions = tree_cls.predict(transactions)
print(classification_report(transactions_target, tree_predictions, target_names = ["Not Fraud", "Fraud"]))
              precision    recall  f1-score   support

   Not Fraud       1.00      1.00      1.00       394
       Fraud       1.00      1.00      1.00       381

    accuracy                           1.00       775
   macro avg       1.00      1.00      1.00       775
weighted avg       1.00      1.00      1.00       775
# a perfect score on the training set is not plausible: an unconstrained decision tree simply memorizes the training data (overfitting); a depth-limited sketch follows
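# a common remedy (a sketch with assumed, untuned hyperparameter values): constrain the tree so it cannot memorize the training set
tree_cls_limited = Pipeline([
    ("preprocessing", preprocessing),
    ("tree_classification", DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=42)),
])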
# cross-validation
cv_scores = cross_val_score(tree_cls, transactions, transactions_target, cv=5, scoring='f1')
# Calculate mean and standard deviation of cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()
# Print mean and standard deviation of cross-validation scores
print("Mean CV Score:", mean_cv_score)
print("Standard Deviation of CV Scores:", std_cv_score)
Mean CV Score: 0.901025974025974 Standard Deviation of CV Scores: 0.02476900119304011
# by cross-validation, the decision tree is the weakest model
# among all models (logistic regression, KNN, SVM, and decision tree), the best cross-validated F1-score belongs to logistic regression
# plot ROC curves (note: built here from hard 0/1 predictions, which gives each model a single operating point; a score-based sketch follows the plot)
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
log_fpr, log_tpr, log_thresold = roc_curve(transactions_target, log_predictions)
knear_fpr, knear_tpr, knear_threshold = roc_curve(transactions_target, knn_predictions)
svc_fpr, svc_tpr, svc_threshold = roc_curve(transactions_target, svm_predictions)
tree_fpr, tree_tpr, tree_threshold = roc_curve(transactions_target, tree_predictions)
def graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr):
    plt.figure(figsize=(16,8))
    plt.title('ROC Curves \n All 4 Classifiers', fontsize=18)
    plt.plot(log_fpr, log_tpr, label='Logistic Regression Classifier Score: {:.4f}'.format(roc_auc_score(transactions_target, log_predictions)))
    plt.plot(knear_fpr, knear_tpr, label='K-Nearest Neighbors Classifier Score: {:.4f}'.format(roc_auc_score(transactions_target, knn_predictions)))
    plt.plot(svc_fpr, svc_tpr, label='Support Vector Classifier Score: {:.4f}'.format(roc_auc_score(transactions_target, svm_predictions)))
    plt.plot(tree_fpr, tree_tpr, label='Decision Tree Classifier Score: {:.4f}'.format(roc_auc_score(transactions_target, tree_predictions)))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('ROC score of 50% \n (the random-guessing baseline)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                 arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                 )
    plt.legend()
graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr)
plt.show()
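# the curves above use hard 0/1 predictions, so each model contributes a single point. a sketch of the
# usual score-based ROC for the logistic regression model, using predicted probabilities:
log_scores = log_reg.predict_proba(transactions)[:, 1]  # predicted probability of the fraud class
score_fpr, score_tpr, _ = roc_curve(transactions_target, log_scores)
print("score-based AUC:", roc_auc_score(transactions_target, log_scores))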
# fine-tune the best-performing model, logistic regression
from sklearn.model_selection import GridSearchCV
log_reg = Pipeline([
("preprocessing", preprocessing),
("logistic_regression", LogisticRegression()),
]) # building the model using pipeline
param_grid = [
    {"logistic_regression__penalty": ['l1', 'l2'], "logistic_regression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
] # the logistic regression hyperparameters to search
# note: LogisticRegression's default lbfgs solver does not support the l1 penalty, so those combinations
# fail to fit and appear as NaN in the results below (a grid that searches l1 properly is sketched at the end of this section)
log_grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring='f1')
# train a logistic regression model with every combination of hyperparameters:
# 14 candidate models (7 values of C × 2 penalties), each trained 5 times, makes 70 rounds of training!
log_grid_search.fit(transactions, transactions_target)
GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('standard', StandardScaler(), ['Time']),
                                                                        ('robust', RobustScaler(), ['Amount']),
                                                                        ('passthrough', 'passthrough',
                                                                         ['V3', 'V4', 'V14', 'V16', 'V19', 'V22', 'V24', 'V21', 'V27', 'V6', 'V13', 'V15', 'V23', 'V28', 'V20', 'V26', 'V8'])])),
                                       ('logistic_regression', LogisticRegression())]),
             param_grid=[{'logistic_regression__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                          'logistic_regression__penalty': ['l1', 'l2']}],
             scoring='f1')
# the evaluation scores
cvres = log_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(f"{mean_score:.4f}", params) # mean CV F1 per combination (nan = the fit failed; see the solver note above)
nan {'logistic_regression__C': 0.001, 'logistic_regression__penalty': 'l1'}
0.9161 {'logistic_regression__C': 0.001, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 0.01, 'logistic_regression__penalty': 'l1'}
0.9334 {'logistic_regression__C': 0.01, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 0.1, 'logistic_regression__penalty': 'l1'}
0.9370 {'logistic_regression__C': 0.1, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 1, 'logistic_regression__penalty': 'l1'}
0.9378 {'logistic_regression__C': 1, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 10, 'logistic_regression__penalty': 'l1'}
0.9365 {'logistic_regression__C': 10, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 100, 'logistic_regression__penalty': 'l1'}
0.9353 {'logistic_regression__C': 100, 'logistic_regression__penalty': 'l2'}
nan {'logistic_regression__C': 1000, 'logistic_regression__penalty': 'l1'}
0.9353 {'logistic_regression__C': 1000, 'logistic_regression__penalty': 'l2'}
# the best hyperparameters for the logistic regression model are {'logistic_regression__C': 1, 'logistic_regression__penalty': 'l2'}
# the corresponding mean CV F1-score is 0.9377893960123185 — the same as the earlier cross-validation,
# since C=1 and penalty='l2' are LogisticRegression's defaults
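# to actually search the l1 penalty, pair it with a solver that supports it (a sketch; 'liblinear'
# handles both l1 and l2 and suits a dataset of this size):
param_grid_l1 = [
    {"logistic_regression__solver": ["liblinear"],
     "logistic_regression__penalty": ["l1", "l2"],
     "logistic_regression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
]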
test_target = test.pop("Class") # separate actual values
final_model = log_grid_search.best_estimator_ # the previously fine-tuned model
test_prediction = final_model.predict(test[cols])
print(classification_report(test_target, test_prediction, target_names = ["Not Fraud", "Fraud"]))
              precision    recall  f1-score   support

   Not Fraud       1.00      0.96      0.98     56864
       Fraud       0.04      0.91      0.07        98

    accuracy                           0.96     56962
   macro avg       0.52      0.93      0.53     56962
weighted avg       1.00      0.96      0.98     56962
# not fraud precision: of all transactions predicted normal, ~100% are actually normal
# not fraud recall: of all actually normal transactions, 96% are predicted correctly and 4% are missed (normal but predicted as fraud)
# fraud precision: of all transactions predicted fraud, only 4% are actually fraud; 96% are normal (incorrectly labeled as fraud)
# fraud recall: of all actual fraud transactions, 91% are caught and 9% are missed (fraud but predicted as normal)
# why fraud precision collapses: with only 98 frauds among 56,962 test transactions, a 4% false-positive rate
# on the ~56,864 normals produces roughly 2,300 false alarms against ~89 true detections, so precision ≈ 89 / (89 + 2300) ≈ 0.04.
# this is the price of training on a balanced (undersampled) set and evaluating on the original distribution (one mitigation is sketched below)
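# one mitigation (a sketch): keep the model but raise the decision threshold above the default 0.5,
# trading some fraud recall for higher precision on the imbalanced test distribution
from sklearn.metrics import precision_recall_curve
fraud_probs = final_model.predict_proba(test[cols])[:, 1]
precision, recall, thresholds = precision_recall_curve(test_target, fraud_probs)
# e.g. find the lowest threshold that reaches at least 50% precision (an assumed target, purely for illustration)
candidates = thresholds[precision[:-1] >= 0.5]
if len(candidates) > 0:
    print("threshold for >= 50% precision:", candidates.min())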