import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format

/kaggle/input/creditcardfraud/creditcard.csv

data = pd.read_csv('../input/creditcardfraud/creditcard.csv')
data.head()

data.shape

(284807, 31)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  int64  
dtypes: float64(30), int64(1)
memory usage: 67.4 MB

LABELS = ["Normal", "Fraud"]

count_classes = pd.value_counts(data['Class'], sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xticks(range(2), LABELS)
plt.xlabel("Class")
plt.ylabel("Frequency")

# Add annotations to the bars
for i, v in enumerate(count_classes):
    plt.text(i, v + 50, str(v), ha='center', va='bottom', fontsize=10)  # adjust the +50 if needed for better positioning

plt.show()

data.describe()

data[['Amount', 'Time']].describe()

data.isna().mean()*100

Time     0.00
V1       0.00
V2       0.00
V3       0.00
V4       0.00
V5       0.00
V6       0.00
V7       0.00
V8       0.00
V9       0.00
V10      0.00
V11      0.00
V12      0.00
V13      0.00
V14      0.00
V15      0.00
V16      0.00
V17      0.00
V18      0.00
V19      0.00
V20      0.00
V21      0.00
V22      0.00
V23      0.00
V24      0.00
V25      0.00
V26      0.00
V27      0.00
V28      0.00
Amount   0.00
Class    0.00
dtype: float64

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

def split_data(data):
    features = data.loc[:, :'Amount']
    target = data.loc[:, 'Class']

    X_temp, X_test, y_temp, y_test = train_test_split(features, target, test_size=0.20, random_state=2)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=2)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the original data
X_train_orig, X_val_orig, X_test_orig, y_train_orig, y_val_orig, y_test_orig = split_data(data)

print("Number of rows in X_train_orig:", X_train_orig.shape[0])
print("Number of rows in X_val_orig:", X_val_orig.shape[0])
print("Number of rows in X_test_orig:", X_test_orig.shape[0])
print("Number of rows in y_train_orig:", y_train_orig.shape[0])
print("Number of rows in y_val_orig:", y_val_orig.shape[0])
print("Number of rows in y_test_orig:", y_test_orig.shape[0])

Number of rows in X_train_orig: 170883
Number of rows in X_val_orig: 56962
Number of rows in X_test_orig: 56962
Number of rows in y_train_orig: 170883
Number of rows in y_val_orig: 56962
Number of rows in y_test_orig: 56962

from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Dataset 1: Feature selection using ANOVA on training data
best_features = SelectKBest(score_func=f_classif, k='all')
fit = best_features.fit(X_train_orig, y_train_orig)

featureScores = pd.DataFrame(data=fit.scores_, index=list(X_train_orig.columns), columns=['ANOVA Score'])
featureScores = featureScores.sort_values(ascending=True, by='ANOVA Score')

# Filtering columns with ANOVA score > 50
filtered_featureScores = featureScores[featureScores['ANOVA Score'] > 50]

# Using gradient coloring based on the ANOVA scores
colors = plt.cm.viridis(np.linspace(0, 1, len(filtered_featureScores)))

plt.figure(figsize=(10, 8))
bars = plt.barh(filtered_featureScores.index, filtered_featureScores['ANOVA Score'], color=colors)
plt.xlabel('ANOVA Score')
plt.title('Features with ANOVA Score > 50')
plt.gca().invert_yaxis()  # Highest scores at the top

# Adding the scores inside the bars
for bar in bars:
    width = bar.get_width()
    plt.text(width - 0.05 * width, bar.get_y() + bar.get_height()/2, 
             '{:.2f}'.format(width), 
             ha='center', va='center', color='white', fontsize=9)

# Adding a colorbar
sm = plt.cm.ScalarMappable(cmap="viridis", norm=plt.Normalize(vmin=filtered_featureScores['ANOVA Score'].min(), vmax=filtered_featureScores['ANOVA Score'].max()))
plt.colorbar(sm)

plt.show()

/tmp/ipykernel_32/2823017105.py:35: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes.
  plt.colorbar(sm)

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Train a Random Forest model
rf = RandomForestClassifier(n_estimators=25)
rf.fit(X_train_orig, y_train_orig)

# Convert feature importances into a DataFrame
feature_importance = pd.DataFrame(data=rf.feature_importances_, index=X_train_orig.columns, columns=['Feature Importance'])
feature_importance = feature_importance.sort_values(ascending=True, by='Feature Importance')

# Filter feature importances greater than 0.030
filtered_feature_importance = feature_importance[feature_importance['Feature Importance'] > 0.030]

# Using gradient coloring based on the feature importances
colors = plt.cm.viridis(np.linspace(0, 1, len(filtered_feature_importance)))

plt.figure(figsize=(10, 8))
bars = plt.barh(filtered_feature_importance.index, filtered_feature_importance['Feature Importance'], color=colors)
plt.xlabel('Feature Importance')
plt.title('Features with Importance => 0.030')
plt.gca().invert_yaxis()  # Highest importance at the top

# Adding the importance values inside the bars
for bar in bars:
    width = bar.get_width()
    plt.text(width - 0.05 * width, bar.get_y() + bar.get_height()/2, 
             '{:.3f}'.format(width), 
             ha='center', va='center', color='white', fontsize=9)

# Adding a colorbar
sm = plt.cm.ScalarMappable(cmap="viridis", norm=plt.Normalize(vmin=filtered_feature_importance['Feature Importance'].min(), vmax=filtered_feature_importance['Feature Importance'].max()))
plt.colorbar(sm)

plt.show()

/tmp/ipykernel_32/1400419571.py:34: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes.
  plt.colorbar(sm)

selected_features = featureScores.index[:20]  # Get the top 20 feature names
X_train_df2 = X_train_orig[selected_features]
y_train_df2 = y_train_orig.copy()

# Apply the same transformation to validation and test sets
X_val_df2 = X_val_orig[selected_features]
X_test_df2 = X_test_orig[selected_features]

# Now, concatenate the features and target to create df2 for training, validation, and test
df2_train = pd.concat([X_train_df2, y_train_df2], axis=1)
df2_val = pd.concat([X_val_df2, y_val_orig], axis=1)  # Assuming you have y_val_orig
df2_test = pd.concat([X_test_df2, y_test_orig], axis=1)  # Assuming you have y_test_orig
df2_train.head()

df2_test.head()

df2_val.head()

selected_features = feature_importance.index[:8]  # Get the top 8 feature names
X_train_df1 = X_train_orig[selected_features]
y_train_df1 = y_train_orig.copy()

# Apply the same transformation to validation and test sets
X_val_df1 = X_val_orig[selected_features]  # Assuming X_val_orig is your original validation dataset
X_test_df1 = X_test_orig[selected_features]  # Assuming X_test_orig is your original test dataset

# Now, concatenate the features and target to create df1 for training, validation, and test
df1_train = pd.concat([X_train_df1, y_train_df1], axis=1)
df1_val = pd.concat([X_val_df1, y_val_orig], axis=1)  # Assuming you have y_val_orig
df1_test = pd.concat([X_test_df1, y_test_orig], axis=1)  # Assuming you have y_test_orig

df1_train.head()

df1_test.head()

df1_val.head()

features_df1 = df1_train.iloc[:, :-1] 
target_df1 = df1_train.iloc[:, -1]     

features_df2 = df2_train.iloc[:, :-1]  
target_df2 = df2_train.iloc[:, -1]

features_val_df1 = df1_val.iloc[:, :-1]
target_val_df1 = df1_val.iloc[:, -1]

features_test_df1 = df1_test.iloc[:, :-1]
target_test_df1 = df1_test.iloc[:, -1]

features_val_df2 = df2_val.iloc[:, :-1]
target_val_df2 = df2_val.iloc[:, -1]

features_test_df2 = df2_test.iloc[:, :-1]
target_test_df2 = df2_test.iloc[:, -1]

import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, RocCurveDisplay, classification_report, precision_recall_curve
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

over = SMOTE(sampling_strategy=0.5)
under = RandomUnderSampler(sampling_strategy=0.1)

# Resampling df1_train
# Under-sample df1 training data
f1_train_under, t1_train_under = under.fit_resample(features_df1, target_df1)

# SMOTE df1 training data
f1_train_smote, t1_train_smote = over.fit_resample(features_df1, target_df1)

# Resampling df2_train
# Under-sample df2 training data
f2_train_under, t2_train_under = under.fit_resample(features_df2, target_df2)

# SMOTE df2 training data
f2_train_smote, t2_train_smote = over.fit_resample(features_df2, target_df2)

# Applying both undersampling and SMOTE on df1_train and df2_train
steps = [('under', under), ('over', over)]
pipeline = Pipeline(steps=steps)
f1_train_combined, t1_train_combined = pipeline.fit_resample(features_df1, target_df1)
f2_train_combined, t2_train_combined = pipeline.fit_resample(features_df2, target_df2)

print("Counts for t1_train after undersampling:", Counter(t1_train_under))
print("Counts for t1_train after SMOTE:", Counter(t1_train_smote))
print("Counts for t1_train after combined resampling:", Counter(t1_train_combined))
print("\nCounts for t2_train after undersampling:", Counter(t2_train_under))
print("Counts for t2_train after SMOTE:", Counter(t2_train_smote))
print("Counts for t2_train after combined resampling:", Counter(t2_train_combined))

Counts for t1_train after undersampling: Counter({0: 3060, 1: 306})
Counts for t1_train after SMOTE: Counter({0: 170577, 1: 85288})
Counts for t1_train after combined resampling: Counter({0: 3060, 1: 1530})

Counts for t2_train after undersampling: Counter({0: 3060, 1: 306})
Counts for t2_train after SMOTE: Counter({0: 170577, 1: 85288})
Counts for t2_train after combined resampling: Counter({0: 3060, 1: 1530})

classifier_lr = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)
classifier_gbm = GradientBoostingClassifier(random_state=0)
classifier_xgb = XGBClassifier(max_depth=4, random_state=0)
classifier_lgb = lgb.LGBMClassifier(max_depth=4, random_state=0)
classifier_knn = KNeighborsClassifier()

# List of classifiers and their names
classifiers = [classifier_lr, classifier_gbm, classifier_xgb, classifier_lgb, classifier_knn]
classifier_names = ["LogisticRegression", "GradientBoostingMachine", "XGBoost", "LightGBM", "KNN"]

# Using the training data for visualization
data_sets_dict = {
    "Dataset1 - UnderSampling": {
        'x_train': f1_train_under, 
        'y_train': t1_train_under
    },
    "Dataset1 - Combined": {
        'x_train': f1_train_combined, 
        'y_train': t1_train_combined
    },
    "Dataset2 - UnderSampling": {
        'x_train': f2_train_under, 
        'y_train': t2_train_under
    },
    "Dataset2 - Combined": {
        'x_train': f2_train_combined, 
        'y_train': t2_train_combined
    }
}

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report

# Define number of splits
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results = []

for classifier, classifier_name in zip(classifiers, classifier_names):
    for dataset_name, data in data_sets_dict.items():
        X_data = data['x_train']
        y_data = data['y_train']

        aucs = []
        precisions = []
        recalls = []
        f1_scores = []
        supports = []
        
        for train_index, val_index in skf.split(X_data, y_data):
            X_train_fold, X_val_fold = X_data.iloc[train_index], X_data.iloc[val_index]
            y_train_fold, y_val_fold = y_data.iloc[train_index], y_data.iloc[val_index]

            # Train the model
            classifier.fit(X_train_fold, y_train_fold)

            # Predict on the validation fold
            y_pred = classifier.predict(X_val_fold)
            y_pred_scores = classifier.predict_proba(X_val_fold)[:, 1]

            roc_auc = roc_auc_score(y_val_fold, y_pred_scores)
            report = classification_report(y_val_fold, y_pred, output_dict=True)

            aucs.append(roc_auc)
            precisions.append(report['macro avg']['precision'])
            recalls.append(report['macro avg']['recall'])
            f1_scores.append(report['macro avg']['f1-score'])
            supports.append(report['macro avg']['support'])

        # Average metrics across all folds
        avg_auc = sum(aucs) / n_splits
        avg_precision = sum(precisions) / n_splits
        avg_recall = sum(recalls) / n_splits
        avg_f1_score = sum(f1_scores) / n_splits
        avg_support = sum(supports) / n_splits

        results.append({
            'Classifier': classifier_name,
            'Dataset': dataset_name,
            'Avg_ROC_AUC': avg_auc,
            'Avg_Precision': avg_precision,
            'Avg_Recall': avg_recall,
            'Avg_F1_Score': avg_f1_score,
            'Avg_Support': avg_support,
            'Num_Columns': X_data.shape[1]  # Number of columns in the dataset
        })

results_df = pd.DataFrame(results)
display(results_df)

data_sets_dict_val = {
    "Dataset1 - UnderSampling": {
        'x_val': features_val_df1, 
        'y_val': target_val_df1
    },
    "Dataset1 - Combined": {
        'x_val': features_val_df1, 
        'y_val': target_val_df1
    },
    "Dataset2 - UnderSampling": {
        'x_val': features_val_df2, 
        'y_val': target_val_df2
    },
    "Dataset2 - Combined": {
        'x_val': features_val_df2, 
        'y_val': target_val_df2
    }
}

results_validation = []

for classifier, classifier_name in zip(classifiers, classifier_names):
    for dataset_name, data in data_sets_dict.items():
        X_train = data['x_train']
        y_train = data['y_train']

        # Train the classifier on the entire training dataset
        classifier.fit(X_train, y_train)

        # Retrieve corresponding validation data
        X_val = data_sets_dict_val[dataset_name]['x_val']
        y_val = data_sets_dict_val[dataset_name]['y_val']

        # Predict on the validation set
        y_pred = classifier.predict(X_val)
        y_pred_scores = classifier.predict_proba(X_val)[:, 1]

        roc_auc = roc_auc_score(y_val, y_pred_scores)
        report = classification_report(y_val, y_pred, output_dict=True)

        results_validation.append({
            'Classifier': classifier_name,
            'Dataset': dataset_name,
            'ROC_AUC': roc_auc,
            'Precision': report['macro avg']['precision'],
            'Recall': report['macro avg']['recall'],
            'F1-Score': report['macro avg']['f1-score'],
            'Support': report['macro avg']['support'],
            'Num_Columns': X_val.shape[1]  # Number of columns in the dataset
        })

results_validation_df = pd.DataFrame(results_validation)
display(results_validation_df)

for dataset_name, data in data_sets_dict.items():
    X_train = data['x_train']
    X_val = data_sets_dict_val[dataset_name]['x_val']
    print(f"For {dataset_name} - X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")

For Dataset1 - UnderSampling - X_train shape: (3366, 8), X_val shape: (56962, 8)
For Dataset1 - Combined - X_train shape: (4590, 8), X_val shape: (56962, 8)
For Dataset2 - UnderSampling - X_train shape: (3366, 20), X_val shape: (56962, 20)
For Dataset2 - Combined - X_train shape: (4590, 20), X_val shape: (56962, 20)

# Define the XGBoost and LightGBM classifiers
classifiers = [classifier_xgb, classifier_lgb]
classifier_names = ['XGBoost', 'LightGBM']

# Plotting setup
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))

for i, (classifier_name, classifier) in enumerate(zip(classifier_names, classifiers)):
    for j, (dataset_name, data) in enumerate(data_sets_dict.items()):
        # Train classifier on the current training dataset
        X_train = data['x_train']
        y_train = data['y_train']
        classifier.fit(X_train, y_train)
        
        # Get the corresponding validation data
        X_val = data_sets_dict_val[dataset_name]['x_val']
        y_val = data_sets_dict_val[dataset_name]['y_val']

        # Plot ROC curve
        RocCurveDisplay.from_estimator(classifier, X_val, y_val, ax=axes[i, j])
        axes[i, j].set_title(dataset_name)
        axes[i, j].legend(loc="lower right", fontsize="small")
        
    axes[i, 0].set_ylabel(classifier_name, size='large', rotation='vertical', verticalalignment='bottom')

plt.tight_layout()
plt.show()

import seaborn as sns
from sklearn.metrics import confusion_matrix

# Initialize the subplots for the classifiers and datasets
fig, axes = plt.subplots(nrows=len(classifiers), ncols=len(data_sets_dict), figsize=(20, 15))

for i, (classifier_name, classifier) in enumerate(zip(classifier_names, classifiers)):
    for j, (dataset_name, data) in enumerate(data_sets_dict.items()):
        X_train = data['x_train']
        y_train = data['y_train']
        classifier.fit(X_train, y_train)
        
        # Get the corresponding validation data
        X_val = data_sets_dict_val[dataset_name]['x_val']
        y_val = data_sets_dict_val[dataset_name]['y_val']

        y_pred = classifier.predict(X_val)
        cm = confusion_matrix(y_val, y_pred)

        # Plot confusion matrix using seaborn
        sns.heatmap(cm, annot=True, fmt='g', ax=axes[i, j], cmap='Blues', cbar=False)

        # Set titles and labels for each matrix
        axes[i, j].set_title(dataset_name)
        axes[i, j].set_xticklabels(['False', 'True'])
        axes[i, j].set_yticklabels(['False', 'True'])
        axes[i, j].set_xlabel("Predicted")
        axes[i, j].set_ylabel("Actual")

        # Set classifier name only for the first column
        if j == 0:
            axes[i, j].set_ylabel(f"{classifier_name}\n\nActual", fontsize='large')

plt.tight_layout()
plt.show()

%%capture 


from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb
import numpy as np
import warnings

# Ignore lightgbm warnings
warnings.filterwarnings("ignore")

# Hyperparameter ranges for LightGBM
lgbm_grid = {
    'num_leaves': [int(x) for x in np.linspace(5, 150, num=15)],
    'learning_rate': np.logspace(-3, 0, 10),
    'min_data_in_leaf': [int(x) for x in np.linspace(5, 150, num=15)],
    'feature_fraction': np.linspace(0.1, 1.0, 10),
    'bagging_fraction': np.linspace(0.1, 1.0, 10),
    'bagging_freq': [int(x) for x in np.linspace(1, 15, num=15)],
    'lambda_l1': np.logspace(-3, 3, 10),
    'lambda_l2': np.logspace(-3, 3, 10)
}

# Instantiate LGBM classifier
lgbm_classifier = lgb.LGBMClassifier(silent=True, verbosity=-1)

# Initialize RandomizedSearchCV
lgbm_random = RandomizedSearchCV(estimator=lgbm_classifier, param_distributions=lgbm_grid, 
                                 n_iter=150, cv=3, verbose=1, random_state=42, n_jobs=-1)

# Train for "Dataset1 - UnderSampling" using training data
X_train_dataset1 = data_sets_dict["Dataset1 - UnderSampling"]['x_train']
y_train_dataset1 = data_sets_dict["Dataset1 - UnderSampling"]['y_train']

lgbm_random.fit(X_train_dataset1, y_train_dataset1)
best_params_dataset1_lgbm = lgbm_random.best_params_
print("Best parameters for Dataset1 - UnderSampling with LGBM: ", best_params_dataset1_lgbm)

# Train for "Dataset2 - UnderSampling" using training data
X_train_dataset2 = data_sets_dict["Dataset2 - UnderSampling"]['x_train']
y_train_dataset2 = data_sets_dict["Dataset2 - UnderSampling"]['y_train']

lgbm_random.fit(X_train_dataset2, y_train_dataset2)
best_params_dataset2_lgbm = lgbm_random.best_params_
print("Best parameters for Dataset2 - UnderSampling with LGBM: ", best_params_dataset2_lgbm)

from xgboost import XGBClassifier

# Hyperparameter ranges for XGBoost
xgb_grid = {
    'learning_rate': np.logspace(-3, 0, 10),
    'max_depth': [int(x) for x in np.linspace(5, 150, num=15)],
    'subsample': np.linspace(0.1, 1.0, 10),
    'colsample_bytree': np.linspace(0.1, 1.0, 10),
    'gamma': np.linspace(0, 1, 10),
    'alpha': np.logspace(-3, 3, 10),
    'lambda': np.logspace(-3, 3, 10)
}

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier()

# Initialize RandomizedSearchCV
xgb_random = RandomizedSearchCV(estimator=xgb_classifier, param_distributions=xgb_grid, 
                                n_iter=150, cv=3, verbose=0, random_state=42, n_jobs=-1)

# Train for "Dataset1 - UnderSampling"
xgb_random.fit(X_train_dataset1, y_train_dataset1)
best_params_dataset1_xgb = xgb_random.best_params_
print("Dataset1 - UnderSampling için en iyi parametreler (XGBoost): ", best_params_dataset1_xgb)

# Train for "Dataset2 - UnderSampling"
xgb_random.fit(X_train_dataset2, y_train_dataset2)
best_params_dataset2_xgb = xgb_random.best_params_
print("Dataset2 - UnderSampling için en iyi parametreler (XGBoost): ", best_params_dataset2_xgb)

Dataset1 - UnderSampling için en iyi parametreler (XGBoost):  {'subsample': 1.0, 'max_depth': 67, 'learning_rate': 0.21544346900318823, 'lambda': 10.0, 'gamma': 0.2222222222222222, 'colsample_bytree': 0.7000000000000001, 'alpha': 0.46415888336127775}
Dataset2 - UnderSampling için en iyi parametreler (XGBoost):  {'subsample': 0.8, 'max_depth': 150, 'learning_rate': 0.21544346900318823, 'lambda': 10.0, 'gamma': 0.0, 'colsample_bytree': 0.7000000000000001, 'alpha': 0.021544346900318832}

# LightGBM for Dataset2 - UnderSampling
best_params_lgbm_dataset2 = {
    'num_leaves': 87,
    'min_data_in_leaf': 108,
    'learning_rate': 0.46415888336127775,
    'lambda_l2': 0.021544346900318832,
    'lambda_l1': 0.46415888336127775,
    'feature_fraction': 0.6,
    'bagging_freq': 8,
    'bagging_fraction': 1.0
}

lgbm_model_dataset2 = lgb.LGBMClassifier(**best_params_lgbm_dataset2)
lgbm_model_dataset2.fit(X_train_dataset2, y_train_dataset2)

LGBMClassifier(bagging_fraction=1.0, bagging_freq=8, feature_fraction=0.6,
               lambda_l1=0.46415888336127775, lambda_l2=0.021544346900318832,
               learning_rate=0.46415888336127775, min_data_in_leaf=108,
               num_leaves=87)

LGBMClassifier(bagging_fraction=1.0, bagging_freq=8, feature_fraction=0.6,
               lambda_l1=0.46415888336127775, lambda_l2=0.021544346900318832,
               learning_rate=0.46415888336127775, min_data_in_leaf=108,
               num_leaves=87)

# LightGBM for Dataset1 - UnderSampling
best_params_lgbm_dataset1 = {
    'num_leaves': 46,
    'min_data_in_leaf': 46,
    'learning_rate': 0.1,
    'lambda_l2': 2.154434690031882,
    'lambda_l1': 0.004641588833612777,
    'feature_fraction': 0.2,
    'bagging_freq': 4,
    'bagging_fraction': 0.6
}

lgbm_model_dataset1 = lgb.LGBMClassifier(**best_params_lgbm_dataset1)
lgbm_model_dataset1.fit(X_train_dataset1, y_train_dataset1)

LGBMClassifier(bagging_fraction=0.6, bagging_freq=4, feature_fraction=0.2,
               lambda_l1=0.004641588833612777, lambda_l2=2.154434690031882,
               min_data_in_leaf=46, num_leaves=46)

LGBMClassifier(bagging_fraction=0.6, bagging_freq=4, feature_fraction=0.2,
               lambda_l1=0.004641588833612777, lambda_l2=2.154434690031882,
               min_data_in_leaf=46, num_leaves=46)

import xgboost as xgb

# XGBoost for Dataset1 - UnderSampling
best_params_xgb_dataset1 = {
    'subsample': 0.9,
    'max_depth': 67,
    'learning_rate': 0.046415888336127774,
    'lambda': 10.0,
    'gamma': 0.0,
    'colsample_bytree': 0.5,
    'alpha': 0.004641588833612777
}

xgb_model_dataset1 = xgb.XGBClassifier(**best_params_xgb_dataset1)
xgb_model_dataset1.fit(X_train_dataset1, y_train_dataset1)

XGBClassifier(alpha=0.004641588833612777, base_score=None, booster=None,
              callbacks=None, colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, lambda=10.0,
              learning_rate=0.046415888336127774, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=67, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None, ...)

XGBClassifier(alpha=0.004641588833612777, base_score=None, booster=None,
              callbacks=None, colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, lambda=10.0,
              learning_rate=0.046415888336127774, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=67, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None, ...)

# XGBoost for Dataset2 - UnderSampling
best_params_xgb_dataset2 = {
    'subsample': 0.9,
    'max_depth': 36,
    'learning_rate': 0.46415888336127775,
    'lambda': 0.021544346900318832,
    'gamma': 0.0,
    'colsample_bytree': 0.9,
    'alpha': 0.004641588833612777
}

xgb_model_dataset2 = xgb.XGBClassifier(**best_params_xgb_dataset2)
xgb_model_dataset2.fit(X_train_dataset2, y_train_dataset2)

XGBClassifier(alpha=0.004641588833612777, base_score=None, booster=None,
              callbacks=None, colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, lambda=0.021544346900318832,
              learning_rate=0.46415888336127775, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=36, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None, ...)

XGBClassifier(alpha=0.004641588833612777, base_score=None, booster=None,
              callbacks=None, colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, lambda=0.021544346900318832,
              learning_rate=0.46415888336127775, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=36, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None, ...)

# Get columns from X_train after feature selection
selected_columns = X_train.columns

# Subset the validation dataset to only include these columns
X_val_selected = X_val_orig[selected_columns]

# List of tuned models:
tuned_models = [
    ('XGBoost (Trained on Dataset1 UnderSampling)', xgb_model_dataset1),
    ('XGBoost (Trained on Dataset2 UnderSampling)', xgb_model_dataset2),
    ('LightGBM (Trained on Dataset1 UnderSampling)', lgbm_model_dataset1),
    ('LightGBM (Trained on Dataset2 UnderSampling)', lgbm_model_dataset2)
]

results_val_tuned = pd.DataFrame()

for classifier_name, classifier in tuned_models:
    for dataset_name, data in data_sets_dict.items():
        if "UnderSampling" not in dataset_name:
            continue  # We only focus on undersampling datasets

        X_train = data['x_train']
        y_train = data['y_train']

        # Subset the validation dataset to match the columns in the current X_train
        selected_columns = X_train.columns
        X_val_selected = X_val_orig[selected_columns]

        # DEBUG: Print out columns to visually check
        print(f"Training columns for {classifier_name} on {dataset_name}: {X_train.columns}")
        print(f"Validation columns for {classifier_name} on {dataset_name}: {X_val_selected.columns}")

        # Predict on validation set
        classifier.fit(X_train, y_train)  # Fit the model
        y_pred_scores = classifier.predict_proba(X_val_selected)[:, 1]
        roc_auc = roc_auc_score(y_val_orig, y_pred_scores)
        y_pred = classifier.predict(X_val_selected)
        report = classification_report(y_val_orig, y_pred, output_dict=True)

        # Add results to the DataFrame
        df_temp = pd.DataFrame({
            'Classifier': [classifier_name],
            'Dataset': [dataset_name],
            'ROC_AUC_Score': [roc_auc],
            'Precision': [report['macro avg']['precision']],
            'Recall': [report['macro avg']['recall']],
            'F1-Score': [report['macro avg']['f1-score']],
            'Support': [report['macro avg']['support']]
        })
        results_val_tuned = pd.concat([results_val_tuned, df_temp], ignore_index=True)

display(results_val_tuned)

Training columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Validation columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Training columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Validation columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Training columns for XGBoost (Trained on Dataset2 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Validation columns for XGBoost (Trained on Dataset2 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Training columns for XGBoost (Trained on Dataset2 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Validation columns for XGBoost (Trained on Dataset2 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Training columns for LightGBM (Trained on Dataset1 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Validation columns for LightGBM (Trained on Dataset1 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Training columns for LightGBM (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Validation columns for LightGBM (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
[LightGBM] [Warning] lambda_l1 is set=0.004641588833612777, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.004641588833612777
[LightGBM] [Warning] bagging_fraction is set=0.6, subsample=1.0 will be ignored. Current value: bagging_fraction=0.6
[LightGBM] [Warning] lambda_l2 is set=2.154434690031882, reg_lambda=0.0 will be ignored. Current value: lambda_l2=2.154434690031882
[LightGBM] [Warning] feature_fraction is set=0.2, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.2
[LightGBM] [Warning] min_data_in_leaf is set=46, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=46
[LightGBM] [Warning] bagging_freq is set=4, subsample_freq=0 will be ignored. Current value: bagging_freq=4
Training columns for LightGBM (Trained on Dataset2 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
Validation columns for LightGBM (Trained on Dataset2 UnderSampling) on Dataset1 - UnderSampling: Index(['V28', 'Amount', 'V24', 'V25', 'V23', 'V19', 'V8', 'V5'], dtype='object')
[LightGBM] [Warning] lambda_l1 is set=0.46415888336127775, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.46415888336127775
[LightGBM] [Warning] bagging_fraction is set=1.0, subsample=1.0 will be ignored. Current value: bagging_fraction=1.0
[LightGBM] [Warning] lambda_l2 is set=0.021544346900318832, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.021544346900318832
[LightGBM] [Warning] feature_fraction is set=0.6, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.6
[LightGBM] [Warning] min_data_in_leaf is set=108, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=108
[LightGBM] [Warning] bagging_freq is set=8, subsample_freq=0 will be ignored. Current value: bagging_freq=8
Training columns for LightGBM (Trained on Dataset2 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Validation columns for LightGBM (Trained on Dataset2 UnderSampling) on Dataset2 - UnderSampling: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
[LightGBM] [Warning] lambda_l1 is set=0.46415888336127775, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.46415888336127775
[LightGBM] [Warning] bagging_fraction is set=1.0, subsample=1.0 will be ignored. Current value: bagging_fraction=1.0
[LightGBM] [Warning] lambda_l2 is set=0.021544346900318832, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.021544346900318832
[LightGBM] [Warning] feature_fraction is set=0.6, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.6
[LightGBM] [Warning] min_data_in_leaf is set=108, min_child_samples=20 will be ignored. Current value: min_data_in_leaf=108
[LightGBM] [Warning] bagging_freq is set=8, subsample_freq=0 will be ignored. Current value: bagging_freq=8

# List to store test results
results_test_tuned = pd.DataFrame()

# Using only the chosen classifier
classifier = xgb_model_dataset1  # XGBoost trained on Dataset1 UnderSampling

# Using only Dataset2 - UnderSampling for validation
data = data_sets_dict["Dataset2 - UnderSampling"]

# Instantiate a fresh copy of the model
classifier = classifier.__class__(**classifier.get_params())

X_train = data['x_train']
y_train = data['y_train']

# Fit the classifier on training data
classifier.fit(X_train, y_train)

# Subset the test dataset to match the columns in the current X_train
selected_columns = X_train.columns
X_test_selected = X_test_orig[selected_columns]

# Print columns for verification
print(f"Columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling:")
print("Train:", X_train.columns)
print("Test:", X_test_selected.columns)

# Check if columns are the same for X_train and X_test_selected
if list(X_train.columns) != list(X_test_selected.columns):
    raise ValueError(f"Feature mismatch for XGBoost (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling")

# Predict on test set
y_pred_scores = classifier.predict_proba(X_test_selected)[:, 1]
roc_auc = roc_auc_score(y_test_orig, y_pred_scores)
y_pred = classifier.predict(X_test_selected)
report = classification_report(y_test_orig, y_pred, output_dict=True)

# Store results in the DataFrame
df_temp = pd.DataFrame({
    'Classifier': ['XGBoost (Trained on Dataset1 UnderSampling)'],
    'Dataset': ['Dataset2 - UnderSampling'],
    'ROC_AUC_Score': [roc_auc],
    'Precision': [report['macro avg']['precision']],
    'Recall': [report['macro avg']['recall']],
    'F1-Score': [report['macro avg']['f1-score']],
    'Support': [report['macro avg']['support']]
})
results_test_tuned = pd.concat([results_test_tuned, df_temp], ignore_index=True)

display(results_test_tuned)

Columns for XGBoost (Trained on Dataset1 UnderSampling) on Dataset2 - UnderSampling:
Train: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')
Test: Index(['V22', 'V25', 'V26', 'V15', 'V13', 'V8', 'V23', 'V24', 'Amount', 'V28',
       'Time', 'V27', 'V20', 'V19', 'V21', 'V6', 'V2', 'V5', 'V9', 'V1'],
      dtype='object')

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, model_name, dataset_name):
    """ Plot the confusion matrix for a model's predictions. """
    # Get the confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Plot the heatmap
    plt.figure(figsize=(6,5))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
    
    # Add labels and title
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix for {model_name} on {dataset_name}')
    plt.show()

# Focusing on the chosen classifier
classifier = xgb_model_dataset1  # XGBoost trained on Dataset1 UnderSampling

# Using Dataset2 - UnderSampling for validation
data = data_sets_dict["Dataset2 - UnderSampling"]

# Instantiate a fresh copy of the model
classifier = classifier.__class__(**classifier.get_params())

# Train the classifier
X_train = data['x_train']
y_train = data['y_train']
classifier.fit(X_train, y_train)

# Subset the test dataset to match the columns in the current X_train
selected_columns = X_train.columns
X_test_selected = X_test_orig[selected_columns]

# Predict on the test set
y_pred = classifier.predict(X_test_selected)

# Plot the confusion matrix
plot_confusion_matrix(y_test_orig, y_pred, 'XGBoost (Trained on Dataset1 UnderSampling)', 'Dataset2 - UnderSampling')

def plot_feature_importance(importance, names, model_type, dataset_name, top_n=10):
    """ Plot feature importance for a given model. """
    assert len(importance) == len(names), f"Mismatch: {len(importance)} vs {len(names)}"

    # Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    # Create a DataFrame using a Dictionary
    data = {'feature_names': feature_names, 'feature_importance': feature_importance}
    fi_df = pd.DataFrame(data)

    # Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    
    # Plot feature importance (top_n features)
    plt.figure(figsize=(10,8))
    sns.barplot(x=fi_df['feature_importance'][:top_n], y=fi_df['feature_names'][:top_n])
    plt.title(f'{model_type} - Feature Importance on {dataset_name}')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')
    plt.show()

# Focus on the specific model and dataset
target_classifier_name = "XGBoost (Trained on Dataset1 UnderSampling)"
target_dataset_name = "Dataset2 - UnderSampling"

# Ensure the classifier and dataset are present in our models and data dicts
if target_classifier_name in [name for name, _ in tuned_models] and target_dataset_name in data_sets_dict:

    # Get the classifier object
    classifier = next((clf for name, clf in tuned_models if name == target_classifier_name), None)
    
    # Instantiate a fresh copy of the model
    classifier = classifier.__class__(**classifier.get_params())

    # Get the data
    data = data_sets_dict[target_dataset_name]
    X_train = data['x_train']
    y_train = data['y_train']
    classifier.fit(X_train, y_train)
    
    # Extract feature importances for the XGBoost model
    importance = classifier.feature_importances_
    
    # Plot the feature importance
    plot_feature_importance(importance, X_train.columns, target_classifier_name, target_dataset_name, top_n=10)
else:
    print(f"Either the classifier {target_classifier_name} or dataset {target_dataset_name} does not exist.")

from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 10)):
    """Generate a simple plot of the test and training learning curve"""
    plt.figure(figsize=(10,6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
    plt.legend(loc="best")
    return plt

# Selecting a smaller subset for speed. You can use the entire dataset if computation is not a concern.
X_small = X_train_orig.sample(frac=0.2, random_state=42)
y_small = y_train_orig[X_small.index]

target_classifier_name = "XGBoost (Trained on Dataset1 UnderSampling)"

# Ensure the classifier is present in our models
if target_classifier_name in [name for name, _ in tuned_models]:

    # Get the classifier object
    classifier = next((clf for name, clf in tuned_models if name == target_classifier_name), None)
    
    plot_learning_curve(classifier, f'Learning Curve of {target_classifier_name}', X_small, y_small, cv=5)
    plt.show()
else:
    print(f"The classifier {target_classifier_name} does not exist.")

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.00	-1.36	-0.07	2.54	1.38	-0.34	0.46	0.24	0.10	0.36	...	-0.02	0.28	-0.11	0.07	0.13	-0.19	0.13	-0.02	149.62
1	0.00	1.19	0.27	0.17	0.45	0.06	-0.08	-0.08	0.09	-0.26	...	-0.23	-0.64	0.10	-0.34	0.17	0.13	-0.01	0.01	2.69
2	1.00	-1.36	-1.34	1.77	0.38	-0.50	1.80	0.79	0.25	-1.51	...	0.25	0.77	0.91	-0.69	-0.33	-0.14	-0.06	-0.06	378.66
3	1.00	-0.97	-0.19	1.79	-0.86	-0.01	1.25	0.24	0.38	-1.39	...	-0.11	0.01	-0.19	-1.18	0.65	-0.22	0.06	0.06	123.50
4	2.00	-1.16	0.88	1.55	0.40	-0.41	0.10	0.59	-0.27	0.82	...	-0.01	0.80	-0.14	0.14	-0.21	0.50	0.22	0.22	69.99

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class
count	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	...	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00	284807.00
mean	94813.86	0.00	0.00	-0.00	0.00	0.00	0.00	-0.00	0.00	-0.00	...	0.00	-0.00	0.00	0.00	0.00	0.00	-0.00	-0.00	88.35	0.00
std	47488.15	1.96	1.65	1.52	1.42	1.38	1.33	1.24	1.19	1.10	...	0.73	0.73	0.62	0.61	0.52	0.48	0.40	0.33	250.12	0.04
min	0.00	-56.41	-72.72	-48.33	-5.68	-113.74	-26.16	-43.56	-73.22	-13.43	...	-34.83	-10.93	-44.81	-2.84	-10.30	-2.60	-22.57	-15.43	0.00	0.00
25%	54201.50	-0.92	-0.60	-0.89	-0.85	-0.69	-0.77	-0.55	-0.21	-0.64	...	-0.23	-0.54	-0.16	-0.35	-0.32	-0.33	-0.07	-0.05	5.60	0.00
50%	84692.00	0.02	0.07	0.18	-0.02	-0.05	-0.27	0.04	0.02	-0.05	...	-0.03	0.01	-0.01	0.04	0.02	-0.05	0.00	0.01	22.00	0.00
75%	139320.50	1.32	0.80	1.03	0.74	0.61	0.40	0.57	0.33	0.60	...	0.19	0.53	0.15	0.44	0.35	0.24	0.09	0.08	77.16	0.00
max	172792.00	2.45	22.06	9.38	16.88	34.80	73.30	120.59	20.01	15.59	...	27.20	10.50	22.53	4.58	7.52	3.52	31.61	33.85	25691.16	1.00

	Amount	Time
count	284807.00	284807.00
mean	88.35	94813.86
std	250.12	47488.15
min	0.00	0.00
25%	5.60	54201.50
50%	22.00	84692.00
75%	77.16	139320.50
max	25691.16	172792.00

	V22	V25	V26	V15	V13	V8	V23	V24	Amount	V28	...	V27	V20	V19	V21	V6	V2	V5	V9	V1
2202	-0.14	-0.03	-0.19	-1.07	0.99	-0.09	0.54	0.69	238.88	0.17	...	0.02	0.59	-0.58	0.09	-0.23	0.17	-0.05	-1.23	-1.21
151351	0.30	-0.55	0.32	-2.56	0.44	0.02	-0.07	0.60	10.00	0.23	...	0.10	-0.12	-0.74	0.03	-0.78	0.47	1.56	1.28	-0.53
249833	1.02	-0.29	-0.43	0.23	1.26	0.01	-0.29	0.68	8.00	0.15	...	0.27	0.10	0.52	0.37	-1.13	0.91	1.76	-1.18	-0.95
173882	0.01	0.60	-0.48	-0.34	0.53	-0.33	-0.09	-0.13	10.39	-0.03	...	0.00	-0.15	-0.16	-0.08	-0.52	0.55	1.43	0.07	2.08
208023	0.28	0.30	0.48	-0.85	0.67	0.48	-0.42	0.71	61.60	-0.39	...	-1.24	-0.53	0.14	0.20	-0.62	1.90	0.23	-0.68	-2.09

	V22	V25	V26	V15	V13	V8	V23	V24	Amount	V28	...	V27	V20	V19	V21	V6	V2	V5	V9	V1
225184	0.66	0.74	-0.30	-1.22	-0.69	-0.72	-0.51	0.08	12.82	-0.13	...	-0.25	-0.29	-1.08	0.15	-1.60	0.95	1.39	-0.21	-0.58
116637	0.42	0.43	-0.29	0.64	0.30	-0.01	-0.31	0.43	256.39	0.08	...	0.02	0.39	-0.51	0.28	-0.19	-0.79	-1.19	0.73	0.69
99414	0.55	-0.27	0.28	-0.18	0.79	0.37	-0.06	0.59	16.44	0.13	...	0.22	0.08	0.13	0.21	-0.85	1.21	-0.18	-1.01	-0.85
217619	0.44	-0.08	0.52	-1.51	-0.64	0.44	0.41	0.65	270.00	0.22	...	0.15	0.51	-0.03	0.27	-0.02	0.62	-0.09	-0.41	-1.10
279878	-0.77	-0.33	0.20	0.14	-0.86	-0.28	0.35	0.02	1.29	-0.06	...	-0.08	-0.28	0.15	-0.30	-1.24	-0.11	-0.23	0.68	2.06

Credit Card Fraud Detection Predictive Models

Introduction ¶

Exploratory Data Analysis ¶

Data Preprocessing ¶

Sampling ¶

Model Evaluation ¶

Performance Analysis ¶

	V22	V25	V26	V15	V13	V8	V23	V24	Amount	V28	...	V27	V20	V19	V21	V6	V2	V5	V9	V1
16854	-1.52	0.23	0.89	0.98	0.97	-0.20	-0.02	-1.00	53.05	0.01	...	-0.07	-0.31	-0.17	-0.65	-0.28	-0.55	-0.37	-0.89	1.41
274097	1.06	0.84	0.12	-2.46	0.82	0.91	-0.57	-0.56	7.20	0.05	...	0.23	0.13	0.57	0.25	0.87	-0.14	-1.35	0.45	-1.16
194462	-0.09	0.78	-0.09	-0.83	-0.64	0.02	-0.46	0.09	15.00	0.03	...	-0.02	0.19	1.89	0.03	0.08	0.31	0.62	-1.81	-0.39
152441	-0.17	-0.53	0.46	-0.42	0.79	0.32	0.23	-0.42	14.95	-0.06	...	-0.06	-0.21	0.20	-0.15	0.87	-0.60	-0.30	2.65	2.01
112165	-0.55	0.37	-0.05	-2.75	-1.84	0.54	0.00	-0.04	0.76	0.02	...	0.05	-0.28	0.23	-0.30	1.07	-0.12	-0.85	0.73	1.07

	Classifier	Dataset	Avg_ROC_AUC	Avg_Precision	Avg_Recall	Avg_F1_Score	Avg_Support	Num_Columns
0	LogisticRegression	Dataset1 - UnderSampling	0.75	0.61	0.72	0.63	673.20	8
1	LogisticRegression	Dataset1 - Combined	0.78	0.73	0.73	0.73	918.00	8
2	LogisticRegression	Dataset2 - UnderSampling	0.87	0.62	0.78	0.64	673.20	20
3	LogisticRegression	Dataset2 - Combined	0.90	0.80	0.82	0.80	918.00	20
4	GradientBoostingMachine	Dataset1 - UnderSampling	0.92	0.89	0.76	0.81	673.20	8
5	GradientBoostingMachine	Dataset1 - Combined	0.96	0.90	0.88	0.89	918.00	8
6	GradientBoostingMachine	Dataset2 - UnderSampling	0.96	0.92	0.85	0.88	673.20	20
7	GradientBoostingMachine	Dataset2 - Combined	0.99	0.95	0.94	0.94	918.00	20
8	XGBoost	Dataset1 - UnderSampling	0.93	0.90	0.79	0.84	673.20	8
9	XGBoost	Dataset1 - Combined	0.98	0.93	0.92	0.92	918.00	8
10	XGBoost	Dataset2 - UnderSampling	0.97	0.96	0.87	0.91	673.20	20
11	XGBoost	Dataset2 - Combined	0.99	0.97	0.97	0.97	918.00	20
12	LightGBM	Dataset1 - UnderSampling	0.93	0.91	0.78	0.83	673.20	8
13	LightGBM	Dataset1 - Combined	0.97	0.91	0.89	0.90	918.00	8
14	LightGBM	Dataset2 - UnderSampling	0.97	0.96	0.87	0.91	673.20	20
15	LightGBM	Dataset2 - Combined	0.99	0.96	0.95	0.96	918.00	20
16	KNN	Dataset1 - UnderSampling	0.79	0.88	0.66	0.71	673.20	8
17	KNN	Dataset1 - Combined	0.95	0.88	0.89	0.89	918.00	8
18	KNN	Dataset2 - UnderSampling	0.66	0.72	0.58	0.60	673.20	20
19	KNN	Dataset2 - Combined	0.83	0.76	0.75	0.75	918.00	20

	Classifier	Dataset	ROC_AUC_Score	Precision	Recall	F1-Score	Support
0	XGBoost (Trained on Dataset1 UnderSampling)	Dataset1 - UnderSampling	0.92	0.59	0.76	0.63	56962
1	XGBoost (Trained on Dataset1 UnderSampling)	Dataset2 - UnderSampling	0.94	0.68	0.84	0.74	56962
2	XGBoost (Trained on Dataset2 UnderSampling)	Dataset1 - UnderSampling	0.92	0.55	0.79	0.58	56962
3	XGBoost (Trained on Dataset2 UnderSampling)	Dataset2 - UnderSampling	0.94	0.58	0.87	0.63	56962
4	LightGBM (Trained on Dataset1 UnderSampling)	Dataset1 - UnderSampling	0.92	0.54	0.75	0.57	56962
5	LightGBM (Trained on Dataset1 UnderSampling)	Dataset2 - UnderSampling	0.94	0.60	0.84	0.65	56962
6	LightGBM (Trained on Dataset2 UnderSampling)	Dataset1 - UnderSampling	0.91	0.53	0.78	0.56	56962
7	LightGBM (Trained on Dataset2 UnderSampling)	Dataset2 - UnderSampling	0.94	0.59	0.86	0.64	56962

Credit Card Fraud Detection Predictive Models

Introduction¶

Exploratory Data Analysis¶

Data Preprocessing¶

Sampling¶

Model Evaluation¶

Performance Analysis¶

Introduction ¶

Exploratory Data Analysis ¶

Data Preprocessing ¶

Sampling ¶

Model Evaluation ¶

Performance Analysis ¶