Stroke Probability Prediction with Random Forests, XGBoost, CatBoost, LightGBM & MLP¶

Mateusz Kowalski & Aleksandra Ciesińska¶

2024/2025¶

DISCLAIMER: THE ORIGINAL DATASET WAS EXTENDED BY SOME ARTIFICIAL FEATURES TO MAKE THE TASK DIFFERENT FROM A KAGGLE COMPETITION. THUS, ALL THE FEATURES NAMED FEAT01-FEAT10 ARE TREATED AS IF THEY WERE INFORMATIVE AND POTENTIALLY USEFUL FOR PREDICTION!¶
In [214]:
# basic libraries
import pandas as pd
import math as m
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# saving & time gauge
import joblib
import time
 
# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from imblearn.over_sampling import ADASYN

## cont
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
## cat
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# grid
from sklearn.model_selection import GridSearchCV

# pipeline
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score, balanced_accuracy_score, precision_score, recall_score, roc_auc_score
# other
import missingno as msno

Data preprocessing with ImbPipeline()¶

In [217]:
df = pd.read_csv("C:\\Users\\Mateusz\\Desktop\\Machine Learning II\\project\\c1.csv")
In [219]:
df
Out[219]:
id age avg_glucose_level bmi ever_married feat01 feat02 feat03 feat04 feat05 ... feat08 feat09 feat10 gender heart_disease hypertension Residence_type smoking_status stroke work_type
0 1 75.0 219.82 29.5 Yes 0.475089 0.595032 1.230383 0.845381 1.067904 ... 1.426921 1.454747 0.441987 Female 0 1 Rural formerly smoked 0 Self-employed
1 2 50.0 69.92 18.7 Yes 0.618836 0.432241 1.402145 1.191038 1.395761 ... 1.320794 0.708369 0.348898 Female 0 0 Urban formerly smoked 0 Self-employed
2 3 79.0 72.73 28.4 Yes 0.421711 0.629341 0.916465 1.107330 1.335406 ... 0.669458 0.696877 0.809689 Male 0 0 Rural never smoked 1 Private
3 4 3.0 78.24 16.2 No 0.482635 0.557748 0.762485 0.925121 0.975914 ... 0.420634 1.343314 0.444809 Male 0 0 Rural Unknown 0 children
4 5 53.0 196.25 24.9 Yes 0.435785 0.497572 0.743418 1.007523 0.648819 ... 0.648643 1.168722 0.391860 Female 1 1 Urban smokes 0 Private
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
5354 5355 55.0 92.98 25.6 Yes 0.514424 0.512044 0.599711 0.527823 1.484436 ... 0.662278 1.126019 0.255485 Female 0 0 Rural never smoked 1 Self-employed
5355 5356 59.0 109.82 23.7 Yes 0.513911 0.603200 1.471257 0.971114 1.448692 ... 0.671674 1.347132 0.373458 Female 0 0 Urban never smoked 0 Private
5356 5357 31.0 125.38 24.4 Yes 0.458564 0.509621 1.552418 0.719406 0.506281 ... 1.140376 0.736226 0.616051 Female 0 0 Urban smokes 0 Private
5357 5358 62.0 74.32 34.0 Yes 0.311724 0.530730 1.641201 1.242290 1.468356 ... 0.996123 1.050264 0.175882 Female 0 1 Rural never smoked 0 Self-employed
5358 5359 79.0 201.38 31.1 Yes 0.454434 0.493910 0.725071 1.124918 1.487161 ... 0.395280 0.781069 0.398358 Female 1 0 Rural never smoked 0 Private

5359 rows × 22 columns

Categorical features distribution¶

In [222]:
continuous_columns = ["age", "avg_glucose_level", "bmi", "feat01", "feat02", "feat03", "feat04", "feat05", "feat06", "feat07", "feat08", "feat09", "feat10"]
categorical_columns = [col for col in df.columns if col not in continuous_columns and col != "stroke"] # I exclude "stroke"

for col in categorical_columns:
    print(f"Feature: {col}")
    print(df[col].value_counts())
    print("-" * 40)
Feature: id
id
1       1
3580    1
3578    1
3577    1
3576    1
       ..
1786    1
1785    1
1784    1
1783    1
5359    1
Name: count, Length: 5359, dtype: int64
----------------------------------------
Feature: ever_married
ever_married
Yes    3573
No     1786
Name: count, dtype: int64
----------------------------------------
Feature: gender
gender
Female    3135
Male      2223
Other        1
Name: count, dtype: int64
----------------------------------------
Feature: heart_disease
heart_disease
0    5036
1     323
Name: count, dtype: int64
----------------------------------------
Feature: hypertension
hypertension
0    4795
1     564
Name: count, dtype: int64
----------------------------------------
Feature: Residence_type
Residence_type
Urban    2731
Rural    2628
Name: count, dtype: int64
----------------------------------------
Feature: smoking_status
smoking_status
never smoked       1982
Unknown            1591
formerly smoked     955
smokes              831
Name: count, dtype: int64
----------------------------------------
Feature: work_type
work_type
Private          3074
Self-employed     884
Govt_job          690
children          689
Never_worked       22
Name: count, dtype: int64
----------------------------------------
In [387]:
# gender as "other" appears just once! I can easily remove that "outlier", there's no need to create a separate variable for that
df = df[df['gender'] != "Other"].copy()

print(f"Number of rows: {df.shape[0]}")
Number of rows: 5358
In [226]:
df.drop(['id'], axis=1, inplace=True) # irrelevant column

So far we have only taken a first look at the dataset: we dropped the ID column, which is redundant, and we removed the single gender = "Other" observation, as doing so costs us very little information. No separate variable will be created for this category - merging a single observation into one of the existing levels would be arbitrary, so simply deleting it seems to be the best approach.

We could also remove the "Never_worked" level, as it has only 22 occurrences, but we prefer not to lose that information. Again, there is no natural category to merge it with, so we leave it as it is.

Missing values and possibly imbalanced data¶

In [230]:
df.isnull().sum() # missing values for bmi
Out[230]:
age                    0
avg_glucose_level      0
bmi                  241
ever_married           0
feat01                 0
feat02                 0
feat03                 0
feat04                 0
feat05                 0
feat06                 0
feat07                 0
feat08                 0
feat09                 0
feat10                 0
gender                 0
heart_disease          0
hypertension           0
Residence_type         0
smoking_status         0
stroke                 0
work_type              0
dtype: int64
In [232]:
# checking for imbalanced data
stroke_counts = df['stroke'].value_counts()

stroke_percentage = stroke_counts / stroke_counts.sum() * 100

# the plot illustrating the problem
plt.figure(figsize=(6, 4))
ax = stroke_counts.plot(kind='bar', color=['purple', 'orchid'])

# % labels
for i, count in enumerate(stroke_counts):
    ax.text(i, count + 10, f'{stroke_percentage[i]:.1f}%', ha='center', fontsize=12)

plt.title('0s and 1s for "stroke" variable')
plt.xlabel('Value')
plt.ylabel('Number')
plt.xticks(rotation=0)
plt.show()
[Figure: bar chart of the 0/1 distribution of the "stroke" variable, with percentage labels]

The data is clearly imbalanced - to tackle this problem, Adaptive Synthetic Sampling (ADASYN) will be applied later. This is crucial: trained on such strongly imbalanced data, every model would simply predict "no stroke" with very high probability and would be practically useless. Oversampling seems to be the better approach - downsampling would cause substantial information loss, since only about 5% of observations are 1s and we would have to discard most of the majority class. ADASYN, chosen for this project, is a good option because it does not simply copy existing observations - it generates synthetic minority-class samples in the feature space of the original dataset. In a nutshell, it looks at how each minority-class sample is surrounded by majority-class neighbours and generates more synthetic samples around the minority observations that are harder to learn, which should improve classification performance.
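Below is a minimal, self-contained sketch of what ADASYN does, on toy data rather than our pipeline (the toy feature matrix and the ~5% positive rate are made up purely for illustration):

In [ ]:
# toy illustration of ADASYN: synthetic minority samples are generated in feature
# space, with more of them around minority points that are harder to learn
import numpy as np
from collections import Counter
from imblearn.over_sampling import ADASYN

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(1000, 3))                 # 1000 observations, 3 numeric features
y_toy = (rng.random(1000) < 0.05).astype(int)      # ~5% positives, similar to "stroke"

X_res, y_res = ADASYN(random_state=0).fit_resample(X_toy, y_toy)
print(Counter(y_toy), "->", Counter(y_res))        # classes are roughly balanced afterwards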

Train-test split¶

In [236]:
#X_names = [col for col in df.columns if col != 'stroke']
#y = df[['stroke']]
#X = df[X_names]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123456789)
In [238]:
#joblib.dump(X_train, 'X_train.pkl')
In [240]:
#joblib.dump(X_test, 'X_test.pkl')
In [242]:
#joblib.dump(y_train, 'y_train.pkl')
In [244]:
#joblib.dump(y_test, 'y_test.pkl')
In [246]:
def load_datasets():
    X_train = joblib.load('X_train.pkl')
    X_test = joblib.load('X_test.pkl')
    y_train = joblib.load('y_train.pkl')
    y_test = joblib.load('y_test.pkl')
    
    print("Datasets loaded successfully!")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
    return X_train, X_test, y_train, y_test
print('')

In [248]:
X_train, X_test, y_train, y_test = load_datasets()
Datasets loaded successfully!
X_train shape: (4286, 20), y_train shape: (4286, 1)
X_test shape: (1072, 20), y_test shape: (1072, 1)

Preprocessor¶

In [251]:
# different approach towards numerical and categorical data

continuous_columns = ["age", "avg_glucose_level", "bmi", "feat01", "feat02", "feat03", "feat04", "feat05", "feat06", "feat07", "feat08", "feat09", "feat10"]

categorical_columns = [col for col in X_train.columns if col not in continuous_columns and col != "stroke"] # I exclude "stroke"

# optimal k for kNN imputation
n_nbs = m.ceil(np.sqrt(len(X_train[["bmi"]])))

# preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', KNNImputer(n_neighbors=n_nbs, weights="uniform", metric="nan_euclidean")),
            ('scaler', StandardScaler()), 
            ('feature_selection', SelectKBest(f_classif, k=5)) # keep exactly 5 continuous vars
        ]), continuous_columns),
        
        ('cat', Pipeline([
            ('encoder', OneHotEncoder(handle_unknown='ignore')),   # one-hot encoding
            ('feature_selection', SelectKBest(chi2, k=5))  # keep exactly 5 encoded categorical dummies
        ]), categorical_columns)
    ])
In [253]:
preprocessor.fit(X_train, y_train)
E:\anaconda\Lib\site-packages\sklearn\utils\validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[253]:
ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  KNNImputer(n_neighbors=66)),
                                                 ('scaler', StandardScaler()),
                                                 ('feature_selection',
                                                  SelectKBest(k=5))]),
                                 ['age', 'avg_glucose_level', 'bmi', 'feat01',
                                  'feat02', 'feat03', 'feat04', 'feat05',
                                  'feat06', 'feat07', 'feat08', 'feat09',
                                  'feat10']),
                                ('cat',
                                 Pipeline(steps=[('encoder',
                                                  OneHotEncoder(handle_unknown='ignore')),
                                                 ('feature_selection',
                                                  SelectKBest(k=5,
                                                              score_func=<function chi2 at 0x000001CE84E9B740>))]),
                                 ['ever_married', 'gender', 'heart_disease',
                                  'hypertension', 'Residence_type',
                                  'smoking_status', 'work_type'])])

One-hot encoding¶

In [256]:
cat_pipeline = preprocessor.named_transformers_['cat']  # a part regarding categorical variables
encoder = cat_pipeline.named_steps['encoder']  # I'm interested in the encoding process

X_cat_encoded = encoder.transform(X_train[categorical_columns]).toarray()
encoded_cat_feature_names = encoder.get_feature_names_out(categorical_columns)

X_cat_encoded_df = pd.DataFrame(X_cat_encoded, columns=encoded_cat_feature_names)  # as data frame
X_cat_encoded_df.head() # neat table
Out[256]:
ever_married_No ever_married_Yes gender_Female gender_Male heart_disease_0 heart_disease_1 hypertension_0 hypertension_1 Residence_type_Rural Residence_type_Urban smoking_status_Unknown smoking_status_formerly smoked smoking_status_never smoked smoking_status_smokes work_type_Govt_job work_type_Never_worked work_type_Private work_type_Self-employed work_type_children
0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 0.0 1.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
3 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0
4 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

Feature selection¶

As far as feature selection methods are concerned, the choice of method depends on the types of the target and explanatory variables - numeric or categorical. Here the target is categorical, so we use the ANOVA F statistic for the numeric features and the Chi^2 statistic for the categorical ones.

In [260]:
selector_cat = cat_pipeline.named_steps['feature_selection'] # now feature selection

chi2_scores = pd.DataFrame({
    'Feature': encoded_cat_feature_names,
    'Chi2-Score': selector_cat.scores_
}).sort_values(by='Chi2-Score', ascending=False) # chi^2 scores sorted in descending order

num_pipeline = preprocessor.named_transformers_['num']
selector_num = num_pipeline.named_steps['feature_selection'] # for num ones too

X_num_processed = num_pipeline[:-1].transform(X_train[continuous_columns])  # up until selection
X_num_selected = num_pipeline.transform(X_train[continuous_columns])  # after selection

f_scores = pd.DataFrame({
    'Feature': continuous_columns,
    'F-Score': selector_num.scores_
}).sort_values(by='F-Score', ascending=False) # getting Fs for numerical features

#print("\nChi2 Scores for categorical features:")
#print(chi2_scores)

#print("\nF-Scores for numerical features:")
#print(f_scores) --> I'll make a plot to show that
In [262]:
fig, axes = plt.subplots(2, 1, figsize=(10, 14), sharex=False) # two plots, one each row

# chi2 Scores
axes[0].barh(chi2_scores['Feature'], chi2_scores['Chi2-Score'], color='purple')
axes[0].set_title('Chi2 Scores for Categorical Features', fontsize=16)
axes[0].set_xlabel('Chi2 Score', fontsize=14)
axes[0].invert_yaxis()  # there're a lot of features so it's better to rotate the plot

# f-Scores
axes[1].barh(f_scores['Feature'], f_scores['F-Score'], color='orchid')
axes[1].set_title('F-Scores for Numerical Features', fontsize=16)
axes[1].set_xlabel('F Score', fontsize=14)
axes[1].invert_yaxis()

plt.tight_layout()
plt.show()
[Figure: Chi2 scores for categorical features (top) and F-scores for numerical features (bottom)]

When constructing the preprocessor we assumed that 5 categorical and 5 numerical features should remain after selection - too many would invite overfitting, too few would oversimplify the analysis and make the results unreliable. Having checked the values of the statistics, no improvement seems possible for the numerical features. The categorical ones look reasonable too - the pipeline does not keep every level of a single variable (ever_married), so all dummies with a score above 30 enter the model. We could perhaps tweak the model by adding the work_type information, but the number of features is already fairly large, so the decision is to leave it as it is.
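As a quick sanity check, the names of the features that actually survive SelectKBest can be listed with get_support(); this small sketch reuses the selector and encoder objects fitted above:

In [ ]:
# which 5 numerical features and 5 encoded categorical dummies were kept
selected_num = np.array(continuous_columns)[selector_num.get_support()]
selected_cat = np.array(encoded_cat_feature_names)[selector_cat.get_support()]

print("Numerical features kept:", list(selected_num))
print("Categorical dummies kept:", list(selected_cat))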

Random Forest¶

Random Forests are a popular choice in machine learning due to their simplicity and diversity. Multiple decision trees are constructed and then merged together to get accurate and stable predictions. Mirosław Mamczur, a specialist in the field, proposed a simple, perhaps a bit puerile example explaining the model on his machine learning blog. According to him, the whole procedure can be compared to an event during which a group of 100 cooks try to figure out a perfect soup recipe. Each of them (just like decision trees) adds something special to the dish, creating a super-tasty meal (random forest, way better than the trees alone).

As with any machine learning algorithm, Random Forests require hyperparameter tuning. In this project we focused on six of them:

  • bootstrap: determines whether bootstrap sampling is used when building individual decision trees. False was ultimately chosen during cross-validation, so samples were drawn without replacement - we have to be careful about overfitting in this scenario, as bootstrap sampling helps to reduce it; that said, the results shown below do not seem to indicate any overfitting.
  • max_depth: the maximum depth of each decision tree; each split increases the depth by one. A maximum depth of 100 was chosen during validation, which seems quite large, although we also considered leaving this hyperparameter uncapped.
  • max_features: the size of the random subset of features considered when splitting a node. We allowed two values: "auto" and "sqrt" (the square root of the number of features). Note that for RandomForestClassifier "auto" is effectively equivalent to "sqrt" (and has been deprecated in newer scikit-learn versions), so the two grid options largely coincide; "auto" was the value returned by validation.
  • min_samples_leaf: the minimum number of samples required to be at a leaf node. Larger values create more generalized models by reducing tree branching, preventing overfitting - the lowest, default value equal to 1 was chosen during validation.
  • min_samples_split: the minimum number of samples required to split an internal node. It controls tree growth as higher values prevent overfitting by enforcing splits only when there are sufficient samples. The optimal value was actually 2, while 5 was an alternative.
  • n_estimators: the number of decision trees used; more trees generally improve performance, but with increased computation time. We decided not to include the default value of 100 in the grid, as it could be insufficient. We started with 200 which was chosen to be the best, with alternatives increasing up to 2000.

Later on we will make some final remarks on this subsection.

In [267]:
# pipeline; only the classifier is going to be changed later when we apply other methods
'''
pipe = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()), # the data needs resampling (about 95% of 0s)
    ('classifier', RandomForestClassifier()) # classifier; to be changed later
])

# grid
param_grid_rf = {
    'classifier__bootstrap': [True, False],
    'classifier__max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'classifier__max_features': ['auto', 'sqrt'],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_samples_split': [2, 5],
    'classifier__n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

# 3-fold cross-validation (later on 5-fold CV is going to be implemented)
grid_search_rf = GridSearchCV(pipe, param_grid_rf, cv=3, verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# results
print(f"Best parameters: {grid_search_rf.best_params_}")
print(f"Best score: {grid_search_rf.best_score_}")
test_score = grid_search_rf.score(X_test, y_test)
print(f"Test score: {test_score}")
'''
# Fitting 3 folds for each of 2640 candidates, totalling 7920 fits
print('')

In [269]:
grid_search_rf = joblib.load("grid_search_rf.pkl")
In [271]:
# the best model
best_rf_model = grid_search_rf.best_estimator_
print("Best hyperparameters for Random Forest:", grid_search_rf.best_params_)
y_pred_rf = best_rf_model.predict(X_test)
y_pred_prob_rf = best_rf_model.predict_proba(X_test)[:, 1]
Best hyperparameters for Random Forest: {'classifier__bootstrap': False, 'classifier__max_depth': 100, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
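As an optional side check (a sketch, not run here), the impurity-based feature importances of the tuned forest could be read directly from the fitted pipeline; the order of the values follows the preprocessor output, i.e. the 5 selected numerical features followed by the 5 selected categorical dummies:

In [ ]:
# optional: importances of the tuned Random Forest inside the ImbPipeline
rf_clf = best_rf_model.named_steps['classifier']
print(rf_clf.feature_importances_)   # 10 values: 5 numeric + 5 encoded categorical features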
In [273]:
# cutoff values
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (Random Forest)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_prob_rf = best_rf_model.predict_proba(X_test)[:, 1]
    row, col = divmod(idx, 3)
    y_pred_cutoff = (y_pred_prob_rf >= cutoff).astype(int)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')

# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (Random Forest)]
In [274]:
# classification report and metrics basic loop
'''
for idx, cutoff in enumerate(cutoffs):
    y_pred_cutoff = (y_pred_prob_rf >= cutoff).astype(int)
    print(f"\nClassification Report for Cutoff = {cutoff:.2f}:")
    report = classification_report(y_test, y_pred_cutoff)
    print(report)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_rf)

    print("Metrics:")
    print(f"F1 Score: {f1:.3f}")
    print(f"Balanced Accuracy: {balanced_acc:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"AUC: {roc_auc:.3f}")
'''
print('Since this loop produced a very long, unaesthetic output, it is replaced by a neater layout created with ChatGPT. The basic loop is omitted for the models that follow.')
Since this loop produced a very long, unaesthetic output, it is replaced by a neater layout created with ChatGPT. The basic loop is omitted for the models that follow.
In [275]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_rf >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_rf)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: classification reports and metrics for each cutoff (Random Forest)]
In [276]:
# joblib.dump(grid_search_rf, "grid_search_rf.pkl")

As this is the simplest algorithm we used, and we already know the other results, the conclusion here will be rather brief. First of all, the tuning process in several cases selected values that might have caused overfitting; however, the results do not seem to show that.

All the models turned out to be subpar, the best ones having cut-off points between 0.5 and 0.6. Balanced accuracy revolves around 65%, F1 and precision around 40%, and recall lies between 30% and 40%. There is definitely much room for improvement. The prediction of 0s looks perfect, but that is nothing exceptional when the dataset is so imbalanced and 0 is the majority class. The main challenge is to improve the prediction of 1s.
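For reference, the metrics quoted here and for all subsequent models are the standard ones, computed from the confusion-matrix counts TP, FP, TN and FN:

$$\text{Recall (TPR)} = \frac{TP}{TP+FN}, \qquad \text{Precision} = \frac{TP}{TP+FP}, \qquad F_1 = \frac{2\,\text{Precision}\cdot\text{Recall}}{\text{Precision}+\text{Recall}}, \qquad \text{Balanced accuracy} = \frac{1}{2}\left(\frac{TP}{TP+FN} + \frac{TN}{TN+FP}\right)$$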

XGBoost¶

The term “gradient boosting” comes from the idea of “boosting” or improving a single weak model by combining it with a number of other weak models in order to generate a collectively strong model.

In particular, XGBoost uses second-order gradients of the loss function in addition to the first-order gradients, based on a Taylor expansion of the loss. On top of that, XGBoost extends the loss into a more sophisticated objective function containing regularisation terms, which penalise adding new decision-tree leaves with a penalty proportional to the size of the leaf weights. This inhibits the growth of the model in order to prevent overfitting.
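Concretely, using the standard formulation from the XGBoost paper (with $g_i$ and $h_i$ denoting the first- and second-order gradients of the loss at the current prediction), the objective minimised when adding tree $f_t$ at round $t$ is approximately

$$\mathcal{L}^{(t)} \approx \sum_{i=1}^{n}\left[g_i\, f_t(x_i) + \tfrac{1}{2}\, h_i\, f_t(x_i)^2\right] + \Omega(f_t), \qquad \Omega(f) = \gamma\, T + \tfrac{1}{2}\,\lambda\, \lVert w \rVert^2,$$

where $T$ is the number of leaves and $w$ the vector of leaf weights; $\gamma$ is exactly the per-leaf penalty tuned below as the gamma hyperparameter, and $\lambda$ corresponds to the L2 term.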

We have optimized the following hyperparameters:

  • n_estimators: defined as previously; 500 was chosen, so a value in the middle.
  • learning_rate: a lower learning rate makes the model learn more slowly and thus it requires more trees to converge. Higher values may lead to overfitting. The value of 0.05 was chosen, so something in the middle of considered values.
  • max_depth: defined as previously; 9 was chosen, so a value in the middle.
  • min_child_weight: the minimum sum of instance weights (hessian) required in a child node, playing a role similar to min_samples_leaf earlier; 1 was chosen.
  • subsample: a fraction of samples to be used for training each tree. A lower value makes the model more regularised and prevents overfitting, however the cross-validation process suggested the choice of 100%.
  • colsample_bytree: the fraction of features to randomly sample for each tree. 60% was chosen during cross-validation. We must note that lower values may result in underfitting if the model might not have enough information to make strong splits.
  • gamma: the minimum loss reduction required to make a further partition on a leaf node; a value of 0 means no regularization on the splits.

~ Inspired by the article: https://developer.nvidia.com/blog/gradient-boosting-decision-trees-xgboost-cuda/

In [280]:
'''
pipe = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss')) # XGB now
])

param_grid_xgb = {
    'classifier__n_estimators': [100, 300, 500, 700],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__max_depth': [3, 6, 9, 12],
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__subsample': [0.6, 0.8, 1.0], # exclude 1 during the next attempt
    'classifier__colsample_bytree': [0.6, 0.8, 1.0], # exclude 1 during the next attempt
    'classifier__gamma': [0, 1, 5]   
}

# 5-fold CV
grid_search_xgb = GridSearchCV(
    pipe, 
    param_grid_xgb, 
    cv=5,
    verbose=2, 
    n_jobs=-1,
    scoring='accuracy'
)

# results
grid_search_xgb.fit(X_train, y_train)
print(f"Best parameters: {grid_search_xgb.best_params_}")
print(f"Best validation result: {grid_search_xgb.best_score_}")
'''
# Fitting 5 folds for each of 5184 candidates, totalling 25920 fits
print('')

In [281]:
grid_search_xgb = joblib.load("grid_search_xgb.pkl")
In [282]:
# the best model
best_xgb_model = grid_search_xgb.best_estimator_
print("Best hyperparameters for XGBoost:", grid_search_xgb.best_params_)
y_pred_xgb = best_xgb_model.predict(X_test)
y_pred_prob_xgb = best_xgb_model.predict_proba(X_test)[:, 1]
Best hyperparameters for XGBoost: {'classifier__colsample_bytree': 0.6, 'classifier__gamma': 0, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 9, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 500, 'classifier__subsample': 1.0}
In [283]:
# cutoff values
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (XGBoost)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_prob_xgb = best_xgb_model.predict_proba(X_test)[:, 1]
    row, col = divmod(idx, 3)
    y_pred_cutoff = (y_pred_prob_xgb >= cutoff).astype(int)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # plotting confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')

# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (XGBoost)]
In [284]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_xgb >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_xgb)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: classification reports and metrics for each cutoff (XGBoost)]
In [285]:
# joblib.dump(grid_search_xgb, "grid_search_xgb.pkl")

To sum up this subsection: first of all, we were slightly worried by the three tuned hyperparameters at the bottom of the grid - the whole sample was used for training each tree (subsample = 1.0), a relatively low fraction of features was sampled for each tree (colsample_bytree = 0.6), and gamma was set to 0. We would be more confident if this part of the final configuration looked different.

The results we obtained are only slightly better than before - again the best XGBoost models are the ones with cut-off points between 0.5 and 0.6. Balanced accuracy revolves around 67%, F1 around 40%, and precision rises from 40% to 50% as the cut-off increases; recall moves from 35% to 40%. There is definitely still room for improvement and, as previously, the main challenge is to improve the prediction of 1s.

CatBoost¶

CatBoost is yet another example of gradient boosting. Unlike other machine learning algorithms that require categorical variables to be converted into numerical format, CatBoost allows us to omit that step. However, as we have a pipeline to preprocess the data for all models very quickly, we will not benefit from that.
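For illustration only (we do not use this in our pipeline), CatBoost could consume the raw string-valued columns directly via its cat_features argument - a minimal, hypothetical sketch:

In [ ]:
# hypothetical direct use of CatBoost's native categorical handling (not our pipeline);
# the column list is just an example of the string-valued features in this dataset
cat_cols = ['ever_married', 'gender', 'Residence_type', 'smoking_status', 'work_type']
native_cb = CatBoostClassifier(iterations=200, depth=6, silent=True, cat_features=cat_cols)
# native_cb.fit(X_train, y_train.values.ravel())   # raw categories, no one-hot encoding needed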

According to A. Kolli, there are 3 main advantages of CatBoost:

  1. Handling Categorical Features: already explained.

  2. Ordered Boosting:

One of the core innovations of CatBoost is its ordered boosting mechanism. Traditional gradient boosting methods can suffer from prediction shift due to the overlap between the training data for the base models and the data used to calculate the gradients. CatBoost addresses this by introducing a random permutation of the dataset in each iteration and using only the data before each example in the permutation for training. This approach reduces overfitting and improves model robustness.

  3. Symmetric Trees:

CatBoost builds balanced trees, also known as symmetric trees, as its base predictors. Unlike traditional gradient boosting methods that build trees leaf-wise or depth-wise, CatBoost’s symmetric trees ensure that all leaf nodes at the same level share the same decision rule. This leads to faster execution and reduces the likelihood of overfitting.

We do agree with that. The execution of the code was efficient and as shown later, the results were better than before. Let's just focus on the hyperparameters tuned in this model:

  • iterations: as n_estimators before; the number of trees chosen now was 500.
  • depth: as max_depth before; the value chosen was only 2, however it is normal for CatBoost to have this parameter between 2 and 10. Usually, the value chosen is 6.
  • learning_rate: explained before; here 0.01 was suggested after cross-validation.
  • l2_leaf_reg: L2 regularization coefficient applied to the leaf nodes of the trees (helping to penalize large weights in the leaf nodes). The final value was 1, so not much. Usually the parameter is set between 1 and 30.
  • border_count: the number of discrete bins used for continuous features, as CatBoost discretises continuous values into intervals before building splits; 32 bins were chosen.

~ Inspired by: https://aravindkolli.medium.com/understanding-catboost-the-gradient-boosting-algorithm-for-categorical-data-73ddb200895d

In [291]:
'''
pipe_cb = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()),
    ('classifier', CatBoostClassifier(silent=True))
])

param_grid_cb = {
    'classifier__iterations': [100, 200, 500],
    'classifier__depth': [2, 6, 8, 10],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__l2_leaf_reg': [1, 3, 5, 7],
    'classifier__border_count': [32, 64, 128]
}

grid_search_cb = GridSearchCV(
    estimator=pipe_cb,
    param_grid=param_grid_cb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search_cb.fit(X_train, y_train.values.ravel())  
'''
print('')

In [292]:
grid_search_cb = joblib.load("grid_search_cb.pkl")
In [293]:
# the best model
best_cb_model = grid_search_cb.best_estimator_
print("Best hyperparameters for CatBoost:", grid_search_cb.best_params_)
y_pred_cb = best_cb_model.predict(X_test)
y_pred_prob_cb = best_cb_model.predict_proba(X_test)[:, 1]
Best hyperparameters for CatBoost: {'classifier__border_count': 32, 'classifier__depth': 2, 'classifier__iterations': 500, 'classifier__l2_leaf_reg': 1, 'classifier__learning_rate': 0.01}
In [294]:
# cutoffs between 0.5 and 0.9 are to be tested
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (CatBoost)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_prob_cb = best_cb_model.predict_proba(X_test)[:, 1]
    row, col = divmod(idx, 3)
    y_pred_cutoff = (y_pred_prob_cb >= cutoff).astype(int)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # plotting the confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')

# plot adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (CatBoost)]
In [295]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_cb >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_cb)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: classification reports and metrics for each cutoff (CatBoost)]
In [296]:
# joblib.dump(grid_search_cb, "grid_search_cb.pkl")

Thanks to CatBoost we managed to get some quite impressive results. The optimal cut-off seems to be 0.6, with an F1 of 48.4%, balanced accuracy of 82.6%, precision of 34% and recall of 83.5%. The main observation is that we finally managed to cut down the number of false negatives, i.e. cases in which the model predicts no stroke while a stroke actually occurred. The recall value is therefore substantially higher, which also improves the balanced accuracy. Precision, i.e. the share of positive predictions that are correct, is low, but in the context of this analysis that does not bother us much - predicting a stroke that ultimately does not occur is far less harmful than the opposite. Still, we would like to improve it.

LightGBM¶

As an article by L. Soyoung states, LightGBM was developed to address the computational inefficiencies of traditional Gradient Boosting Machines (GBM), which process all data instances across all features, leading to heavy computation. To mitigate this, LightGBM introduces two key techniques: Gradient-based One-Side Sampling (GOSS) and Exclusive Feature Bundling (EFB).

GOSS focuses on sampling instances based on their gradients, prioritizing those with larger gradients for training. EFB optimizes memory usage by bundling exclusive features together, allowing for more efficient processing and faster training without compromising the performance significantly. Together, these techniques make LightGBM significantly faster and more scalable compared to traditional GBM, while maintaining high prediction accuracy.
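To make GOSS more concrete, here is a small, purely conceptual NumPy sketch of its sampling rule (keep the top a-fraction of instances by absolute gradient, randomly keep a b-fraction of the rest and up-weight it by (1 - a) / b); this illustrates the idea only and is not LightGBM's internal code:

In [ ]:
# conceptual GOSS-style sampling (illustration, not LightGBM internals)
def goss_sample(gradients, a=0.2, b=0.1, seed=0):
    rng = np.random.default_rng(seed)
    n = len(gradients)
    order = np.argsort(-np.abs(gradients))                        # sort by |gradient|, descending
    top = order[:int(a * n)]                                      # always keep large-gradient instances
    rest = order[int(a * n):]
    sampled = rng.choice(rest, size=int(b * n), replace=False)    # subsample small-gradient instances
    idx = np.concatenate([top, sampled])
    weights = np.ones(len(idx))
    weights[len(top):] = (1 - a) / b                              # re-weight to keep the gradient sum unbiased
    return idx, weights

idx, w = goss_sample(np.random.default_rng(1).normal(size=1000))
print(len(idx), "instances kept out of 1000")                     # 200 top + 100 sampled = 300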

Undoubtedly, we noticed the efficiency of LightGBM - we managed to run over 34k fits quite quickly with promising results, 34k being the biggest number of tested fits within this project. We optimized the following hyperparameters:

  • n_estimators: as previously; 100 trees were used, although bigger values were available.
  • max_depth: the maximum depth of the trees; just like in CatBoost, values up to 10 are used. It is worth noting that setting this parameter to -1 lets trees grow until every leaf contains fewer samples than min_child_samples.
  • learning_rate: as previously; 0.05 was chosen, so a value somewhat in the middle.
  • num_leaves: the maximum number of leaves per tree; it is worth noting that values are often set around 2^max_depth (or slightly below).
  • min_child_samples: as previously; 50 was chosen, so quite a high value. We know that we can now avoid too detailed splits, so we do not expect overfitting.
  • reg_alpha: L1 regularization applied to leaf weights to reduce overfitting, where a higher value penalizes large weights in the leaf nodes, making the model more conservative (so less prone to overfitting). The highest value possible (10) was chosen.
  • reg_lambda: the L2 regularization term applied to leaf weights, also to reduce overfitting. It is often paired with L1 to strengthen the effect. Again, a value of 10 was chosen.

~ Inspired by: https://medium.com/@soyoungluna/simple-explanation-of-lightgbm-without-complicated-mathematics-973998ec848f

In [300]:
'''
start_time = time.time()

pipe_lgbm = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()), 
    ('classifier', LGBMClassifier())
])

param_grid_lgbm = {
    'classifier__n_estimators': [100, 200, 500],
    'classifier__max_depth': [6, 8, 10, -1],  
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__num_leaves': [31, 63, 127], 
    'classifier__min_child_samples': [10, 20, 50], 
    'classifier__reg_alpha': [0, 0.1, 1, 10], 
    'classifier__reg_lambda': [0, 0.1, 1, 10] 
}

grid_search_lgbm = GridSearchCV(
    estimator=pipe_lgbm,
    param_grid=param_grid_lgbm,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search_lgbm.fit(X_train, y_train.values.ravel())

# results
best_lgbm_model = grid_search_lgbm.best_estimator_
print("Best hyperparameters for LightGBM:", grid_search_lgbm.best_params_)
print("Best cross-validated AUC score for LightGBM:", round(grid_search_lgbm.best_score_, 3))
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} s")
'''
print('')

In [301]:
grid_search_lgbm = joblib.load("grid_search_lgbm.pkl")
In [302]:
# the best model and predictions
best_lgbm_model = grid_search_lgbm.best_estimator_
print("Best hyperparameters for LightGBM:", grid_search_lgbm.best_params_)
y_pred_lgbm = best_lgbm_model.predict(X_test)
y_pred_prob_lgbm = best_lgbm_model.predict_proba(X_test)[:, 1]
Best hyperparameters for LightGBM: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 6, 'classifier__min_child_samples': 50, 'classifier__n_estimators': 100, 'classifier__num_leaves': 127, 'classifier__reg_alpha': 10, 'classifier__reg_lambda': 10}
In [303]:
# cutoff values
cutoffs = np.arange(0.5, 0.95, 0.05)

# a 3x3 canvas is desired
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (LightGBM)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_prob_lgbm = best_lgbm_model.predict_proba(X_test)[:, 1]
    row, col = divmod(idx, 3)
    y_pred_cutoff = (y_pred_prob_lgbm >= cutoff).astype(int)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # plotting the confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (LightGBM)]
In [304]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_lgbm >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_lgbm)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: classification reports and metrics for each cutoff (LightGBM)]
In [305]:
# joblib.dump(grid_search_lgbm, "grid_search_lgbm.pkl")

The results for LightGBM are quite impressive. The selected hyperparameters include a lot of regularisation, which makes overfitting unlikely. The optimal cut-off seems to be 0.5, with an F1 of 45.9%, balanced accuracy of 79.9%, precision of 32.6% and recall of 78%. The main observation is that the picture is almost the same as for CatBoost, although each metric is about 2 p.p. lower. Both LightGBM and CatBoost therefore seem solid, with CatBoost producing slightly better results.

Multilayer Perceptron | 1st run¶

Multi-Layer Perceptrons are feedforward artificial neural networks consisting of an input layer, usually a few hidden layers, and an output layer. Each neuron in a layer is fully connected to the neurons in the subsequent layer, and the model learns by adjusting weights through backpropagation. This approach is exceptionally good at detecting non-linear relationships, but it requires far more resources than the models used before - the cost was so high that we were only able to run fewer than 1k fits, and even then we waited a long time for results. We focused on optimizing hyperparameters concerning the number of neurons within 5 hidden layers, the activation function, regularisation and so-called early stopping. More precisely, the hyperparameters in the grid are:

  • solver: the weight-optimisation algorithm; only adam was used here.
  • learning_rate_init: as previously; we set a very low value of 0.0001, and other values are tested in the later runs.
  • max_iter: the maximum number of iterations for training; we decided to choose 300, a bit more than default 200.
  • hidden_layer_sizes: larger and deeper layers increase the capacity of the model but may lead to overfitting and longer training times; (400, 400, 400, 400, 400), the largest architecture we proposed, was chosen.
  • activation: logistic - the basic choice, prone to vanishing gradients; tanh - better gradient flow than logistic, especially for deeper networks; relu - the most popular activation for deep learning, offering fast computation and mitigating vanishing gradients. In hindsight, we would probably set relu as the only option (logistic was chosen after validation) and differentiate the grid over other hyperparameters.
  • alpha: the L2 regularization term working as previously; 0.001, so the value in the middle, was chosen.
  • early_stopping: allows training to stop early if validation performance does not improve; False was chosen after validation, although in hindsight we would set it to True.

Since, in our opinion, there was some room for improvement in grid construction, we decided to try a few more MLP runs in the following subsections.
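For orientation, a stand-alone illustration (not the tuned model from the grid below): hidden_layer_sizes is simply a tuple with one entry per hidden layer, and early_stopping sets aside an internal validation split (10% by default) to halt training once the validation score stops improving:

In [ ]:
# illustrative MLP: three hidden layers of 128, 64 and 32 neurons, ReLU activations,
# Adam optimiser, L2 penalty alpha, early stopping on an internal validation split
toy_mlp = MLPClassifier(hidden_layer_sizes=(128, 64, 32), activation='relu',
                        solver='adam', alpha=0.001, early_stopping=True,
                        max_iter=300, random_state=0)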

In [309]:
'''
start_time = time.time()

pipe_mlp = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()),
    ('classifier', MLPClassifier()) 
])

param_grid_mlp = {
    'classifier__solver': ['adam'],
    'classifier__learning_rate_init': [0.0001],
    'classifier__max_iter': [300],
    'classifier__hidden_layer_sizes': [
        (500, 400, 300, 200, 100),
        (400, 400, 400, 400, 400),
        (300, 300, 300, 300, 300),
        (200, 200, 200, 200, 200)
    ],
    'classifier__activation': ['logistic', 'tanh', 'relu'],
    'classifier__alpha': [0.0001, 0.001, 0.005],
    'classifier__early_stopping': [True, False]
}

grid_search_mlp = GridSearchCV(
    estimator=pipe_mlp,
    param_grid=param_grid_mlp,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2
)

grid_search_mlp.fit(X_train, y_train.values.ravel())

best_mlp_model = grid_search_mlp.best_estimator_
print("Best hyperparameters for MLPClassifier:", grid_search_mlp.best_params_)

print("Best cross-validated AUC score for MLPClassifier:", round(grid_search_mlp.best_score_, 3))

end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} s")
'''
print('')

In [310]:
grid_search_mlp = joblib.load("grid_search_mlp.pkl")
In [311]:
# the best model
best_mlp_model = grid_search_mlp.best_estimator_
print("Best hyperparameters for MLPClassifier:", grid_search_mlp.best_params_)

# predictions
y_pred_mlp = best_mlp_model.predict(X_test)
y_pred_prob_mlp = best_mlp_model.predict_proba(X_test)[:, 1]
Best hyperparameters for MLPClassifier: {'classifier__activation': 'logistic', 'classifier__alpha': 0.001, 'classifier__early_stopping': False, 'classifier__hidden_layer_sizes': (400, 400, 400, 400, 400), 'classifier__learning_rate_init': 0.0001, 'classifier__max_iter': 300, 'classifier__solver': 'adam'}
In [312]:
# cutoffs between 0.5 and 0.9 are to be tested
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (MLP 1st run)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)
    row, col = divmod(idx, 3)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # cm plot
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')
    
# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (MLP, 1st run)]
In [313]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_mlp)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: classification reports and metrics for each cutoff (MLP, 1st run)]

Even though not many fits were compared, the results are very good. The cut-off equal to 0.6 seems to be the best trade-off between the number of bad predictions for 0s and 1s. The F1 score equals 43.2%, balanced accuracy exceeds 82%, precision is slightly below 30% and recall is the highest across all models - around 90%.

Again, the precision value is subpar, but we pay more attention to recall: we largely avoid predicting no stroke when a stroke actually occurred. Balanced accuracy is also very impressive, similar to CatBoost. We believe that if more MLPs were trained, this method would give the best results overall.

Multilayer Perceptron | 2nd run¶

In [321]:
'''
param_grid_mlp = {
    'classifier__solver': ['adam'],
    'classifier__learning_rate_init': [0.001, 0.01, 0.05, 0.1],       <-- 3 more learning rates
    'classifier__max_iter': [300],
    'classifier__hidden_layer_sizes': [
        (500, 400, 300, 200, 100),
        (400, 400, 400, 400, 400),
        (300, 300, 300, 300, 300),
        (200, 200, 200, 200, 200)
    ],
    'classifier__activation': ['tanh', 'relu'],                       <-- sigmoid activation function excluded
    'classifier__alpha': [0.0001, 0.005, 0.01],
    'classifier__early_stopping': [True]
}
'''
print('')

In [322]:
grid_search_mlp = joblib.load("grid_search_mlp_2.pkl")
In [323]:
# the best model
best_mlp_model = grid_search_mlp.best_estimator_
print("Best hyperparameters for MLPClassifier:", grid_search_mlp.best_params_)

# predictions
y_pred_mlp = best_mlp_model.predict(X_test)
y_pred_prob_mlp = best_mlp_model.predict_proba(X_test)[:, 1]
Best hyperparameters for MLPClassifier: {'classifier__activation': 'tanh', 'classifier__alpha': 0.005, 'classifier__early_stopping': True, 'classifier__hidden_layer_sizes': (500, 400, 300, 200, 100), 'classifier__learning_rate_init': 0.05, 'classifier__max_iter': 300, 'classifier__solver': 'adam'}
In [324]:
# cutoffs between 0.5 and 0.9 are to be tested
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (MLP 2nd run)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)
    row, col = divmod(idx, 3)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # cm plot
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')
    
# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: confusion matrices for cutoffs 0.50-0.90 (MLP, 2nd run)]
In [325]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff, zero_division=0) # zero_division to avoid warnings (as instructed in the warning)

    f1 = f1_score(y_test, y_pred_cutoff, zero_division=0)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff, zero_division=0)
    recall = recall_score(y_test, y_pred_cutoff, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred_prob_mlp)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: per-cutoff metric reports (MLP, 2nd run)]

Multilayer Perceptron | 3rd run¶

In [327]:
'''
param_grid_mlp = {
    'classifier__solver': ['adam'],
    'classifier__learning_rate_init': [0.001, 0.01, 0.05, 0.1],
    'classifier__max_iter': [300],
    'classifier__hidden_layer_sizes': [
        (512, 256, 128),         # less and less neurons in subsequent layers
        (256, 256, 256),         # symmetrical distribution of neurons
        (128, 256, 128),         # peak in the middle
        (128, 64, 32, 16),       # small architecture in terms of total number of neurons
        (1024, 512),             # only 2 layers but big ones
    ],
    'classifier__activation': ['tanh', 'relu'],
    'classifier__alpha': [0.0001, 0.001, 0.01],
    'classifier__early_stopping': [True, False]
}
'''
print('')

In [328]:
grid_search_mlp = joblib.load("grid_search_mlp_3.pkl")
In [329]:
# the best model
best_mlp_model = grid_search_mlp.best_estimator_
print("Best hyperparameters for MLPClassifier:", grid_search_mlp.best_params_)

# predictions
y_pred_mlp = best_mlp_model.predict(X_test)
y_pred_prob_mlp = best_mlp_model.predict_proba(X_test)[:, 1]
Best hyperparameters for MLPClassifier: {'classifier__activation': 'tanh', 'classifier__alpha': 0.005, 'classifier__early_stopping': True, 'classifier__hidden_layer_sizes': (256, 256, 256), 'classifier__learning_rate_init': 0.05, 'classifier__max_iter': 300, 'classifier__solver': 'adam'}
In [330]:
# cutoffs between 0.5 and 0.9 are to be tested
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (MLP)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)
    row, col = divmod(idx, 3)
    cm = confusion_matrix(y_test, y_pred_cutoff)

    # cm plot
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')
    
# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: Confusion Matrices for Different Cutoff Values (MLP 3rd run)]
In [331]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)

    report = classification_report(y_test, y_pred_cutoff)

    f1 = f1_score(y_test, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
    precision = precision_score(y_test, y_pred_cutoff)
    recall = recall_score(y_test, y_pred_cutoff)
    roc_auc = roc_auc_score(y_test, y_pred_prob_mlp)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: per-cutoff metric reports (MLP, 3rd run)]
In [332]:
# joblib.dump(grid_search_mlp, "grid_search_mlp.pkl")

Summary¶

Below is a summary of the best model from each category.

In [336]:
summary = {
    "Model (cut-off)": ["Random Forest (0.5)", "XGBoost (0.55)", "CatBoost (0.6)", "LightGBM (0.5)", "MLP 1st run (0.6)", "MLP 2nd run (0.7)", "MLP 3rd run (0.5)"],
    "F1": [0.411, 0.432, 0.484, 0.459, 0.432, 0.479, 0.456],
    "Balanced Accuracy": [0.675, 0.673, 0.826, 0.799, 0.821, 0.842, 0.811],
    "Precision": [0.400, 0.478, 0.341, 0.326, 0.284, 0.328, 0.315],
    "Recall": [0.422, 0.394, 0.835, 0.780, 0.899, 0.890, 0.826]
}

summary_df = pd.DataFrame(summary)

# Display the dataframe
summary_df
Out[336]:
Model (cut-off) F1 Balanced Accuracy Precision Recall
0 Random Forest (0.5) 0.411 0.675 0.400 0.422
1 XGBoost (0.55) 0.432 0.673 0.478 0.394
2 CatBoost (0.6) 0.484 0.826 0.341 0.835
3 LightGBM (0.5) 0.459 0.799 0.326 0.780
4 MLP 1st run (0.6) 0.432 0.821 0.284 0.899
5 MLP 2nd run (0.7) 0.479 0.842 0.328 0.890
6 MLP 3rd run (0.5) 0.456 0.811 0.315 0.826
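As a small convenience (not part of the original notebook), the table can also be ranked programmatically. The sketch below assumes the summary_df defined above; the choice of sorting keys is ours.

# hedged convenience: rank the summary table and list the per-metric winners
ranked = summary_df.sort_values(by=["Recall", "Balanced Accuracy"], ascending=False)
print(ranked.to_string(index=False))

for metric in ["F1", "Balanced Accuracy", "Precision", "Recall"]:
    best_row = summary_df.loc[summary_df[metric].idxmax()]
    print(f"Best {metric}: {best_row['Model (cut-off)']} ({best_row[metric]:.3f})")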

Let's load the final model once again.

In [338]:
grid_search_mlp = joblib.load("grid_search_mlp_2.pkl")
best_mlp_model = grid_search_mlp.best_estimator_
y_pred_mlp = best_mlp_model.predict(X_test)
y_pred_prob_mlp = best_mlp_model.predict_proba(X_test)[:, 1]
In [339]:
cutoff = 0.7

y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)
cm = confusion_matrix(y_test, y_pred_cutoff)

report = classification_report(y_test, y_pred_cutoff)
f1 = f1_score(y_test, y_pred_cutoff)
balanced_acc = balanced_accuracy_score(y_test, y_pred_cutoff)
precision = precision_score(y_test, y_pred_cutoff)
recall = recall_score(y_test, y_pred_cutoff)
roc_auc = roc_auc_score(y_test, y_pred_prob_mlp)

full_report = f"Classification Report (Cutoff = {cutoff:.2f})\n\n{report}\n" \
              f"F1 Score: {f1:.3f}\n" \
              f"Balanced Accuracy: {balanced_acc:.3f}\n" \
              f"Precision: {precision:.3f}\n" \
              f"Recall: {recall:.3f}\n" \
              f"AUC: {roc_auc:.3f}"

fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle("The best MLP, cut-off = 0.7", fontsize=18)
sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[0], 
            xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
axes[0].set_title("Confusion Matrix", fontsize=14)
axes[0].set_xlabel("Predicted", fontsize=12)
axes[0].set_ylabel("Actual", fontsize=12)

axes[1].axis("off")
axes[1].text(0.5, 0.5, full_report, ha="center", va="center", wrap=True, fontsize=12)
axes[1].set_title("Metrics Report", fontsize=14)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
print("Best hyperparameters for MLPClassifier:", grid_search_mlp.best_params_)
[Figure: The best MLP, cut-off = 0.7: confusion matrix and metrics report]
Best hyperparameters for MLPClassifier: {'classifier__activation': 'tanh', 'classifier__alpha': 0.005, 'classifier__early_stopping': True, 'classifier__hidden_layer_sizes': (500, 400, 300, 200, 100), 'classifier__learning_rate_init': 0.05, 'classifier__max_iter': 300, 'classifier__solver': 'adam'}

The best of all tested models is the Multilayer Perceptron from the 2nd grid specification presented in the subsection "Multilayer Perceptron | 2nd run"; its hyperparameters are shown above. Our decision rests on two main factors. First, we focused on recall (89%): since this is a medical data set, we want to avoid false negatives, and the final model leaves only 12 of them. Second, balanced accuracy is preferred over plain accuracy when classes are imbalanced, as they are here, and this model exceeds 84%, which is satisfactory. Both values are the highest across all models we built, and the remaining metrics are at least acceptable.
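To make the reasoning behind balanced accuracy explicit, the short sketch below recomputes it by hand from the confusion matrix cm of the final model at the 0.7 cut-off (an illustration only, reusing the objects defined above): balanced accuracy is the mean of sensitivity and specificity, so a high value means the model does well on both the stroke and no-stroke classes despite the imbalance.

# hedged illustration: recompute balanced accuracy by hand from the confusion matrix above
tn, fp, fn, tp = cm.ravel()

sensitivity = tp / (tp + fn)   # recall on the stroke class
specificity = tn / (tn + fp)   # recall on the no-stroke class
balanced_acc_manual = (sensitivity + specificity) / 2

print(f"Sensitivity (recall): {sensitivity:.3f}")
print(f"Specificity:          {specificity:.3f}")
print(f"Balanced accuracy:    {balanced_acc_manual:.3f}")
print(f"False negatives left: {fn}")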

Additional overfitting check¶

In [344]:
pipe = ImbPipeline([
    ('preprocessor', preprocessor),
    ('resampler', ADASYN()), # the data needs resampling (about 95% of 0s)
    ('classifier', MLPClassifier()) # classifier refit with the hyperparameters chosen above
])

# grid
param_grid_mlp = {
    'classifier__activation': ['tanh'],
    'classifier__alpha': [0.005],
    'classifier__early_stopping': [True],
    'classifier__hidden_layer_sizes': [(500, 400, 300, 200, 100)],
    'classifier__learning_rate_init': [0.05],
    'classifier__max_iter': [300],
    'classifier__solver': ['adam']
}
# 3-fold cross-validation over a single candidate, just to refit the chosen configuration
grid_search_mlp = GridSearchCV(pipe, param_grid_mlp, cv=3, verbose=2, n_jobs=-1)
grid_search_mlp.fit(X_train, y_train)
print('')
Fitting 3 folds for each of 1 candidates, totalling 3 fits
E:\anaconda\Lib\site-packages\sklearn\utils\validation.py:1143: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
E:\anaconda\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:1098: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

In [345]:
# the best model
best_mlp_model = grid_search_mlp.best_estimator_

# predictions for the train part
y_pred_mlp = best_mlp_model.predict(X_train)
y_pred_prob_mlp = best_mlp_model.predict_proba(X_train)[:, 1]
In [346]:
# cutoffs between 0.5 and 0.9 are to be tested
cutoffs = np.arange(0.5, 0.95, 0.05)

# I'd like a 3x3 canvas
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
fig.suptitle('Confusion Matrices for Different Cutoff Values (MLP)', fontsize=16)

# confusion matrices and reports for each cutoff
for idx, cutoff in enumerate(cutoffs):
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)
    row, col = divmod(idx, 3)
    cm = confusion_matrix(y_train, y_pred_cutoff)

    # cm plot
    sns.heatmap(cm, annot=True, fmt='d', cmap='Purples', ax=axes[row, col], 
                xticklabels=['No Stroke', 'Stroke'], yticklabels=['No Stroke', 'Stroke'])
    
    axes[row, col].set_title(f'Cutoff = {cutoff:.2f}')
    axes[row, col].set_xlabel('Predicted')
    axes[row, col].set_ylabel('Actual')
    
# layout adjustment
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()
[Figure: Confusion Matrices for Different Cutoff Values (MLP, train set)]
In [347]:
# neat layout by ChatGPT
reports = []

for cutoff in cutoffs:
    y_pred_cutoff = (y_pred_prob_mlp >= cutoff).astype(int)

    report = classification_report(y_train, y_pred_cutoff)

    f1 = f1_score(y_train, y_pred_cutoff)
    balanced_acc = balanced_accuracy_score(y_train, y_pred_cutoff)
    precision = precision_score(y_train, y_pred_cutoff)
    recall = recall_score(y_train, y_pred_cutoff)
    roc_auc = roc_auc_score(y_train, y_pred_prob_mlp)

    full_report = f"Cutoff = {cutoff:.2f}\n\n{report}\n" \
                  f"F1 Score: {f1:.3f}\n" \
                  f"Balanced Accuracy: {balanced_acc:.3f}\n" \
                  f"Precision: {precision:.3f}\n" \
                  f"Recall: {recall:.3f}\n" \
                  f"AUC: {roc_auc:.3f}"
    reports.append(full_report)

rows, cols = 3, 3

while len(reports) < rows * cols:
    reports.append("")  

table = [reports[i:i + cols] for i in range(0, len(reports), cols)]

fig, axes = plt.subplots(rows, cols, figsize=(15, 15))
for i in range(rows):
    for j in range(cols):
        ax = axes[i, j]
        ax.axis("off")
        ax.text(0.5, 0.5, table[i][j], ha="center", va="center", wrap=True, fontsize=10)

plt.tight_layout()
plt.show()
[Figure: per-cutoff metric reports (MLP, train set)]

It turns out that there is no overfitting. The differences in results (slightly worse metrics than before) are most likely due to the sample size. The model seems to follow the same pattern on both the train and test samples.
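To put that claim in numbers, the same metrics can be compared side by side on both splits. The sketch below is illustrative only; it assumes best_mlp_model, X_train/y_train and X_test/y_test as defined above and evaluates at the chosen 0.7 cut-off.

# hedged sketch: the same metrics on both splits at the chosen 0.7 cut-off
cutoff = 0.7
for name, X_part, y_part in [("train", X_train, y_train), ("test", X_test, y_test)]:
    prob = best_mlp_model.predict_proba(X_part)[:, 1]
    pred = (prob >= cutoff).astype(int)
    print(f"{name:>5} | balanced acc = {balanced_accuracy_score(y_part, pred):.3f}"
          f" | recall = {recall_score(y_part, pred, zero_division=0):.3f}"
          f" | AUC = {roc_auc_score(y_part, prob):.3f}")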