01 - Compare Various ML/AI Techniques to Diabetes Data

This notebook is adapted from the Kaggle notebook at https://www.kaggle.com/code/alhazmi36/uas-ml/notebook

Imports¶

# Import libraries
import pandas as pd
import numpy as np

# For pre-processing and data splitting
from sklearn.model_selection import train_test_split

# For handling data imbalance
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

# To standardize the data
from sklearn.preprocessing import StandardScaler

# For feature selection
from sklearn.feature_selection import RFE

# Machine learning model
from sklearn.ensemble import RandomForestClassifier

# For model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Read the diabetes health-indicator CSV (looked up relative to the working directory).
df = pd.read_csv(filepath_or_buffer='diabetes_binary_health_indicators_BRFSS2015.csv')
# Preview the first five rows.
df.head(n=5)

# Show the column labels.
df.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()
# Summary statistics (count, mean, std, quartiles, min/max) for every column.
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  MentHlth              253680 non-null  float64
 16  PhysHlth              253680 non-null  float64
 17  DiffWalk              253680 non-null  float64
 18  Sex                   253680 non-null  float64
 19  Age                   253680 non-null  float64
 20  Education             253680 non-null  float64
 21  Income                253680 non-null  float64
dtypes: float64(22)
memory usage: 42.6 MB
None
       Diabetes_binary         HighBP       HighChol      CholCheck  \
count    253680.000000  253680.000000  253680.000000  253680.000000   
mean          0.139333       0.429001       0.424121       0.962670   
std           0.346294       0.494934       0.494210       0.189571   
min           0.000000       0.000000       0.000000       0.000000   
25%           0.000000       0.000000       0.000000       1.000000   
50%           0.000000       0.000000       0.000000       1.000000   
75%           0.000000       1.000000       1.000000       1.000000   
max           1.000000       1.000000       1.000000       1.000000   

                 BMI         Smoker         Stroke  HeartDiseaseorAttack  \
count  253680.000000  253680.000000  253680.000000         253680.000000   
mean       28.382364       0.443169       0.040571              0.094186   
std         6.608694       0.496761       0.197294              0.292087   
min        12.000000       0.000000       0.000000              0.000000   
25%        24.000000       0.000000       0.000000              0.000000   
50%        27.000000       0.000000       0.000000              0.000000   
75%        31.000000       1.000000       0.000000              0.000000   
max        98.000000       1.000000       1.000000              1.000000   

        PhysActivity         Fruits  ...  AnyHealthcare    NoDocbcCost  \
count  253680.000000  253680.000000  ...  253680.000000  253680.000000   
mean        0.756544       0.634256  ...       0.951053       0.084177   
std         0.429169       0.481639  ...       0.215759       0.277654   
min         0.000000       0.000000  ...       0.000000       0.000000   
25%         1.000000       0.000000  ...       1.000000       0.000000   
50%         1.000000       1.000000  ...       1.000000       0.000000   
75%         1.000000       1.000000  ...       1.000000       0.000000   
max         1.000000       1.000000  ...       1.000000       1.000000   

             GenHlth       MentHlth       PhysHlth       DiffWalk  \
count  253680.000000  253680.000000  253680.000000  253680.000000   
mean        2.511392       3.184772       4.242081       0.168224   
std         1.068477       7.412847       8.717951       0.374066   
min         1.000000       0.000000       0.000000       0.000000   
25%         2.000000       0.000000       0.000000       0.000000   
50%         2.000000       0.000000       0.000000       0.000000   
75%         3.000000       2.000000       3.000000       0.000000   
max         5.000000      30.000000      30.000000       1.000000   

                 Sex            Age      Education         Income  
count  253680.000000  253680.000000  253680.000000  253680.000000  
mean        0.440342       8.032119       5.050434       6.053875  
std         0.496429       3.054220       0.985774       2.071148  
min         0.000000       1.000000       1.000000       1.000000  
25%         0.000000       6.000000       4.000000       5.000000  
50%         0.000000       8.000000       5.000000       7.000000  
75%         1.000000      10.000000       6.000000       8.000000  
max         1.000000      13.000000       6.000000       8.000000  

[8 rows x 22 columns]

# Count the rows in each target class once, then use the counts for both the
# pie chart and the displayed table.
class_counts = df['Diabetes_binary'].value_counts()
class_counts.plot(kind='pie')
class_counts

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

SPLIT DATA TRAIN & DATA TEST¶

# Separate the predictors from the target column.
X = df.drop(columns='Diabetes_binary')
y = df['Diabetes_binary']

# Hold out 40% of the rows for testing.
# stratify=y keeps the class ratio of the imbalanced target the same in both
# splits; random_state makes the split (and every number derived from it)
# reproducible, matching the seed used for the models below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

Make sure the dataset is balanced¶

from imblearn.combine import SMOTETomek

# Resample the training split only (the test split is left untouched).
# SMOTETomek is given a TomekLinks cleaner configured with
# sampling_strategy='majority'; random_state keeps the resampled set
# reproducible between runs.
smt = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'), random_state=42)
X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)

# Report the class counts on either side of the resampling step.
# The labels are stored as floats (0.0 / 1.0); np.bincount expects integers
# and warns on float input, so cast explicitly.
print('Class distribution before resampling:', np.bincount(y_train.astype(int)))
print('Class distribution after resampling:', np.bincount(y_train_resampled.astype(int)))

# Visualization of class distribution before and after
def plot_class_distribution(y_before, y_after, title_before, title_after):
    """Draw two side-by-side count plots comparing two sets of class labels.

    Args:
        y_before: Labels shown in the left-hand plot.
        y_after: Labels shown in the right-hand plot.
        title_before: Title of the left-hand plot.
        title_after: Title of the right-hand plot.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: distribution before SMOTE-Tomek.
    # seaborn deprecates `palette` without `hue`; assigning the x variable to
    # `hue` with legend=False is the replacement its warning recommends
    # (the `legend` argument needs a seaborn release that emits that warning).
    sns.countplot(x=y_before, hue=y_before, legend=False, ax=ax_before, palette="Set2")
    ax_before.set_title(title_before)
    ax_before.set_xlabel("")
    ax_before.set_ylabel("Jumlah")

    # Right panel: distribution after SMOTE-Tomek.
    sns.countplot(x=y_after, hue=y_after, legend=False, ax=ax_after, palette="Set1")
    ax_after.set_title(title_after)
    ax_after.set_xlabel("Diabetes_binary")
    ax_after.set_ylabel("Jumlah")

    plt.tight_layout()
    plt.show()

# Compare the training labels before SMOTE-Tomek (y_train) with the labels
# after SMOTE-Tomek (y_train_resampled).
plot_class_distribution(
    y_before=y_train,
    y_after=y_train_resampled,
    title_before="Before distribution split",
    title_after="After distribution split",
)

/var/folders/bw/c9j8z20x45s2y20vv6528qjc0000gq/T/ipykernel_77798/1282855888.py:8: DeprecationWarning: Non-integer input passed to bincount. In a future version of NumPy, this will be an error. (Deprecated NumPy 2.1)
  print('Class distribution before resampling:', np.bincount(y_train))
/var/folders/bw/c9j8z20x45s2y20vv6528qjc0000gq/T/ipykernel_77798/1282855888.py:9: DeprecationWarning: Non-integer input passed to bincount. In a future version of NumPy, this will be an error. (Deprecated NumPy 2.1)
  print('Class distribution before resampling:', np.bincount(y_train_resampled))
/var/folders/bw/c9j8z20x45s2y20vv6528qjc0000gq/T/ipykernel_77798/1282855888.py:16: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=y_before, ax=ax[0], palette="Set2")

Class distribution before resampling: [130931  21277]
Class distribution before resampling: [130840 130931]

/var/folders/bw/c9j8z20x45s2y20vv6528qjc0000gq/T/ipykernel_77798/1282855888.py:22: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=y_after, ax=ax[1], palette="Set1")

Feature Selection¶

from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation. Candidate feature
# subsets are compared by cross-validated recall (scoring='recall'), not
# accuracy.
# A fixed random_state on the forest keeps the selected feature set
# reproducible between runs.
estimator2 = RandomForestClassifier(random_state=42)
rfecv = RFECV(estimator=estimator2, step=1, n_jobs=-1, cv=5, scoring='recall')  # 5-fold cross-validation
rfecv = rfecv.fit(X_train_resampled, y_train_resampled)

# Column names of the features RFECV decided to keep.
selected_rfecv = X_train_resampled.columns[rfecv.support_]

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', selected_rfecv)

Optimal number of features : 18
Best features : Index(['HighBP', 'HighChol', 'BMI', 'Smoker', 'HeartDiseaseorAttack',
       'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost',
       'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age',
       'Education', 'Income'],
      dtype='object')

# Restrict both the resampled training set and the untouched test set to the
# columns chosen by RFECV.
X_train_selected_rfecv = X_train_resampled.loc[:, selected_rfecv]
X_test_selected_rfecv = X_test.loc[:, selected_rfecv]

Random Forest¶

# Random Forest hyperparameters, gathered in one place.
rf_params = dict(
    n_estimators=200,
    max_depth=20,
    max_features='sqrt',
    max_leaf_nodes=150,
    min_samples_leaf=1,
    min_samples_split=5,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_params)

# Fit on the resampled training data restricted to the selected features.
rf_model.fit(X_train_selected_rfecv, y_train_resampled)

# Predict the held-out test rows.
y_pred = rf_model.predict(X_test_selected_rfecv)

# Score the test predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(100 * accuracy))
print("Precision: {:.2f}%".format(100 * precision))
print("Recall: {:.2f}%".format(100 * recall))

# Confusion matrix of true labels versus predictions.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Render the matrix as an annotated heatmap on its own 6x4 figure.
rf_fig, rf_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', ax=rf_ax)
rf_ax.set_xlabel('Prediksi')
rf_ax.set_ylabel('Aktual')
rf_ax.set_title('Confusion Matrix')
plt.show()

Accuracy: 83.37%
Precision: 41.10%
Recall: 46.08%

# Accuracy on the (resampled) training data, to compare against the test score.
y_train_pred = rf_model.predict(X_train_selected_rfecv)
train_accuracy = accuracy_score(y_true=y_train_resampled, y_pred=y_train_pred)

# Accuracy on the test data, from the Random Forest predictions in y_pred.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Training Accuracy: {:.2f}%".format(100 * train_accuracy))
print("Test Accuracy: {:.2f}%".format(100 * test_accuracy))

Training Accuracy: 87.63%
Test Accuracy: 83.37%

KNN¶

# Machine learning model
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours settings, gathered in one place.
# NOTE(review): the features are passed to KNN unscaled; StandardScaler is
# imported at the top of the notebook but not applied in the visible cells.
# Consider standardizing before a distance-based model - TODO confirm intent.
knn_params = dict(
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    n_jobs=-1,
)
knn_model = KNeighborsClassifier(**knn_params)

# Fit on the resampled training data restricted to the selected features.
knn_model.fit(X_train_selected_rfecv, y_train_resampled)

# Predict the held-out test rows.
y_pred_knn = knn_model.predict(X_test_selected_rfecv)

# Score the test predictions.
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
precision_knn = precision_score(y_true=y_test, y_pred=y_pred_knn)
recall_knn = recall_score(y_true=y_test, y_pred=y_pred_knn)

print("Accuracy: {:.2f}%".format(100 * accuracy_knn))
print("Precision: {:.2f}%".format(100 * precision_knn))
print("Recall: {:.2f}%".format(100 * recall_knn))

# Confusion matrix of true labels versus predictions.
cf_matrix_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

# Render the matrix as an annotated heatmap on its own 6x4 figure.
knn_fig, knn_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cf_matrix_knn, annot=True, fmt='d', cmap='Blues', ax=knn_ax)
knn_ax.set_xlabel('Prediksi')
knn_ax.set_ylabel('Aktual')
knn_ax.set_title('Confusion Matrix - KNN')
plt.show()

Accuracy: 69.19%
Precision: 25.65%
Recall: 64.37%

# Accuracy on the (resampled) training data, to compare against the test score.
y_train_pred_knn = knn_model.predict(X_train_selected_rfecv)
train_accuracy_knn = accuracy_score(y_true=y_train_resampled, y_pred=y_train_pred_knn)

# Accuracy on the test data, from the KNN predictions in y_pred_knn.
test_accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)

print("Training Accuracy: {:.2f}%".format(100 * train_accuracy_knn))
print("Test Accuracy: {:.2f}%".format(100 * test_accuracy_knn))

Training Accuracy: 88.19%
Test Accuracy: 69.19%

SVM¶

from sklearn.svm import SVC

# RBF-kernel support vector classifier settings, gathered in one place.
# NOTE(review): the features are passed to the SVM unscaled; StandardScaler is
# imported at the top of the notebook but not applied in the visible cells -
# TODO confirm whether standardization was intended.
svm_params = dict(
    kernel='rbf',
    C=1,
    gamma='scale',
    random_state=42,
)
svm_model = SVC(**svm_params)

# Fit on the resampled training data restricted to the selected features.
svm_model.fit(X_train_selected_rfecv, y_train_resampled)

# Predict the held-out test rows.
y_pred_svm = svm_model.predict(X_test_selected_rfecv)

# Score the test predictions; class 1 is treated as the positive class.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
precision_svm = precision_score(y_true=y_test, y_pred=y_pred_svm, pos_label=1)
recall_svm = recall_score(y_true=y_test, y_pred=y_pred_svm, pos_label=1)

print("=== SVM untuk Klasifikasi Risiko Diabetes ===")
print("Accuracy: {:.2f}%".format(100 * accuracy_svm))
print("Precision: {:.2f}%".format(100 * precision_svm))
print("Recall: {:.2f}%".format(100 * recall_svm))

# Confusion matrix of true labels versus predictions.
cf_matrix_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)

# Render the matrix as an annotated heatmap on its own 6x4 figure.
svm_fig, svm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cf_matrix_svm, annot=True, fmt='d', cmap='Blues', ax=svm_ax)
svm_ax.set_xlabel('Prediksi')
svm_ax.set_ylabel('Aktual')
svm_ax.set_title('Confusion Matrix - SVM (Klasifikasi Risiko Diabetes)')
plt.show()

# Accuracy on the (resampled) training data, to compare against the test score.
y_train_pred_svm = svm_model.predict(X_train_selected_rfecv)
train_accuracy_svm = accuracy_score(y_train_resampled, y_train_pred_svm)
print("Training Accuracy: {:.2f}%".format(train_accuracy_svm * 100))

# Accuracy on the test data. This must use the SVM predictions (y_pred_svm);
# y_pred holds the Random Forest predictions and would report the wrong model.
test_accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Test Accuracy: {:.2f}%".format(test_accuracy_svm * 100))

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hard class predictions for each model. These reuse the arrays computed when
# each model was evaluated above instead of running a second inference pass.
y_pred_knn_model = y_pred_knn
y_pred_rf_model = y_pred
y_pred_svm_model = y_pred_svm

# Continuous scores for the positive class. ROC-AUC ranks samples by score, so
# it needs probabilities / decision values; feeding it 0/1 predictions
# collapses the curve to a single operating point.
y_score_knn_model = knn_model.predict_proba(X_test_selected_rfecv)[:, 1]
y_score_rf_model = rf_model.predict_proba(X_test_selected_rfecv)[:, 1]
# SVC is fitted without probability=True, so use its signed decision values.
y_score_svm_model = svm_model.decision_function(X_test_selected_rfecv)

# Evaluate every metric for each model, in the same order as `models`.
models = ['KNN', 'Random Forest', 'SVM']
model_predictions = [y_pred_knn_model, y_pred_rf_model, y_pred_svm_model]
model_scores = [y_score_knn_model, y_score_rf_model, y_score_svm_model]

accuracy = [accuracy_score(y_test, pred) for pred in model_predictions]
precision = [precision_score(y_test, pred) for pred in model_predictions]
recall = [recall_score(y_test, pred) for pred in model_predictions]
f1 = [f1_score(y_test, pred) for pred in model_predictions]
roc_auc = [roc_auc_score(y_test, score) for score in model_scores]

# Data for the visualization: one list of per-model values per metric.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
values = [accuracy, precision, recall, f1, roc_auc]

# Plot the comparison as one bar chart per metric.
fig, axes = plt.subplots(1, len(metrics), figsize=(20, 5))

for axis, metric, metric_values in zip(axes, metrics, values):
    axis.bar(models, metric_values, color=['blue', 'red', 'green'])
    axis.set_title(metric)
    axis.set_ylim([0, 1])
    axis.set_ylabel('Score')
    # Write each value just above its bar.
    for j, val in enumerate(metric_values):
        axis.text(j, val + 0.02, f'{val:.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

KBase (Biology)

Example 4: Abundance Analysis

AI/ML

02 - Dataset Clustering