# Simple KNN For Multi-Class Classification, rendered as a notebook with key aspects explained along the way.
# Importing the required libraries
import seaborn as sns
import pandas
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
# Loading the dataset from Google Drive
data_diabetes = pandas.read_csv('/content/drive/MyDrive/ML&Big_Data/diabetes.csv')
# Describing the dataframe: count, mean, standard deviation,
# quartile values, and the minimum/maximum of each column
data_diabetes.describe()
 | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | HeartDiseaseorAttack | PhysActivity | Fruits | Veggies | ... | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | Diabetes
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | ... | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 |
mean | 0.429001 | 0.424121 | 0.962670 | 28.382364 | 0.443169 | 0.040571 | 0.094186 | 0.756544 | 0.634256 | 0.811420 | ... | 0.084177 | 2.511392 | 3.184772 | 4.242081 | 0.168224 | 0.440342 | 8.032119 | 5.050434 | 6.053875 | 0.296921 |
std | 0.494934 | 0.494210 | 0.189571 | 6.608694 | 0.496761 | 0.197294 | 0.292087 | 0.429169 | 0.481639 | 0.391175 | ... | 0.277654 | 1.068477 | 7.412847 | 8.717951 | 0.374066 | 0.496429 | 3.054220 | 0.985774 | 2.071148 | 0.698160 |
min | 0.000000 | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 1.000000 | 24.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 4.000000 | 5.000000 | 0.000000 |
50% | 0.000000 | 0.000000 | 1.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 5.000000 | 7.000000 | 0.000000 |
75% | 1.000000 | 1.000000 | 1.000000 | 31.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 0.000000 | 3.000000 | 2.000000 | 3.000000 | 0.000000 | 1.000000 | 10.000000 | 6.000000 | 8.000000 | 0.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 98.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 5.000000 | 30.000000 | 30.000000 | 1.000000 | 1.000000 | 13.000000 | 6.000000 | 8.000000 | 2.000000 |
8 rows × 22 columns
data_diabetes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   HighBP                253680 non-null  int64
 1   HighChol              253680 non-null  int64
 2   CholCheck             253680 non-null  int64
 3   BMI                   253680 non-null  int64
 4   Smoker                253680 non-null  int64
 5   Stroke                253680 non-null  int64
 6   HeartDiseaseorAttack  253680 non-null  int64
 7   PhysActivity          253680 non-null  int64
 8   Fruits                253680 non-null  int64
 9   Veggies               253680 non-null  int64
 10  HvyAlcoholConsump     253680 non-null  int64
 11  AnyHealthcare         253680 non-null  int64
 12  NoDocbcCost           253680 non-null  int64
 13  GenHlth               253680 non-null  int64
 14  MentHlth              253680 non-null  int64
 15  PhysHlth              253680 non-null  int64
 16  DiffWalk              253680 non-null  int64
 17  Sex                   253680 non-null  int64
 18  Age                   253680 non-null  int64
 19  Education             253680 non-null  int64
 20  Income                253680 non-null  int64
 21  Diabetes              253680 non-null  int64
dtypes: int64(22)
memory usage: 42.6 MB
## Dataframe Analysis of Values
We can see that the provided dataset has no null values, which is good: it suggests the data has already been pre-processed to some extent. Even so, we will still perform feature scaling with `StandardScaler` so that the features we use as predictors are on comparable scales and contribute appropriately to the model's final prediction. With that said, let's move on.
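As a quick illustration of what `StandardScaler` does under the hood, here is a minimal sketch on a few made-up BMI values (not taken from the dataset): it subtracts each column's mean and divides by its standard deviation.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Hypothetical BMI values, just to illustrate the z-score transform
bmi = np.array([[24.0], [27.0], [31.0], [45.0]])

scaler = StandardScaler()
scaled = scaler.fit_transform(bmi)

# Equivalent manual computation: z = (x - mean) / std
manual = (bmi - bmi.mean()) / bmi.std()
print(np.allclose(scaled, manual))  # True
```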
# Inspecting the unique values of the 'Diabetes' label column
data_diabetes['Diabetes'].unique()
array([0, 2, 1])
## Unique Classification Values
The output above shows that the label takes one of three classes. As a working assumption, I'll treat `0` as no-diabetes, `1` as type 1 diabetic, and `2` as type 2 diabetic. Now we can start explaining what KNN will do in this multi-class classification model.
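For readability later on, one could keep this assumed mapping in a small dictionary (a hypothetical helper, not part of the original cells) and check how the classes are distributed:

```python
# Assumed mapping of label codes to class names (author's interpretation)
label_names = {0: "No Diabetes", 1: "Type 1 Diabetic", 2: "Type 2 Diabetic"}

# Class distribution of the label column, with readable names
print(data_diabetes['Diabetes'].value_counts().rename(index=label_names))
```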
## Objective 1: How KNN Works
To my understanding, KNN works by calculating the distance between the data points the model has been fitted on and each new instance, which in our case comes from the test set the model has never seen. By computing the Euclidean distance to its nearest neighbours, it can predict whether a new instance should be classified as no-diabetes, type 1 diabetic, or type 2 diabetic. After training, when a new test-set instance is fed into the features the model generalizes on, the model should assign that instance a predicted value of `0`, `1`, or `2`, according to our mapping of features to labels.
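To make that concrete, here is a minimal from-scratch sketch of the idea (illustrative only; the notebook itself relies on sklearn's `KNeighborsClassifier`, and `knn_predict` and the toy data below are hypothetical): compute the Euclidean distance from a query point to every training point, take the k closest, and predict by majority vote.

```python
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_query, k=7):
    # Euclidean distance from the query point to every training point
    distances = np.sqrt(((X_train - x_query) ** 2).sum(axis=1))
    # Indices of the k nearest neighbours
    nearest = np.argsort(distances)[:k]
    # Majority vote among their labels (0, 1, or 2 in our case)
    return Counter(y_train[nearest]).most_common(1)[0][0]

# Tiny made-up example with two features
X_train = np.array([[0.1, 0.2], [0.2, 0.1], [2.0, 2.1], [2.1, 2.0]])
y_train = np.array([0, 0, 2, 2])
print(knn_predict(X_train, y_train, np.array([0.15, 0.15]), k=3))  # -> 0
```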
## Feature Scaling
Now, we can run the following code block to scale the features before splitting, and to store the values of the label column `Diabetes` in a separate object. That label object is what we later compare our predictions against.
# Feature scaling code block
unscaled_features = data_diabetes.drop(columns=['Diabetes'])
label = data_diabetes['Diabetes']
scaler_metric = StandardScaler()
scaled_features = scaler_metric.fit_transform(unscaled_features)
scaled_features.view()
array([[ 1.15368814,  1.16525449,  0.19692156, ...,  0.31690008, -1.06559465, -1.4744874 ],
       [-0.86678537, -0.85818163, -5.07816412, ..., -0.33793279,  0.96327159, -2.44013754],
       [ 1.15368814,  1.16525449,  0.19692156, ...,  0.31690008, -1.06559465,  0.93963796],
       ...,
       [-0.86678537, -0.85818163,  0.19692156, ..., -1.97501498, -0.05116153, -1.95731247],
       [ 1.15368814, -0.85818163,  0.19692156, ..., -0.33793279, -0.05116153, -2.44013754],
       [ 1.15368814,  1.16525449,  0.19692156, ...,  0.31690008,  0.96327159, -1.95731247]])
# Grid search over test-set sizes and neighbor counts
test_vals = [0.15, 0.2, 0.25, 0.3]
neighbor_values = [2, 5, 7, 10]
for value in test_vals:
    features_tr, features_te, labels_tr, labels_te = train_test_split(scaled_features, label, test_size=value, random_state=42)
    for iteration in neighbor_values:
        knnmodel = KNeighborsClassifier(n_neighbors=iteration)
        knnmodel.fit(features_tr, labels_tr)
        pred = knnmodel.predict(features_te)
        pred_accuracy = accuracy_score(labels_te, pred)
        print(f"This is the subset of {value:.0%} Testing Set")
        print(f"The Accuracy of\t`{iteration}`\tN-Neighbor Value is {pred_accuracy:.2f}")
        confusion = confusion_matrix(labels_te, pred)
        plt.figure(figsize=(6, 4))
        sns.heatmap(confusion, annot=True, cmap='Blues', fmt='g',
                    xticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"],
                    yticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"])
        plt.title("Confusion Matrix For KNN")
        plt.xlabel("Predicted Label")
        plt.ylabel("True Label")
        plt.show()
        print("\n")
This is the subset of 15% Testing Set The Accuracy of `2` N-Neighbor Value is 0.83
This is the subset of 15% Testing Set The Accuracy of `5` N-Neighbor Value is 0.83
This is the subset of 15% Testing Set The Accuracy of `7` N-Neighbor Value is 0.84
This is the subset of 15% Testing Set The Accuracy of `10` N-Neighbor Value is 0.84
This is the subset of 20% Testing Set The Accuracy of `2` N-Neighbor Value is 0.83
This is the subset of 20% Testing Set The Accuracy of `5` N-Neighbor Value is 0.83
This is the subset of 20% Testing Set The Accuracy of `7` N-Neighbor Value is 0.84
This is the subset of 20% Testing Set The Accuracy of `10` N-Neighbor Value is 0.84
This is the subset of 25% Testing Set The Accuracy of `2` N-Neighbor Value is 0.83
This is the subset of 25% Testing Set The Accuracy of `5` N-Neighbor Value is 0.83
This is the subset of 25% Testing Set The Accuracy of `7` N-Neighbor Value is 0.83
This is the subset of 25% Testing Set The Accuracy of `10` N-Neighbor Value is 0.84
This is the subset of 30% Testing Set The Accuracy of `2` N-Neighbor Value is 0.83
This is the subset of 30% Testing Set The Accuracy of `5` N-Neighbor Value is 0.83
This is the subset of 30% Testing Set The Accuracy of `7` N-Neighbor Value is 0.84
This is the subset of 30% Testing Set The Accuracy of `10` N-Neighbor Value is 0.84
## Optimal K-Neighbors Value & Test Subset
After multiple iterations, we determined that the best-performing split was a `test_size` of 20% (leaving 80% for training), with `n_neighbors` = 7. I've chosen 7 instead of 10 because we're already at the elbow of the curve, and pushing further just wastes runtime and resources. For good measure, I've run this configuration again with its own plot below, and then compared it against the two weighting methods provided by the `sklearn` library.
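Before that, as an optional check, the elbow itself can be visualized by scanning a range of k values on the fixed 20% split. This is a minimal sketch reusing the variables defined earlier; the range of k values is an arbitrary choice, not something tuned in this notebook.

```python
# Accuracy vs. number of neighbours on the 20% test split (illustrative elbow plot)
features_tr, features_te, labels_tr, labels_te = train_test_split(
    scaled_features, label, test_size=0.2, random_state=42)

k_range = range(1, 16)  # hypothetical range of k values to scan
accuracies = []
for k in k_range:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(features_tr, labels_tr)
    accuracies.append(accuracy_score(labels_te, model.predict(features_te)))

plt.plot(list(k_range), accuracies, marker='o')
plt.xlabel("n_neighbors (k)")
plt.ylabel("Test accuracy")
plt.title("Accuracy vs. k (elbow check)")
plt.show()
```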
features_tr, features_te, labels_tr, labels_te = train_test_split(scaled_features, label, test_size=0.2, random_state=42)
optimal_k = KNeighborsClassifier(n_neighbors=7)
optimal_k.fit(features_tr, labels_tr)
optimal_pred = optimal_k.predict(features_te)
optimal_accuracy = accuracy_score(labels_te, optimal_pred)
print(f"This is the subset of 0.20% Testing Set")
print(f"The Accuracy of 7 N-Neighbor Value is {optimal_accuracy:.2f}")
confusion = confusion_matrix(labels_te, optimal_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(confusion, annot=True, cmap='Blues', fmt='g',
xticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"],
yticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"])
plt.title("Confusion Matrix For KNN")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
print("\n")
This is the subset of 20% Testing Set The Accuracy of 7 N-Neighbor Value is 0.84
print(optimal_accuracy)
0.8354225796278777
## Weighted KNN For Optimal Value of K-Neighbors and Test Subset
Now that we've decided on the optimal K-Neighbors value and the exact test subset, we can set the `weights` parameter of the KNN classifier to each of its two options, `uniform` and `distance`, to determine whether weighting by distance yields a better prediction than the unweighted (uniform) model.
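For intuition, `weights='distance'` makes closer neighbours count more by weighting each neighbour's vote by the inverse of its distance. The toy calculation below (made-up distances and labels, not from the dataset) shows how the two settings can disagree:

```python
import numpy as np

# Hypothetical distances and labels of the 7 nearest neighbours of one query point
distances = np.array([0.2, 0.25, 0.3, 1.5, 1.6, 1.8, 2.0])
labels    = np.array([0,   0,    0,   2,   2,   2,   2  ])

# 'uniform': every neighbour votes equally -> class 2 wins 4 votes to 3
uniform_vote = np.bincount(labels).argmax()

# 'distance': each vote is weighted by 1/distance -> the three much closer
# class-0 neighbours outweigh the four distant class-2 neighbours
weights = 1.0 / distances
weighted_scores = {c: weights[labels == c].sum() for c in np.unique(labels)}
distance_vote = max(weighted_scores, key=weighted_scores.get)

print(uniform_vote, distance_vote)  # -> 2 0
```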
features_tr, features_te, labels_tr, labels_te = train_test_split(scaled_features, label, test_size=0.2, random_state=42)
weights_list = ['uniform', 'distance']
for weight in weights_list:
    weighted_knn = KNeighborsClassifier(n_neighbors=7, weights=weight)
    weighted_knn.fit(features_tr, labels_tr)
    weighted_pred = weighted_knn.predict(features_te)
    weighted_pred_accuracy = accuracy_score(labels_te, weighted_pred)
    print(f"This is the subset of 20% Testing Set using the {weight} weighting method")
    print(f"The Accuracy of 7 N-Neighbor value with the {weight} method is {weighted_pred_accuracy:.2f}")
    confusion = confusion_matrix(labels_te, weighted_pred)
    plt.figure(figsize=(6, 4))
    sns.heatmap(confusion, annot=True, cmap='Blues', fmt='g',
                xticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"],
                yticklabels=["No Diabetes", "Type 1 Diabetic", "Type 2 Diabetic"])
    plt.title("Confusion Matrix For KNN")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()
    print("\n")
This is the subset of 20% Testing Set using the uniform weighting method The Accuracy of 7 N-Neighbor value with the uniform method is 0.84
## Comparison Between Models (Unweighted & Weighted)
On my particular run there was no major difference in the output, although the `uniform` weighting method came out about 0.1% higher in accuracy than the `distance` weighting method. My conclusion is that to improve this model I would need to revisit the feature engineering process and explore other implementations: for example, importing `VectorAssembler` from pyspark to assemble all feature values into a single vector, or dropping columns such as `Education`, which may not correlate with the label as strongly as `Age` does. That in turn would reduce model bias, eventually leading to more informed decisions and, hopefully, a better-generalizing model. Changing the classifier itself, to try out newer schemes and transformations, is another option.
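As a quick sketch of one of those ideas, dropping the `Education` column and retraining the tuned model would look roughly like this (a hypothetical follow-up, not something run in this notebook):

```python
# Hypothetical follow-up: drop 'Education', rescale, and retrain the tuned model
reduced_features = data_diabetes.drop(columns=['Diabetes', 'Education'])
reduced_scaled = StandardScaler().fit_transform(reduced_features)

f_tr, f_te, l_tr, l_te = train_test_split(reduced_scaled, label, test_size=0.2, random_state=42)

reduced_knn = KNeighborsClassifier(n_neighbors=7)
reduced_knn.fit(f_tr, l_tr)
print(f"Accuracy without 'Education': {accuracy_score(l_te, reduced_knn.predict(f_te)):.2f}")
```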
;)
Thanks, I hope you like this notebook.
!jupyter nbconvert --to html /content/drive/MyDrive/Colab-Notebooks/simple-knn-mcc.ipynb