import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from warnings import filterwarnings
filterwarnings("ignore")
# Load the dataset
file_path = 'Customer_Data.csv'
data = pd.read_csv(file_path)
# Display the first few rows of the dataset to understand its structure
data.head()
| | CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
Data Cleaning and Preprocessing:¶
# Checking for missing values and data types
missing_values = data.isnull().sum()
data_types = data.dtypes
# Summary statistics for detecting outliers
summary_statistics = data.describe()
missing_values, data_types, summary_statistics
Missing values: CREDIT_LIMIT has 1 missing entry and MINIMUM_PAYMENTS has 313; every other column is complete.
Data types: CUST_ID is object; CASH_ADVANCE_TRX, PURCHASES_TRX, and TENURE are int64; the remaining 14 columns are float64.
Summary statistics (8,950 rows): the monetary features are heavily right-skewed, e.g. BALANCE (mean 1564.47, max 19043.14), PURCHASES (mean 1003.20, max 49039.57), CASH_ADVANCE (mean 978.87, max 47137.21), PAYMENTS (mean 1733.14, max 50721.48), and MINIMUM_PAYMENTS (mean 864.21, max 76406.21, 8,637 non-null). CREDIT_LIMIT runs from 50 to 30000 (8,949 non-null), and TENURE from 6 to 12 months with a median of 12.
# Drop the customer ID: it is a unique identifier with no value for clustering
data.drop('CUST_ID', axis=1, inplace=True)
data.head()
| | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
Missing Values:
For CREDIT_LIMIT, since only one value is missing, we can impute it with a central-tendency measure (mean or median).
For MINIMUM_PAYMENTS, with 313 missing values, we must decide whether to impute them (mean, median, or a model-based method) or drop those records.
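If we preferred to keep all 8,950 rows, a median imputation (one of the options above) would look like the sketch below; this notebook instead drops the affected records.
# Alternative (not used below): impute MINIMUM_PAYMENTS with its median
data_imputed = data.copy()
data_imputed['MINIMUM_PAYMENTS'] = data_imputed['MINIMUM_PAYMENTS'].fillna(
    data_imputed['MINIMUM_PAYMENTS'].median()
)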
# Imputing the single missing CREDIT_LIMIT value with the median (robust to the skew seen above)
data['CREDIT_LIMIT'] = data['CREDIT_LIMIT'].fillna(data['CREDIT_LIMIT'].median())
# Removing records with missing MINIMUM_PAYMENTS
data_cleaned = data.dropna(subset=['MINIMUM_PAYMENTS'])
# Checking the shape of the data before and after removing missing values
original_shape = data.shape
cleaned_shape = data_cleaned.shape
original_shape, cleaned_shape
((8950, 17), (8637, 17))
data_cleaned['TENURE'].value_counts()
data_cleaned.head()
| | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.00 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.25 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.00 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.00 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| 5 | 1809.828751 | 1.000000 | 1333.28 | 0.00 | 1333.28 | 0.000000 | 0.666667 | 0.000000 | 0.583333 | 0.00 | 0 | 8 | 1800.0 | 1400.057770 | 2407.246035 | 0.000000 | 12 |
# Selecting numerical columns for plotting
numerical_columns = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
# Plotting boxplots for each numerical column to visualize outliers
plt.figure(figsize=(20, 15))
for i, col in enumerate(numerical_columns):
    plt.subplot(len(numerical_columns) // 3 + 1, 3, i + 1)
    sns.boxplot(y=data_cleaned[col])
    plt.title(col)
plt.tight_layout()
plt.show()
The boxplots show heavily right-skewed distributions on the monetary features, but no values that look like noise or data-entry errors, so no records are removed.
Let's standardize the data¶
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_cleaned)
# Checking the first few rows of the scaled data
scaled_data[:5]
array([[-0.74455592, -0.3693992 , -0.42914833, -0.35913681, -0.35478902, -0.46863463, -0.82060062, -0.68622114, -0.71707755, -0.68200785, -0.47942277, -0.51757308, -0.96257337, -0.54389103, -0.30548157, -0.53768681, 0.35532361], [ 0.76425857, 0.0680405 , -0.47317466, -0.35913681, -0.45880689, 2.56872771, -1.23595801, -0.68622114, -0.9264219 , 0.55703709, 0.09930182, -0.59700728, 0.67728465, 0.79695187, 0.08773463, 0.21245101, 0.35532361], [ 0.42670046, 0.5054802 , -0.11636295, 0.0999561 , -0.45880689, -0.46863463, 1.25618136, 2.64681173, -0.9264219 , -0.68200785, -0.47942277, -0.12040209, 0.81393948, -0.39944714, -0.09986968, -0.53768681, 0.35532361], [-0.37383225, 0.5054802 , -0.46579079, -0.34963633, -0.45880689, -0.46863463, -1.02828056, -0.40846951, -0.9264219 , -0.68200785, -0.47942277, -0.55729018, -0.90791144, -0.38010797, -0.26110226, -0.53768681, 0.35532361], [ 0.0996408 , 0.5054802 , 0.14212329, -0.35913681, 0.99491363, -0.46863463, 0.42546907, -0.68622114, 0.53899358, -0.68200785, -0.47942277, -0.27927048, -0.74392564, -0.13205363, 0.65043775, -0.53768681, 0.35532361]])
# Selecting a subset of the dataset for visualization purposes
# fit_transform returns a NumPy array, so wrap it in a DataFrame before sampling
scaled_df = pd.DataFrame(scaled_data, columns=data_cleaned.columns)
sampled_data = scaled_df.sample(n=100, random_state=42)
# Plotting scatter plots for each pair of features
# seaborn's pairplot draws all pairwise relationships in the dataset
sns.pairplot(sampled_data, diag_kind='kde', plot_kws={'alpha': 0.6, 's': 20, 'edgecolor': 'k'}, height=2)
plt.show()
Based on the pairplot, we take columns 13 and 14 (PAYMENTS and MINIMUM_PAYMENTS) as our x and y for the 2-D views below.
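A quick check of which positional index maps to which feature, since the scaled array has no column names:
# Map positional column indices back to feature names
for i, col in enumerate(data_cleaned.columns):
    print(i, col)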
Model¶
K-Means
kmeans_set = {'init':'random', 'n_init':10, 'max_iter':300, 'random_state':42}
# Finding the optimal number of clusters using the elbow method
inertia = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, **kmeans_set)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)
# Plotting the elbow method graph
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.grid(True)
plt.show()
# Let's also evaluate the silhouette score for different number of clusters
silhouette_scores = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, **kmeans_set)
    kmeans.fit(scaled_data)
    score = silhouette_score(scaled_data, kmeans.labels_)
    silhouette_scores.append(score)
# Plotting the silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.title('Silhouette Scores for Different Number of Clusters')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()
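The same curve can be read programmatically; the candidate k values start at 2, so the argmax needs an offset:
# Pick the k with the highest silhouette score (the range starts at k=2)
best_k = int(np.argmax(silhouette_scores)) + 2
print("best k by silhouette:", best_k)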
#!pip install kneed
from kneed import KneeLocator
kl = KneeLocator(range(1, 11), inertia, curve='convex', direction='decreasing')
kl.elbow
4
# Plotting the elbow method graph
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.axvline(x = kl.elbow, color = 'b', label = 'axvline - full height', ls ='--')
plt.grid(True)
plt.show()
kmeans = KMeans(n_clusters=4).fit(data_cleaned)
centroids = kmeans.cluster_centers_
print(centroids)
plt.scatter(data_cleaned['MINIMUM_PAYMENTS'], data_cleaned['PAYMENTS'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
# Centroid columns 14 and 13 are MINIMUM_PAYMENTS and PAYMENTS, matching the scatter's
# x and y (columns 0 and 1 would be the BALANCE features, which are not what we plotted)
plt.scatter(centroids[:, 14], centroids[:, 13], c='red', s=50)
plt.show()
[[5.98784744e+03 9.58217937e-01 2.65268750e+03 1.61459108e+03 1.03823369e+03 4.76390092e+03 5.24534631e-01 3.10867975e-01 4.10617546e-01 3.35048065e-01 1.09151235e+01 3.36203704e+01 1.11496984e+04 6.79839497e+03 3.31890089e+03 1.04418332e-01 1.17453704e+01]
 [8.38945378e+02 8.75314373e-01 5.38521666e+02 2.65495091e+02 2.73325861e+02 4.97813132e+02 4.59926188e-01 1.37401617e-01 3.52241430e-01 1.11756603e-01 2.34466287e+00 9.96490906e+00 2.28446394e+03 9.58590647e+02 6.04084650e+02 1.54811013e-01 1.14131913e+01]
 [2.08245987e+03 9.20474871e-01 1.40796958e+03 8.75310686e+02 5.33042982e+02 1.09122637e+03 5.62395976e-01 3.20654943e-01 3.90365676e-01 1.43416961e-01 3.45955591e+00 2.01153846e+01 7.53784154e+03 2.02917440e+03 7.72034605e+02 1.79689826e-01 1.17359239e+01]
 [5.40533094e+03 9.57954542e-01 2.72763637e+04 2.18771029e+04 5.39926083e+03 1.55037839e+03 9.09027750e-01 8.42361167e-01 7.20833292e-01 5.90277500e-02 2.50000000e+00 1.27958333e+02 1.60833333e+04 2.79256345e+04 3.26667104e+03 5.11205792e-01 1.19166667e+01]]
data_cleaned
| | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| 5 | 1809.828751 | 1.000000 | 1333.28 | 0.00 | 1333.28 | 0.000000 | 0.666667 | 0.000000 | 0.583333 | 0.000000 | 0 | 8 | 1800.0 | 1400.057770 | 2407.246035 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8943 | 5.871712 | 0.500000 | 20.90 | 20.90 | 0.00 | 0.000000 | 0.166667 | 0.166667 | 0.000000 | 0.000000 | 0 | 1 | 500.0 | 58.644883 | 43.473717 | 0.000000 | 6 |
| 8945 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8947 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |

8637 rows × 17 columns
Note: tkinter ships with the standard CPython installer, so there is no conda package to install; conda install tkinter fails with PackagesNotFoundError. If the import below fails, install the interpreter's Tk bindings instead (e.g. the python3-tk system package on Debian/Ubuntu).
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
kmeans = KMeans(n_clusters=4).fit(data_cleaned)
centroids = kmeans.cluster_centers_
root = tk.Tk()
canvas1 = tk.Canvas(root, width=100, height=100)
canvas1.pack()
label1 = tk.Label(root, text=centroids, justify='center')
canvas1.create_window(70, 50, window=label1)
figure1 = plt.Figure(figsize=(5, 8), dpi=100)
ax1 = figure1.add_subplot(111)
ax1.scatter(data_cleaned['MINIMUM_PAYMENTS'], data_cleaned['PAYMENTS'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
# Centroid columns 14 and 13 are MINIMUM_PAYMENTS and PAYMENTS, matching the scatter axes
ax1.scatter(centroids[:, 14], centroids[:, 13], c='red', s=50)
scatter1 = FigureCanvasTkAgg(figure1, root)
scatter1.get_tk_widget().pack(side=tk.LEFT, fill=tk.BOTH)
root.mainloop()
root = tk.Tk()
canvas1 = tk.Canvas(root, width=400, height=300, relief='raised')
canvas1.pack()
label1 = tk.Label(root, text='K-Means Clustering')
label1.config(font=('helvetica', 14))
canvas1.create_window(200, 25, window=label1)
label2 = tk.Label(root, text='Type Number of Clusters:')
label2.config(font=('helvetica', 8))
canvas1.create_window(200, 120, window=label2)
entry1 = tk.Entry(root)
canvas1.create_window(200, 140, window=entry1)
def getKMeans():
    global data_cleaned
    global number_of_clusters
    number_of_clusters = int(entry1.get())  # read the requested k from the entry widget
    # Refit K-Means with the requested number of clusters
    kmeans = KMeans(n_clusters=number_of_clusters).fit(data_cleaned)
    centroids = kmeans.cluster_centers_
    label3 = tk.Label(root, text=centroids)
    canvas1.create_window(200, 200, window=label3)
    figure1 = plt.Figure(figsize=(4, 3), dpi=100)
    ax1 = figure1.add_subplot(111)
    ax1.scatter(data_cleaned['MINIMUM_PAYMENTS'], data_cleaned['PAYMENTS'], c=kmeans.labels_.astype(float), s=50, alpha=0.5)
    ax1.scatter(centroids[:, 14], centroids[:, 13], c='red', s=50)
    scatter1 = FigureCanvasTkAgg(figure1, root)
    scatter1.get_tk_widget().pack(side=tk.RIGHT, fill=tk.BOTH)
processButton = tk.Button(text=' Process k-Means ', command=getKMeans, bg='brown', fg='white', font=('helvetica', 10, 'bold'))
canvas1.create_window(200, 170, window=processButton)
root.mainloop()
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
# Return (silhouette, Calinski-Harabasz, Davies-Bouldin) for a given labelling of the data
def evaluate_clustering(labels, data):
    silhouette = silhouette_score(data, labels)
    calinski = calinski_harabasz_score(data, labels)
    davies = davies_bouldin_score(data, labels)
    return silhouette, calinski, davies
# Note: these labels come from the k=4 fit on data_cleaned above, evaluated in scaled space
kmeans_scores = evaluate_clustering(kmeans.labels_, scaled_data)
kmeans_scores
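The cell above returns an unlabeled tuple; a small sketch to print each metric with its direction of improvement:
sil, ch, db = kmeans_scores
print(f"silhouette: {sil:.3f} (higher is better, range -1 to 1)")
print(f"calinski_harabasz: {ch:.1f} (higher is better)")
print(f"davies_bouldin: {db:.3f} (lower is better)")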
Affinity Propagation¶
from sklearn.cluster import AffinityPropagation
affinity_propagation = AffinityPropagation(random_state=42)
affinity_labels = affinity_propagation.fit_predict(data_cleaned)
evaluate_clustering(affinity_propagation.labels_, data_cleaned)
(0.1588224370367602, 545.3048259421168, 1.0123695722157848)
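Affinity Propagation picks the number of clusters itself, so it is worth checking how many exemplars it settled on:
# Each exemplar indexes one cluster centre chosen by Affinity Propagation
n_ap_clusters = len(affinity_propagation.cluster_centers_indices_)
print("Affinity Propagation found", n_ap_clusters, "clusters")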
Mean Shift¶
from sklearn.cluster import MeanShift, estimate_bandwidth
# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(data_cleaned, quantile=0.2, n_samples=100)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(data_cleaned)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
number of estimated clusters : 154
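154 clusters is far too many to interpret, which suggests the estimated bandwidth is too narrow. A sketch of the usual remedy, re-estimating with a larger quantile (0.5 here is an illustrative value, not a tuned one):
# A larger quantile widens the kernel, which merges nearby modes into fewer clusters
wide_bandwidth = estimate_bandwidth(data_cleaned, quantile=0.5, n_samples=500)
ms_wide = MeanShift(bandwidth=wide_bandwidth, bin_seeding=True).fit(data_cleaned)
print("clusters with the wider bandwidth:", len(np.unique(ms_wide.labels_)))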
plt.figure(1)
plt.clf()
colors = ["#dede00", "#377eb8", "#f781bf"]
markers = ["x", "o", "^"]
labels = ms.labels_
# A DataFrame cannot be indexed by a boolean mask plus a column position, so use the array
X = data_cleaned.to_numpy()
# Only the first three of the estimated clusters are drawn (zip stops at the shorter input)
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], markers[k], color=col)
    plt.plot(
        cluster_center[0],
        cluster_center[1],
        markers[k],
        markerfacecolor=col,
        markeredgecolor="k",
        markersize=14,
    )
plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()
Spectral Clustering¶
from sklearn.cluster import SpectralClustering
spectral_clustering = SpectralClustering(n_clusters=4, random_state=42)
spectral_labels = spectral_clustering.fit_predict(scaled_data)
# Evaluate against the data the labels describe, not the estimator object
evaluate_clustering(spectral_labels, scaled_data)
Hierarchical Clustering¶
from sklearn.cluster import AgglomerativeClustering
hierarchical = AgglomerativeClustering(n_clusters=4)
hierarchical_labels = hierarchical.fit_predict(scaled_data)
evaluate_clustering(hierarchical_labels, scaled_data)
DBSCAN¶
Determining Optimal Parameters for DBSCAN¶
from sklearn.neighbors import NearestNeighbors
scaled_features = scaled_data
# Compute each point's distance to its nearest neighbour over all scaled features
neighbors = NearestNeighbors(n_neighbors=2)
neighbors_fit = neighbors.fit(scaled_features)
distances, indices = neighbors_fit.kneighbors(scaled_features)
# Sort and plot distances
distances = np.sort(distances, axis=0)
distances = distances[:, 1]
plt.plot(distances)
plt.title("Distance to Nearest Neighbor")
plt.xlabel("Data points sorted by distance")
plt.ylabel("Epsilon")
plt.show()
Look for the point where the distance begins to rise sharply (the "elbow"); it is a good estimate for eps. We can also locate it programmatically, as sketched below.
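Reusing KneeLocator from the elbow-method section on the sorted k-distance curve (a sketch; the result should still be sanity-checked against the plot):
# The sorted k-distance curve is convex and increasing, so locate its knee
kl_eps = KneeLocator(range(len(distances)), distances, curve='convex', direction='increasing')
estimated_eps = distances[kl_eps.elbow] if kl_eps.elbow is not None else 0.5
print("estimated eps:", estimated_eps)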
Clustering with DBSCAN
from sklearn.cluster import DBSCAN
# Use the estimated eps value and a suitable min_samples
dbscan = DBSCAN(eps=0.5, min_samples=2000)
dbscan_labels = dbscan.fit_predict(scaled_features)
Evaluating the Results¶
DBSCAN doesn't take a fixed number of clusters like K-means. Instead, it forms clusters based on density, and points that don't belong to any cluster are labelled -1 (noise). You can evaluate the results by:
Counting the number of clusters and noise points (see the sketch below).
Checking the spread and size of the clusters.
Using the silhouette score, although it is less informative for DBSCAN than for other algorithms because noise points do not form a true cluster.
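A minimal sketch of the first check, counting clusters and noise points from the label array:
# Label -1 marks noise, so it is excluded from the cluster count
n_clusters_db = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = int(np.sum(dbscan_labels == -1))
print(f"DBSCAN found {n_clusters_db} clusters and {n_noise} noise points")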
evaluate_clustering(dbscan.labels_, data_cleaned)
(0.4868922360554868, 1768.8963154477767, 1.7107184815587702)
dbscan_labels
array([ 0, -1, 0, ..., 0, 0, 0])
Visualization¶
# Plotting
plt.figure(figsize=(10, 6))
unique_labels = set(dbscan_labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise
        col = [0, 0, 0, 1]
    class_member_mask = (dbscan_labels == k)
    xy = scaled_features[class_member_mask]
    # Columns 13 and 14 are PAYMENTS and MINIMUM_PAYMENTS, matching the axis labels below
    plt.plot(xy[:, 13], xy[:, 14], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6)
plt.title("DBSCAN Clustering")
plt.xlabel("PAYMENTS")
plt.ylabel("MINIMUM_PAYMENTS")
plt.show()
HDBSCAN¶
import hdbscan
hdbscan_cluster = hdbscan.HDBSCAN(min_cluster_size=5)
hdbscan_labels = hdbscan_cluster.fit_predict(scaled_data)
evaluate_clustering(hdbscan_labels, scaled_data)
OPTICS¶
from sklearn.cluster import OPTICS
optics = OPTICS(min_samples=5)
optics_labels = optics.fit_predict(scaled_data)
evaluate_clustering(optics_labels, scaled_data)
BIRCH¶
from sklearn.cluster import Birch
birch = Birch(n_clusters=4)
birch_labels = birch.fit_predict(scaled_data)
evaluate_clustering(birch_labels, scaled_data)
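To compare the algorithms side by side, the evaluate_clustering tuples can be collected into a DataFrame. Only kmeans_scores is actually stored above; the commented entries use hypothetical variable names and assume the corresponding results were saved the same way:
# Hypothetical collection of stored (silhouette, calinski_harabasz, davies_bouldin) tuples
scores = {
    'KMeans': kmeans_scores,
    # 'Spectral': spectral_scores,          # assumed saved from the Spectral Clustering cell
    # 'Hierarchical': hierarchical_scores,  # assumed saved from the Hierarchical Clustering cell
}
comparison = pd.DataFrame(scores, index=['silhouette', 'calinski_harabasz', 'davies_bouldin']).T
print(comparison)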
Evaluation¶
# Extracting and scaling the specific features
features = data_cleaned[['PAYMENTS', 'MINIMUM_PAYMENTS']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# Apply K-means
kmeans = KMeans(n_clusters=3)
kmeans_labels = kmeans.fit_predict(scaled_features)
centroids = kmeans.cluster_centers_
# Plotting
plt.figure(figsize=(10, 6))
plt.scatter(scaled_features[:, 0], scaled_features[:, 1], c=kmeans_labels, cmap='viridis', marker='o', edgecolor='k', s=50, alpha=0.7)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x', s=200)
# Decision boundaries (optional, for visualization)
x_min, x_max = scaled_features[:, 0].min() - 1, scaled_features[:, 0].max() + 1
y_min, y_max = scaled_features[:, 1].min() - 1, scaled_features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.2, cmap='viridis')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-means Clustering on 'PAYMENTS' and 'MINIMUM_PAYMENTS'")
plt.xlabel('PAYMENTS (scaled)')
plt.ylabel('MINIMUM_PAYMENTS (scaled)')
plt.show()