# K-Means clustering of mall customers (Kmeans - Posts - OneCompiler)

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Load the mall-customers dataset and take a first look at it.
# NOTE: the bare expressions below (head/corr/columns) only display a result
# in an interactive session or notebook; in a plain script they are evaluated
# and discarded.
df_cust = pd.read_csv(r"C:\Users\HAI\Desktop\Kavya\NHCE\Academic Year 22-23(ODD)\Machine Learning\programs\Mall_Customers.csv")
df_cust.head()

# Correlate the numeric columns only: the frame still contains the string
# column 'Genre', and pandas >= 2.0 raises when corr() meets non-numeric data.
# select_dtypes() works on every pandas version (numeric_only= does not).
df_cust.select_dtypes(include="number").corr()
df_cust.columns

# CustomerID is dropped in place; nothing below refers to it.
df_cust.drop(["CustomerID"], axis=1, inplace=True)

# Age vs. spending score (pandas creates its own figure for this plot).
df_cust.plot.scatter(x='Age', y='Spending Score (1-100)')

# Start a fresh figure so the count plot is not drawn on top of the scatter
# axes above (seaborn draws on the current axes when none is given).
plt.figure()
sns.countplot(x='Genre', data=df_cust)

# Number of customers per age, on a larger canvas so the tick labels fit.
plt.figure(figsize=(12, 10))
sns.countplot(x='Age', data=df_cust)

#Importing KMeans from sklearn
from sklearn.cluster import KMeans

# Features used for clustering: annual income and spending score.
X = df_cust[["Annual Income (k$)", "Spending Score (1-100)"]]

# Elbow method to find the number of clusters: fit K-Means for k = 1..10 and
# record each model's inertia (within-cluster sum of squares) in `wcss`.
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)

# Final model with 5 clusters (presumably the value suggested by the elbow
# analysis above -- the script does not show how 5 was chosen).
# fit() returns the estimator itself, so construction and fitting are chained.
km1 = KMeans(n_clusters=5).fit(X)

# Cluster index assigned to each row of the input data.
y = km1.predict(X)

# Store the cluster index next to the original columns.
df_cust["label"] = y

# Peek at the dataframe now that it carries the cluster labels.
df_cust.head()

# Scatter plot of the clusters: annual income vs. spending score, one colour
# per cluster label (the palette has five entries, one for each of the 5
# clusters fitted above).
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Annual Income (k$)',  # must match the dataframe column name exactly
    y='Spending Score (1-100)',
    hue="label",
    palette=['green', 'orange', 'brown', 'dodgerblue', 'red'],
    legend='full',
    data=df_cust,
    s=60,
)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Spending Score (1-100) vs Annual Income (k$)')
plt.show()