# K-Means clustering on the Mall Customers data set
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Load the customer data. The path is machine-specific; point it at wherever
# Mall_Customers.csv lives on your system.
csv_path = r"C:\Users\HAI\Desktop\Kavya\NHCE\Academic Year 22-23(ODD)\Machine Learning\programs\Mall_Customers.csv"
df_cust = pd.read_csv(csv_path)

# Outside a notebook a bare expression displays nothing, so print explicitly.
print(df_cust.head())
# Correlate numeric columns only: DataFrame.corr() raises on pandas >= 2.0
# when the frame contains non-numeric columns (presumably 'Genre' - confirm).
print(df_cust.select_dtypes(include="number").corr())
print(df_cust.columns)

# Drop the CustomerID column; none of the later steps shown here use it.
df_cust.drop(["CustomerID"], axis=1, inplace=True)

# Age vs. spending score (DataFrame.plot.scatter opens its own figure).
df_cust.plot.scatter(x='Age', y='Spending Score (1-100)')

# Open a fresh figure first: seaborn draws on the current axes, so without it
# the 'Genre' bars would land on top of the scatter plot when run as a script.
plt.figure()
sns.countplot(x='Genre', data=df_cust)

# Number of rows per age, on a larger canvas so the bars stay readable.
plt.figure(figsize=(12, 10))
sns.countplot(x='Age', data=df_cust)
#Importing KMeans from sklearn
from sklearn.cluster import KMeans
# Cluster on annual income and spending score only.
X = df_cust[["Annual Income (k$)", "Spending Score (1-100)"]]

# Elbow method to find the number of clusters: fit K-Means for k = 1..10 and
# record the inertia (within-cluster sum of squares, WCSS) of every fit.
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)

# Plot WCSS against k; the "elbow" where the curve flattens suggests a
# reasonable number of clusters.
plt.figure()
plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fit the model and obtain the cluster index of every row in one call;
# fit_predict(X) is documented as equivalent to fit(X) followed by predict(X).
y = km1.fit_predict(X)
# Store the cluster index of each row in a new column named 'label'
df_cust["label"] = y
# Show the data frame with the clustering applied (a bare expression would
# display nothing outside a notebook)
print(df_cust.head())
# Scatter plot of the clusters: one point per row, coloured by its 'label'.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Annual Income (k$)',       # column name must include the '$'
    y='Spending Score (1-100)',
    hue='label',
    data=df_cust,
    palette='deep',               # qualitative palette: labels are categories
)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Spending Score (1-100) vs Annual Income (k$)')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# NOTE(review): from here on the file repeats the load / explore / cluster
# steps that already appear earlier - confirm whether this second copy is
# intentional.
# Load the customer data. The path is machine-specific; point it at wherever
# Mall_Customers.csv lives on your system.
csv_path = r"C:\Users\HAI\Desktop\Kavya\NHCE\Academic Year 22-23(ODD)\Machine Learning\programs\Mall_Customers.csv"
df_cust = pd.read_csv(csv_path)

# Outside a notebook a bare expression displays nothing, so print explicitly.
print(df_cust.head())
# Correlate numeric columns only: DataFrame.corr() raises on pandas >= 2.0
# when the frame contains non-numeric columns (presumably 'Genre' - confirm).
print(df_cust.select_dtypes(include="number").corr())
print(df_cust.columns)

# Drop the CustomerID column; none of the later steps shown here use it.
df_cust.drop(["CustomerID"], axis=1, inplace=True)

# Age vs. spending score (DataFrame.plot.scatter opens its own figure).
df_cust.plot.scatter(x='Age', y='Spending Score (1-100)')

# Open a fresh figure first: seaborn draws on the current axes, so without it
# the 'Genre' bars would land on top of the scatter plot when run as a script.
plt.figure()
sns.countplot(x='Genre', data=df_cust)

# Number of rows per age, on a larger canvas so the bars stay readable.
plt.figure(figsize=(12, 10))
sns.countplot(x='Age', data=df_cust)
#Importing KMeans from sklearn
from sklearn.cluster import KMeans
# Cluster on annual income and spending score only.
X = df_cust[["Annual Income (k$)", "Spending Score (1-100)"]]

# Elbow method to find the number of clusters: fit K-Means for k = 1..10 and
# record the inertia (within-cluster sum of squares, WCSS) of every fit.
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)

# Plot WCSS against k; the "elbow" where the curve flattens suggests a
# reasonable number of clusters.
plt.figure()
plt.plot(range(1, 11), wcss, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Method')
# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fit the model and obtain the cluster index of every row in one call;
# fit_predict(X) is documented as equivalent to fit(X) followed by predict(X).
y = km1.fit_predict(X)
# Store the cluster index of each row in a new column named 'label'
df_cust["label"] = y
# Show the data frame with the clustering applied (a bare expression would
# display nothing outside a notebook)
print(df_cust.head())
# Scatter plot of the clusters: one point per row, coloured by its 'label'.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='Annual Income (k$)',       # column name must include the '$'
    y='Spending Score (1-100)',
    hue='label',
    data=df_cust,
    palette='deep',               # qualitative palette: labels are categories
)
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Spending Score (1-100) vs Annual Income (k$)')
plt.show()