# DE_ASS3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
def load_dataset(file_path):
    """Load the dataset from a CSV file."""
    try:
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        return data
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None
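# A minimal usage sketch (the file name here is hypothetical, not part of the
# assignment):
#   df = load_dataset("students.csv")
#   if df is not None:
#       display_dataset(df)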
def display_dataset(data):
    """Display basic details of the dataset."""
    print("\nDataset Overview:")
    print(data.head())
    print("\nShape of the dataset:", data.shape)
    print("\nDataset Information:")
    # DataFrame.info() prints directly and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    data.info()
def frequency_view(data, column):
    """View the frequency of each value in a specified column."""
    if column in data.columns:
        print(f"\nFrequency of each value in the column '{column}':")
        print(data[column].value_counts())
    else:
        print(f"Column '{column}' does not exist in the dataset.")
def identify_duplicates(data):
    """Identify the number of duplicate rows in the dataset."""
    duplicate_count = data.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicate_count}")
    return duplicate_count
def remove_duplicates(data):
    """Remove duplicate rows from the dataset."""
    before = data.shape[0]
    data = data.drop_duplicates()
    after = data.shape[0]
    print(f"\nRemoved {before - after} duplicate rows.")
    return data
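# Note on semantics: duplicated() and drop_duplicates() mark a row as a
# duplicate only of an identical earlier row (keep='first' by default), and
# drop_duplicates() returns a new DataFrame rather than modifying in place,
# which is why callers must rebind the result (data = remove_duplicates(data)).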
def plot_histogram(data, column):
    """Plot a histogram for the specified column."""
    if column in data.columns:
        # Drop missing values so NaNs do not interfere with the binning.
        plt.hist(data[column].dropna(), bins=20, edgecolor='black', alpha=0.7)
        plt.title(f"Histogram of {column}")
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.show()
    else:
        print(f"Column '{column}' does not exist in the dataset.")
def correlation_analysis(data):
    """Perform correlation analysis and plot a heatmap."""
    # Restrict to numeric columns; since pandas 2.0, corr() raises on
    # non-numeric (object) columns unless numeric_only=True is passed.
    correlation_matrix = data.corr(numeric_only=True)
    print("\nCorrelation Matrix:")
    print(correlation_matrix)
    plt.figure(figsize=(10, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Heatmap")
    plt.show()
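# For reference, corr() computes the Pearson correlation coefficient by
# default; values range from -1 (perfect inverse linear relation) through 0
# (no linear relation) to +1 (perfect direct linear relation).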
def data_transformation(data, column):
    """Perform a log transformation on the specified column."""
    if column in data.columns:
        # log1p computes log(1 + x), avoiding log(0) for zero-valued entries.
        data[f"{column}_log"] = np.log1p(data[column])
        print(f"\nPerformed log transformation on column '{column}'.")
    else:
        print(f"Column '{column}' does not exist in the dataset.")
    return data
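# Worked example of the transform (standard numpy behavior):
#   np.log1p(0)  -> 0.0
#   np.log1p(99) -> ~4.605  (i.e. ln(100))
# log1p is commonly used to compress right-skewed, non-negative data.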
def feature_encoding(data, column):
    """Perform feature encoding on a categorical column."""
    if column in data.columns:
        data[column] = pd.Categorical(data[column])
        data[f"{column}_encoded"] = data[column].cat.codes
        print(f"\nPerformed feature encoding on column '{column}'.")
    else:
        print(f"Column '{column}' does not exist in the dataset.")
    return data
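# cat.codes assigns integers 0..n-1 following the sorted order of the
# categories, with -1 reserved for missing values, e.g.:
#   pd.Categorical(['b', 'a', None, 'b']).codes -> array([1, 0, -1, 1])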
def data_normalization(data, column, method="minmax"):
    """Perform data normalization using Min-Max or Z-score scaling."""
    if column in data.columns:
        if method == "minmax":
            scaler = MinMaxScaler()
            data[f"{column}_minmax"] = scaler.fit_transform(data[[column]])
            print(f"\nPerformed Min-Max normalization on column '{column}'.")
        elif method == "zscore":
            scaler = StandardScaler()
            data[f"{column}_zscore"] = scaler.fit_transform(data[[column]])
            print(f"\nPerformed Z-score normalization on column '{column}'.")
        else:
            print(f"Unknown normalization method '{method}'; expected 'minmax' or 'zscore'.")
    else:
        print(f"Column '{column}' does not exist in the dataset.")
    return data
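# The two scalings correspond to the standard formulas:
#   Min-Max: x' = (x - min) / (max - min), mapping values into [0, 1]
#   Z-score: x' = (x - mean) / std, giving zero mean and unit variance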
def data_smoothing(data, column, method="mean"):
    """Perform data smoothing using a rolling mean or median."""
    if column in data.columns:
        if method == "mean":
            data[f"{column}_smoothed"] = data[column].rolling(window=3).mean()
            print(f"\nPerformed mean smoothing on column '{column}'.")
        elif method == "median":
            data[f"{column}_smoothed"] = data[column].rolling(window=3).median()
            print(f"\nPerformed median smoothing on column '{column}'.")
        else:
            print(f"Unknown smoothing method '{method}'; expected 'mean' or 'median'.")
    else:
        print(f"Column '{column}' does not exist in the dataset.")
    return data
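# Note: a size-3 rolling window needs three observations before it produces a
# value, so the first two rows of the smoothed column are NaN, e.g.:
#   pd.Series([1, 2, 3, 4]).rolling(window=3).mean() -> [NaN, NaN, 2.0, 3.0]
# Pass min_periods=1 to rolling() if leading NaNs are undesirable.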
def main():
    file_path = input("Enter the path to your CSV file: ")
    data = load_dataset(file_path)
    if data is not None:
        display_dataset(data)
        # Frequency view
        column = input("Enter column name to view frequency: ")
        frequency_view(data, column)
        # Duplicates
        identify_duplicates(data)
        data = remove_duplicates(data)
        # Histogram
        column = input("Enter column name to plot histogram: ")
        plot_histogram(data, column)
        # Correlation analysis
        correlation_analysis(data)
        # Data transformation
        column = input("Enter column name for log transformation: ")
        data = data_transformation(data, column)
        # Feature encoding
        column = input("Enter column name for feature encoding: ")
        data = feature_encoding(data, column)
        # Normalization
        column = input("Enter column name for normalization: ")
        method = input("Choose normalization method (minmax/zscore): ").lower()
        data = data_normalization(data, column, method)
        # Smoothing
        column = input("Enter column name for smoothing: ")
        method = input("Choose smoothing method (mean/median): ").lower()
        data = data_smoothing(data, column, method)
        print("\nFinal Dataset Overview:")
        print(data.head())
    else:
        print("Failed to process the dataset.")
if __name__ == "__main__":
    main()