# DE_ASS2 — data-cleaning assignment: missing values, duplicates, redundancy.
import pandas as pd
import numpy as np
def load_dataset(file_path):
    """Load the dataset from the specified CSV file.

    Parameters
    ----------
    file_path : str
        Path to the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded dataset, or None if reading failed for any reason.
    """
    try:
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        return data
    except Exception as e:
        # Broad catch is intentional here: report any read error
        # (missing file, parse error, permissions) and signal failure
        # to the caller with None instead of crashing.
        print(f"Error loading dataset: {e}")
        return None
def display_dataset(data):
    """Display basic details about the dataset.

    Prints the first/last five rows, the shape, and the pandas
    ``info()`` summary. Returns None; output goes to stdout only.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to describe.
    """
    print("\nDataset Overview:")
    print("First 5 rows of the dataset:")
    print(data.head())
    print("\nLast 5 rows of the dataset:")
    print(data.tail())
    print("\nShape of the dataset:", data.shape)
    print("\nDataset Summary:")
    # DataFrame.info() prints directly and returns None, so wrapping it
    # in print() would emit a spurious "None" line — call it bare.
    data.info()
def identify_missing_values(data):
    """Identify missing values in the dataset.

    Prints the per-column count of NaN values. Returns None;
    this is a reporting helper with no side effects on `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.
    """
    print("\nMissing Values Per Column:")
    print(data.isnull().sum())
def drop_missing_values(data, scenario):
"""Drop rows based on specified missing value scenarios."""
if scenario == "any":
# Drop all rows with any NaN values
data = data.dropna()
print("\nDropped rows with any missing values.")
elif scenario == "all":
# Drop rows where all values are NaN
data = data.dropna(how='all')
print("\nDropped rows where all values are missing.")
elif scenario == "threshold":
# Drop rows with more than two NaN values
data = data.dropna(thresh=data.shape[1] - 2)
print("\nDropped rows with more than two missing values.")
elif scenario == "specific":
# Drop NaN values in a specific column
column = input("Enter the column name to drop rows with missing values: ")
if column in data.columns:
data = data.dropna(subset=[column])
print(f"\nDropped rows with missing values in the column '{column}'.")
else:
print(f"Column '{column}' does not exist in the dataset.")
return data
def handle_missing_with_default(data, default_value=0):
    """Handle missing data by replacing NaNs with a default value.

    NOTE: this fills *in place*, so the caller's DataFrame is mutated;
    the same frame is also returned for chaining.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset whose NaNs should be replaced.
    default_value : scalar, optional
        Replacement value for every NaN (default 0).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame with NaNs filled.
    """
    print(f"\nFilling missing values with the default value: {default_value}")
    data.fillna(default_value, inplace=True)
    return data
def impute_missing_values(data):
    """Impute missing values in numeric columns with the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to impute. Non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The dataset with numeric NaNs replaced by each column's mean.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        mean_value = data[col].mean()
        # Assign back rather than fillna(inplace=True) on the column:
        # chained in-place fillna is deprecated in modern pandas and may
        # not write through to the parent frame.
        data[col] = data[col].fillna(mean_value)
        print(f"Filled missing values in '{col}' with mean value: {mean_value:.2f}")
    return data
def identify_duplicates(data):
    """Identify duplicate rows in the dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.

    Returns
    -------
    pandas.Series
        Boolean mask, True for each row that duplicates an earlier one.
    """
    duplicates = data.duplicated()
    print("\nNumber of duplicate rows:", duplicates.sum())
    return duplicates
def remove_duplicates(data):
    """Remove duplicate rows from the dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to deduplicate (not modified in place).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame keeping the first occurrence of each row.
    """
    before = data.shape[0]
    data = data.drop_duplicates()
    after = data.shape[0]
    print(f"\nRemoved {before - after} duplicate rows.")
    return data
def handle_redundancy(data):
    """Handle data redundancy by removing duplicate column names.

    Keeps the first occurrence of each column name and drops any later
    columns with the same name (compares names only, not values).

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to check for redundant columns.

    Returns
    -------
    pandas.DataFrame
        The dataset with duplicate-named columns removed.
    """
    data = data.loc[:, ~data.columns.duplicated()]
    print("\nRemoved redundant columns if any.")
    return data
def main():
    """Run the interactive data-cleaning pipeline.

    Prompts for a CSV path, then sequentially: displays the dataset,
    reports and drops missing values per a user-chosen scenario, fills
    remaining NaNs with a default, imputes numeric NaNs with column
    means, removes duplicate rows, strips duplicate-named columns, and
    prints a final summary. All interaction is via stdin/stdout.
    """
    file_path = input("Enter the path to your CSV file: ")
    data = load_dataset(file_path)
    if data is not None:
        display_dataset(data)
        # Identify and handle missing values
        identify_missing_values(data)
        scenario = input("\nChoose a missing value scenario to drop rows (any/all/threshold/specific): ").strip().lower()
        data = drop_missing_values(data, scenario)
        # Handle missing data with default values
        data = handle_missing_with_default(data)
        # Impute missing values
        data = impute_missing_values(data)
        # Identify and handle duplicates
        identify_duplicates(data)
        data = remove_duplicates(data)
        # Handle redundancy
        data = handle_redundancy(data)
        print("\nFinal Dataset Overview:")
        # info() prints directly and returns None — do not wrap in print().
        data.info()
        print("\nFinal Dataset Head:")
        print(data.head())
    else:
        print("Failed to process the dataset.")
# Standard script-entry guard: `name`/"main" was a NameError — the
# dunder form is required so importing this module has no side effects.
if __name__ == "__main__":
    main()