# DE_ASS2 — data-cleaning assignment: missing values, duplicates, redundancy.
import pandas as pd
import numpy as np
def load_dataset(file_path):
    """Load the dataset from the specified CSV file.

    Parameters
    ----------
    file_path : str
        Path to the CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded dataset, or None if reading failed for any reason.
    """
    try:
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        return data
    except Exception as e:
        # Broad catch is intentional here: report any read error
        # (missing file, parse error, permissions) and signal failure
        # to the caller with None instead of crashing.
        print(f"Error loading dataset: {e}")
        return None
def display_dataset(data):
    """Display basic details about the dataset.

    Prints the first/last five rows, the shape, and the pandas
    ``info()`` summary. Returns None; output goes to stdout only.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to describe.
    """
    print("\nDataset Overview:")
    print("First 5 rows of the dataset:")
    print(data.head())
    print("\nLast 5 rows of the dataset:")
    print(data.tail())
    print("\nShape of the dataset:", data.shape)
    print("\nDataset Summary:")
    # DataFrame.info() prints directly and returns None, so wrapping it
    # in print() would emit a spurious "None" line — call it bare.
    data.info()
def identify_missing_values(data):
    """Identify missing values in the dataset.

    Prints the per-column count of NaN values. Returns None;
    this is a reporting helper with no side effects on `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.
    """
    print("\nMissing Values Per Column:")
    print(data.isnull().sum())
def drop_missing_values(data, scenario):
"""Drop rows based on specified missing value scenarios."""
if scenario == "any":
# Drop all rows with any NaN values
data = data.dropna()
print("\nDropped rows with any missing values.")
elif scenario == "all":
# Drop rows where all values are NaN
data = data.dropna(how='all')
print("\nDropped rows where all values are missing.")
elif scenario == "threshold":
# Drop rows with more than two NaN values
data = data.dropna(thresh=data.shape[1] - 2)
print("\nDropped rows with more than two missing values.")
elif scenario == "specific":
# Drop NaN values in a specific column
column = input("Enter the column name to drop rows with missing values: ")
if column in data.columns:
data = data.dropna(subset=[column])
print(f"\nDropped rows with missing values in the column '{column}'.")
else:
print(f"Column '{column}' does not exist in the dataset.")
return data
def handle_missing_with_default(data, default_value=0):
    """Handle missing data by replacing NaNs with a default value.

    NOTE: this fills *in place*, so the caller's DataFrame is mutated;
    the same frame is also returned for chaining.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset whose NaNs should be replaced.
    default_value : scalar, optional
        Replacement value for every NaN (default 0).

    Returns
    -------
    pandas.DataFrame
        The same DataFrame with NaNs filled.
    """
    print(f"\nFilling missing values with the default value: {default_value}")
    data.fillna(default_value, inplace=True)
    return data
def impute_missing_values(data):
    """Impute missing values in numeric columns with the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to impute. Non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The dataset with numeric NaNs replaced by each column's mean.
    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        mean_value = data[col].mean()
        # Assign back rather than fillna(inplace=True) on the column:
        # chained in-place fillna is deprecated in modern pandas and may
        # not write through to the parent frame.
        data[col] = data[col].fillna(mean_value)
        print(f"Filled missing values in '{col}' with mean value: {mean_value:.2f}")
    return data
def identify_duplicates(data):
    """Identify duplicate rows in the dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to inspect.

    Returns
    -------
    pandas.Series
        Boolean mask, True for each row that duplicates an earlier one.
    """
    duplicates = data.duplicated()
    print("\nNumber of duplicate rows:", duplicates.sum())
    return duplicates
def remove_duplicates(data):
    """Remove duplicate rows from the dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to deduplicate (not modified in place).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame keeping the first occurrence of each row.
    """
    before = data.shape[0]
    data = data.drop_duplicates()
    after = data.shape[0]
    print(f"\nRemoved {before - after} duplicate rows.")
    return data
def handle_redundancy(data):
    """Handle data redundancy by removing duplicate column names.

    Keeps the first occurrence of each column name and drops any later
    columns with the same name (compares names only, not values).

    Parameters
    ----------
    data : pandas.DataFrame
        The dataset to check for redundant columns.

    Returns
    -------
    pandas.DataFrame
        The dataset with duplicate-named columns removed.
    """
    data = data.loc[:, ~data.columns.duplicated()]
    print("\nRemoved redundant columns if any.")
    return data
def main():
    """Run the interactive data-cleaning pipeline.

    Prompts for a CSV path, then sequentially: displays the dataset,
    reports and drops missing values per a user-chosen scenario, fills
    remaining NaNs with a default, imputes numeric NaNs with column
    means, removes duplicate rows, strips duplicate-named columns, and
    prints a final summary. All interaction is via stdin/stdout.
    """
    file_path = input("Enter the path to your CSV file: ")
    data = load_dataset(file_path)
    if data is not None:
        display_dataset(data)
        # Identify and handle missing values
        identify_missing_values(data)
        scenario = input("\nChoose a missing value scenario to drop rows (any/all/threshold/specific): ").strip().lower()
        data = drop_missing_values(data, scenario)
        # Handle missing data with default values
        data = handle_missing_with_default(data)
        # Impute missing values
        data = impute_missing_values(data)
        # Identify and handle duplicates
        identify_duplicates(data)
        data = remove_duplicates(data)
        # Handle redundancy
        data = handle_redundancy(data)
        print("\nFinal Dataset Overview:")
        # info() prints directly and returns None — do not wrap in print().
        data.info()
        print("\nFinal Dataset Head:")
        print(data.head())
    else:
        print("Failed to process the dataset.")
# Standard script-entry guard: `name`/"main" was a NameError — the
# dunder form is required so importing this module has no side effects.
if __name__ == "__main__":
    main()