# DE_ASS1
# Import necessary libraries
import pandas as pd
import numpy as np
# Load the dataset
def load_dataset(file_path):
    """Load a dataset from a CSV file.

    Args:
        file_path: Location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The parsed ``DataFrame`` on success, or ``None`` when reading
        fails. Failures are reported on stdout instead of being raised.
    """
    try:
        # Keep only the call that can fail inside the ``try``.
        data = pd.read_csv(file_path)
    except Exception as e:
        # Intentionally broad: this is the script's I/O boundary and the
        # caller only distinguishes "got a frame" from "got None".
        print(f"Error loading dataset: {e}")
        return None

    print("Dataset loaded successfully!")
    return data
# Display basic details
def dataset_overview(data):
    """Print basic information about the dataset.

    Writes, in order: the shape, the first five rows, the last five rows,
    the ``describe()`` statistics and the ``info()`` summary, framed by
    50-character dashed rules.

    Args:
        data: The ``DataFrame`` to summarise.

    Returns:
        None. All output goes to stdout.
    """
    separator = "-" * 50

    print("Dataset Overview:")
    print(separator)
    print("Shape of the dataset:", data.shape)
    print("\nFirst 5 rows of the dataset:")
    print(data.head())
    print("\nLast 5 rows of the dataset:")
    print(data.tail())
    print("\nDataset Description:")
    print(data.describe())
    print("\nDataset Summary:")
    # info() prints its own report and returns None, so it must not be
    # wrapped in print() -- that would emit a stray "None" line.
    data.info()
    print(separator)
# Handle duplicates
def remove_duplicates(data):
    """Remove duplicate observations from the dataset.

    Args:
        data: The ``DataFrame`` to de-duplicate. It is not modified.

    Returns:
        A new ``DataFrame`` holding the result of ``data.drop_duplicates()``
        (called with its default arguments). The number of removed rows is
        printed to stdout.
    """
    rows_before = len(data)
    deduplicated = data.drop_duplicates()
    removed = rows_before - len(deduplicated)
    print(f"Removed {removed} duplicate rows.")
    return deduplicated
# Identify and handle missing values
def handle_missing_values(data):
    """Identify and fill missing values in the dataset.

    Prints the per-column count of missing values, then fills gaps column
    by column:

    * numeric columns receive the column mean;
    * all other columns receive the literal string ``"Unknown"``.

    Args:
        data: The ``DataFrame`` to clean. Columns that contain missing
            values are reassigned on this same object.

    Returns:
        The same ``DataFrame`` object that was passed in, after filling.
    """
    print("\nMissing Values Overview:")
    print(data.isnull().sum())

    # Numeric columns: replace missing entries with the column mean.
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        if data[col].isnull().sum() > 0:
            # Assign the filled column back instead of calling
            # ``data[col].fillna(..., inplace=True)``: that chained in-place
            # form is deprecated and does not update ``data`` when pandas
            # Copy-on-Write is active.
            data[col] = data[col].fillna(data[col].mean())
            print(f"Filled missing values in '{col}' with the column mean.")

    # Non-numeric columns: replace missing entries with a placeholder label.
    non_numeric_columns = data.select_dtypes(exclude=[np.number]).columns
    for col in non_numeric_columns:
        if data[col].isnull().sum() > 0:
            data[col] = data[col].fillna("Unknown")
            print(f"Filled missing values in '{col}' with 'Unknown'.")

    return data
# Main function
def main():
    """Run the interactive cleaning workflow.

    Prompts for a CSV path, loads it, prints an overview, removes duplicate
    rows, fills missing values, and finally prints the shape, first rows
    and statistical summary of the cleaned data. If the file cannot be
    loaded, a failure message is printed and nothing else happens.
    """
    # Strip surrounding whitespace so a pasted path with a stray space or
    # trailing newline character still resolves.
    file_path = input("Enter the path to your CSV file: ").strip()
    data = load_dataset(file_path)

    # Guard clause: load_dataset signals failure by returning None.
    if data is None:
        print("Failed to process the dataset.")
        return

    dataset_overview(data)
    data = remove_duplicates(data)
    data = handle_missing_values(data)

    print("\nUpdated Dataset Overview:")
    print("Shape of the dataset after handling duplicates and missing values:", data.shape)
    print(data.head())
    print("Basic statistical summary of updated dataset:")
    print(data.describe())
# Run the program
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()