# DE_ASS7 — compare multiple classifiers on a user-supplied CSV dataset.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# ---- Load Dataset ----
def load_dataset(file_path):
    """Load a dataset from a CSV file.

    :param file_path: Path to a CSV file readable by ``pandas.read_csv``.
    :return: DataFrame on success, or None if the file could not be read.
    """
    try:
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully!")
        return data
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None
        # so the caller can decide how to proceed.
        print(f"Error loading dataset: {e}")
        return None
# ---- Preprocess the dataset ----
def preprocess_data(data, target_column):
    """Handle missing values and split the data into features and target.

    Rows containing any missing value are dropped (simplest strategy; no
    imputation is attempted).

    :param data: Input DataFrame.
    :param target_column: Name of the target column in ``data``.
    :return: Tuple ``(X, y)`` of feature DataFrame and target Series.
    """
    data = data.dropna()  # Drop rows with missing values
    X = data.drop(columns=[target_column])
    y = data[target_column]
    return X, y
# ---- Train and Evaluate Models ----
def evaluate_models(X, y):
    """Train multiple classifiers, evaluate performance, and compare them.

    For each model: fit on an 80/20 train/test split; report accuracy,
    precision, recall and F1 (weighted averaging, so multi-class targets
    work); plot a confusion-matrix heatmap; and run 10-fold
    cross-validation on the full data.

    :param X: Feature matrix.
    :param y: Target vector.
    :return: Dict mapping model name -> dict of metric name -> score.
    """
    # Split the data into training and testing sets; fixed seed for
    # reproducible splits.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Define models
    models = {
        "Logistic Regression": LogisticRegression(max_iter=500),
        "Random Forest": RandomForestClassifier(),
        "Support Vector Machine": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
        "Naive Bayes": GaussianNB(),
        "K-Nearest Neighbors": KNeighborsClassifier()
    }
    # Initialize a results dictionary
    results = {}
    # Train and evaluate each model
    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Compute metrics (weighted average handles class imbalance and
        # multi-class targets).
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average="weighted")
        recall = recall_score(y_test, y_pred, average="weighted")
        f1 = f1_score(y_test, y_pred, average="weighted")
        # Store results
        results[model_name] = {"Accuracy": acc, "Precision": precision, "Recall": recall, "F1 Score": f1}
        # Confusion matrix, labelled with the distinct class values.
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y), yticklabels=np.unique(y))
        plt.title(f"Confusion Matrix for {model_name}")
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.show()
        # Cross-validation on the full data for a less split-dependent
        # accuracy estimate.
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)
        cv_scores = cross_val_score(model, X, y, cv=kfold, scoring="accuracy")
        print(f"Cross-Validation Scores for {model_name}: {cv_scores}")
        print(f"Mean CV Accuracy for {model_name}: {cv_scores.mean():.4f}")
    return results
# ---- Compare Models ----
def compare_models(results):
    """Compare models based on their performance metrics.

    Prints a metrics table and shows a grouped bar chart, one group per
    model, one bar per metric.

    :param results: Dict mapping model name -> dict of metric name -> score
                    (as produced by ``evaluate_models``).
    """
    # Transpose so rows are models and columns are metrics.
    metrics_df = pd.DataFrame(results).T
    print("\nModel Comparison:")
    print(metrics_df)
    # Plot comparison
    metrics_df.plot(kind="bar", figsize=(12, 8), colormap="Set3")
    plt.title("Model Performance Comparison")
    plt.xlabel("Model")
    plt.ylabel("Score")
    plt.xticks(rotation=45)
    plt.legend(loc="lower right")
    plt.show()
# ---- Main function ----
def main():
    """Interactively load a dataset, evaluate the models, and compare them.

    Prompts the user for a CSV path and the target column name; aborts
    with a message if the dataset cannot be loaded.
    """
    file_path = input("Enter the path to your dataset: ")
    target_column = input("Enter the target column name: ")
    data = load_dataset(file_path)
    if data is not None:
        X, y = preprocess_data(data, target_column)
        results = evaluate_models(X, y)
        compare_models(results)
    else:
        print("Failed to process the dataset.")
# Run only when executed as a script, not when imported as a module.
# (Fixed: original checked `name == "main"`, which raises NameError.)
if __name__ == "__main__":
    main()