# Here we import all the necessary packages
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from scipy.stats import shapiro
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

plt.style.use('ggplot')

# Here we set LOKY_MAX_CPU_COUNT to avoid the CPU-count warning that appears
# on some machines when scikit-learn models (e.g. Naive Bayes) run through joblib/loky.
import os
os.environ['LOKY_MAX_CPU_COUNT'] = '2'  # change '2' to your desired number of cores

# First we load the dataset
data = load_breast_cancer()

# Here we print the feature names (the columns) and the target class names
print(data.feature_names)
print(data.target_names)
print("\n==========================================================")

# We create a DataFrame so that we can handle and manipulate the data easily
df = pd.DataFrame(data.data, columns=data.feature_names)

# Get the number of columns
num_columns = df.shape[1]
print(f'Number of columns in the breast cancer dataset: {num_columns}')

# Calculate means and standard deviations
means = df.mean()
std_devs = df.std()

# Print the results
print("Means:")
print(means)
print("\n==========================================================")
print("\nStandard Deviations:")
print(std_devs)
print("\n==========================================================")

# Now we check which features are close to normally distributed using skewness.
# (The Shapiro-Wilk test is imported above but not used in this check.)

# Select the first five columns
selected_columns = df.columns[:5]

# Set a significance level
alpha = 0.05

# Iterate through the selected columns and check skewness
for column in selected_columns:
    feature_data = df[column]

    # Calculate skewness for the feature
    skewness = stats.skew(feature_data)
    print(f'Feature: {column}')
    print(f'Skewness: {skewness}')

    # Determine if the feature is close to normally distributed
    if -1 <= skewness <= 1:
        print("Feature appears to be close to normally distributed.")
    else:
        print("Feature does not appear to be close to normally distributed.")

    # Create a histogram to visualize the distribution
    plt.figure(figsize=(8, 4))
    sns.histplot(feature_data, kde=True)
    plt.title(f"Histogram for {column}")
    plt.show()

print("\n==========================================================")

# Here we assign the data and the target to X and y accordingly
X, y = data.data, data.target

#============================================================================#
# Remove the less important features

# Initialize and train a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)
rf_classifier.fit(X, y)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Set a threshold for feature importance
threshold = 0.01  # you can adjust this threshold

# Identify less important features
less_important_features = [feature for feature, importance in
                           zip(data.feature_names, feature_importances)
                           if importance < threshold]

# Print the less important features
print("Less Important Features:")
for feature in less_important_features:
    print(feature)

# Remove less important features from the dataset (df is a DataFrame)
X_reduced = df.drop(less_important_features, axis=1)

#===========================================================================================
# Here we list the possible predictions found in the dataset according to the
# target values, and the number of examples for each class
print("========== the possible predictions of the target ==================== \n\n")
possible_predictions = {
    0: "Malignant (Cancerous)",
    1: "Benign (Non-Cancerous)"
}

# Print the possible predictions and their corresponding counts
for prediction, label in possible_predictions.items():
    num_examples = (y == prediction).sum()
    print(f"{label}: {num_examples} examples")

#==============================================================================================
# Split the reduced dataset into training and test sets
X_train_reduced, X_test_reduced, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42)

# Print the sizes of the training and test sets
print("\n========= the sizes of the training and test sets ==================== ")
print(f"Training set size: {len(X_train_reduced)} examples")
print(f"Test set size: {len(X_test_reduced)} examples\n")

#==================================================================================================================
# Train the models on the dataset

#============= Polynomial regression =======================
poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train_reduced)
X_poly_test = poly.transform(X_test_reduced)
poly_reg = LinearRegression()
poly_reg.fit(X_poly_train, y_train)
print("\nPolynomial regression training done")

#===========================================================
# Initialize and train the Gaussian Naive Bayes model
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_reduced, y_train)
print("\nGaussian Naive Bayes model training done")

#===========================================================
# Initialize and train the K-Nearest Neighbors (KNN) model
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_reduced, y_train)
print("\nK-Nearest Neighbors (KNN) model training done")

#===================================== Prediction =========================
# Polynomial regression
print("\n================ The Prediction Accuracy ===============")
poly_predictions = poly_reg.predict(X_poly_test)

# Convert continuous predictions to binary class labels
binary_predictions = (poly_predictions > 0.5).astype(int)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, binary_predictions)
print(f"\nPolynomial Regression Accuracy: {accuracy:.4f}")

# Gaussian Naive Bayes
nb_predictions = nb_classifier.predict(X_test_reduced)
accuracy = accuracy_score(y_test, nb_predictions)
print(f"\nGaussian Naive Bayes Accuracy: {accuracy:.4f}")

# K-Nearest Neighbors (KNN)
knn_predictions = knn_classifier.predict(X_test_reduced)
accuracy = accuracy_score(y_test, knn_predictions)
print(f"\nK-Nearest Neighbors (KNN) Accuracy: {accuracy:.4f}")

#=========================================================================
print("\n=========== The Classification Report ===========================\n")

# Calculate classification reports for each model
poly_classification_report = classification_report(y_test, binary_predictions, target_names=["Malignant", "Benign"])
nb_classification_report = classification_report(y_test, nb_predictions, target_names=["Malignant", "Benign"])
knn_classification_report = classification_report(y_test, knn_predictions, target_names=["Malignant", "Benign"])

# Create a summary report
summary_report = f"Polynomial Regression Model:\n{poly_classification_report}\n\n" \
                 f"Gaussian Naive Bayes Model:\n{nb_classification_report}\n\n" \
                 f"K-Nearest Neighbors (KNN) Model:\n{knn_classification_report}"

# Print the summary report
print("Summary Classification Report:")
print(summary_report)

#==================== END ==================================================
print("================== END =============================================")
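One thing to note: the script imports cross_val_score but never calls it. As a minimal sketch (not part of the original run, and reusing X_reduced and y from the script above), 5-fold cross-validation could give a more stable accuracy estimate than the single train/test split:

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Evaluate each classifier with 5-fold cross-validation on the reduced feature set
for name, model in [("Gaussian Naive Bayes", GaussianNB()),
                    ("KNN (k=5)", KNeighborsClassifier(n_neighbors=5))]:
    scores = cross_val_score(model, X_reduced, y, cv=5, scoring='accuracy')
    print(f"{name}: mean accuracy {scores.mean():.4f} (std {scores.std():.4f})")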
Write, Run & Share Python code online using OneCompiler's Python online compiler for free. It's one of the robust, feature-rich online compilers for Python, supporting both Python 3 and Python 2.7. Getting started with OneCompiler's Python editor is easy and fast. The editor shows sample boilerplate code when you choose Python or Python2 as the language and start coding.
OneCompiler's Python online editor supports stdin, and users can give input to programs using the STDIN textbox under the I/O tab. Following is a sample Python program which takes a name as input and prints a hello greeting with your name.
import sys
name = sys.stdin.readline().strip()  # read one line and strip the trailing newline
print("Hello " + name)
Python is a very popular general-purpose programming language created by Guido van Rossum and released in 1991. It is widely used for web development, and you can build almost anything with it: mobile apps, web apps, tools, data analytics, machine learning, and more. It is designed to be simple and readable, close to the English language, and its productivity and efficiency have made it one of the most popular languages.
Whenever you want to perform a set of operations based on a condition, IF-ELSE is used.
if conditional-expression:
    #code
elif conditional-expression:
    #code
else:
    #code
Indentation is very important in Python, so make sure it is followed correctly.
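For example, a minimal sketch with a concrete condition (x is just an illustrative variable):

x = 10
if x > 0:
    print("positive")
elif x == 0:
    print("zero")
else:
    print("negative")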
A for loop is used to iterate over collections (list, tuple, set, dictionary) or strings.
mylist = ["iPhone", "Pixel", "Samsung"]
for i in mylist:
    print(i)
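For loops are also commonly paired with range() to repeat a fixed number of times; a minimal sketch:

for i in range(3):
    print(i)  # prints 0, 1, 2 on separate lines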
A while loop is also used to repeat a set of statements based on a condition. Usually, while is preferred when the number of iterations is not known in advance.
while condition:
    #code
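For example, a minimal sketch that prints 0, 1, 2 and then stops:

count = 0
while count < 3:
    print(count)
    count += 1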
There are four types of collections in Python.
A list is an ordered collection that can be changed. Lists are written in square brackets.
mylist = ["iPhone", "Pixel", "Samsung"]
print(mylist)
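Because lists can be changed, you can reassign or append items; a minimal sketch continuing the example above:

mylist[1] = "OnePlus"   # replace an item by index
mylist.append("Nokia")  # add an item to the end
print(mylist)           # ['iPhone', 'OnePlus', 'Samsung', 'Nokia']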
A tuple is an ordered collection that cannot be changed. Tuples are written in round brackets.
myTuple = ("iPhone", "Pixel", "Samsung")
print(myTuple)
The code below throws an error because it assigns a new value to a tuple item.
myTuple = ("iPhone", "Pixel", "Samsung")
print(myTuple)
myTuple[1] = "onePlus"  # raises TypeError: tuples do not support item assignment
print(myTuple)          # never reached because of the error above
A set is an unordered and unindexed collection. Sets are written in curly brackets.
myset = {"iPhone", "Pixel", "Samsung"}
print(myset)
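Since sets are unordered and hold only unique values, adding a duplicate has no effect; a minimal sketch continuing the example above:

myset.add("Nokia")   # adds a new item
myset.add("iPhone")  # duplicate, silently ignored
print(myset)         # item order is not guaranteed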
A dictionary is a collection of key-value pairs that can be changed and is indexed by keys (insertion order is preserved from Python 3.7 onward). Dictionaries are written in curly brackets with key-value pairs.
mydict = {
"brand" :"iPhone",
"model": "iPhone 11"
}
print(mydict)
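Values are read and updated by key; a minimal sketch continuing the example above:

print(mydict["brand"])         # look up a value by key
mydict["model"] = "iPhone 12"  # change an existing value
mydict["year"] = 2020          # add a new key-value pair
print(mydict)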
Following are the libraries supported by OneCompiler's Python compiler:
Name | Description
---|---
NumPy | NumPy helps users work with arrays with ease
SciPy | SciPy is a scientific computation library which depends on NumPy for convenient and fast N-dimensional array manipulation
SKLearn/Scikit-learn | Scikit-learn is one of the most useful libraries for machine learning in Python
Pandas | Pandas is a highly efficient Python library for data manipulation and analysis
DOcplex | DOcplex is IBM Decision Optimization CPLEX Modeling for Python, a library composed of Mathematical Programming Modeling and Constraint Programming Modeling