# Project - Random Forest Regression

# Pandas is used for data manipulation
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split # requires the scikit-learn package
from pprint import pprint # for pretty-printing data about, among other things, the random forest model
from sklearn.ensemble import RandomForestRegressor # the values we predict (actual max temp) are continuous, so we use a Regressor
from sklearn.model_selection import RandomizedSearchCV # for searching over a set of hyperparameters
import pydot # for tree visualization
from sklearn.tree import export_graphviz # for exporting a tree image
import matplotlib.pyplot as plt # for plots - one of the most popular plotting libraries for Python
import seaborn as sns # for heatmap plots
from scipy.stats import chi2_contingency # for a numerical measure of association between variables

# Read in data and display the last 5 rows


features = pd.read_csv('temps.csv')

print(features.tail(5))  # print the last 5 observations (tail) in the terminal/console; head(5) would print the first 5

print('Data shape: ', features.shape) # the shape attribute returns (number of rows, number of columns)

print(features.describe())  # basic summary statistics of our data

# Data preparation

features = pd.get_dummies(features) # One-hot encode - converts the categorical variable week into 7 binary week_<day_of_week> variables

#print(features.iloc[:, 11:18].head(5)) # print the column range (11:18) and the first 5 rows (head(5)) in the console

labels = np.array(features['actual']) # Y - the target to predict on the test data; available in the training data for fitting the model

features = features.drop('actual', axis=1) # drop the 'actual' column (the Y values to predict); only the independent X variables remain

features = features.loc[:, ["temp_1", "average", "temp_2", "friend", "forecast_noaa", "forecast_acc", "forecast_under"]] # evaluate performance on a table trimmed down via feature_importances_ - 94.37% accuracy

features_cor = features.loc[:, ["temp_1", "average", "friend"]] # smaller subset kept for the correlation analysis further below


feature_list = list(features.columns) # build a list of the column names from the features table

features = np.array(features) # convert to a NumPy array, as expected by scikit-learn


train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.25,
                                                                            random_state=42) # random_state=42 makes the random split reproducible
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

# Establish baseline statistics for comparison
baseline_preds = test_features[:, feature_list.index('average')] # the baseline prediction is the historical average temperature
baseline_errors = abs(baseline_preds - test_labels) # error of the historical average vs the actual max temp
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
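
# For a like-for-like comparison with the model accuracy computed below, the same
# MAPE-based accuracy can be applied to the baseline - a minimal sketch (assumes
# test_labels contains no zeros, which holds for these temperatures):
baseline_mape = 100 * (baseline_errors / test_labels)
print('Baseline accuracy:', round(100 - np.mean(baseline_mape), 2), '%.')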

# Model training:

n_estimators = [int(x) for x in np.linspace(start=1000, stop=2000, num=10)] # candidate numbers of trees
max_depth = [int(x) for x in np.linspace(2, 10, num=5)] # candidate tree depths
print(n_estimators)
print(max_depth)
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth} # parameter space for the randomized search below
pprint(random_grid)


start_time = time.time()
#rf = RandomForestRegressor()
rf = RandomForestRegressor(n_estimators=1000, max_depth=4, random_state=42) # initialize the model with 1000 trees
#pprint(rf.get_params())

#rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=2, verbose=2,
#                               random_state=42, n_jobs=-1) # evaluates parameter combinations drawn from random_grid and scores each setting
#rf_random.fit(train_features, train_labels)
#pprint(rf_random.best_params_)
#pprint(rf_random.best_score_)
#pprint(rf_random.best_estimator_)
#pprint(rf_random.best_index_)
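
# If the randomized search above is enabled, its best configuration could be
# reused directly instead of the hand-picked parameters below - a sketch:
#rf = rf_random.best_estimator_ # refit on the full training set by default (refit=True)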


rf.fit(train_features, train_labels) # the actual model training

# Apply the model trained on the training data to predict Y (actual) for the test data

predictions = rf.predict(test_features) # predict the Y values for the test data

errors = abs(predictions - test_labels) # error between the predicted Y values (predictions) and the true Y (test_labels)

print("MAE - MEAN ABSOLUTE ERROR: ", round(np.mean(errors), 2), 'degrees.')

# MEAN ABSOLUTE PERCENTAGE ERROR
MAPE = 100 * (errors / test_labels)

accuracy = 100 - np.mean(MAPE)

print("Accuracy:", round(accuracy, 2), '%.')
end_time = time.time()
print("Upływ czasu: ", end_time-start_time)

tree = rf.estimators_[7] # pull a single tree out of our forest

export_graphviz(tree, out_file='tree.dot', feature_names=feature_list, rounded=True, precision=1) # export the tree to a .dot file

(graph,) = pydot.graph_from_dot_file('tree.dot') # build a graph from the .dot file
graph.write_png('tree.png') # save it as a png
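
# The full tree above is hard to read even at depth 4. A common trick is to
# render a deliberately small tree instead - a sketch (this extra max_depth=3
# forest is an illustrative assumption, not part of the model above):
rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
rf_small.fit(train_features, train_labels)
export_graphviz(rf_small.estimators_[5], out_file='small_tree.dot',
                feature_names=feature_list, rounded=True, precision=1)
(small_graph,) = pydot.graph_from_dot_file('small_tree.dot')
small_graph.write_png('small_tree.png')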

# Examine the importance of the X variables for predicting the Y variable (actual)

importances = list(rf.feature_importances_)

feature_importances = pd.Series(importances, index=feature_list) # pair each importance with its feature name

print(feature_importances)
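
# A quick visual ranking can make the importance table above easier to read -
# a minimal matplotlib sketch:
feature_importances.sort_values().plot(kind='barh')
plt.title('Feature importances')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()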

cor = features_cor.corr(method='pearson') # pairwise Pearson correlation matrix

fig, ax = plt.subplots(figsize=(8,6))
plt.title("Correlation Matrix Plot")
sns.heatmap(cor, mask=np.zeros_like(cor, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
plt.show()

csq = chi2_contingency(pd.crosstab(features_cor['average'], features_cor['temp_1']))
print("P-value average vs temp_1:", csq[1]) # p-value of the chi-square test of independence (a small p-value suggests the variables are related)

csq2 = chi2_contingency(pd.crosstab(features_cor['average'], features_cor['friend']))
print("P-value average vs friend:", csq2[1]) # p-value of the chi-square test of independence

csq3 = chi2_contingency(pd.crosstab(features_cor['temp_1'], features_cor['friend']))
print("P-value temp_1 vs friend:", csq3[1]) # p-value of the chi-square test of independence
