# R project: World Happiness Report 2023 -- data cleaning and modelling script
# (original first line "R project" was bare text, which is a syntax error in R)

# NOTE(review): hard-coded local path -- adjust to your own machine
mydata <- read.csv("C:/Users/91626/Downloads/archive/WHR2023YEEEE.csv")
mydata

# Install only the packages that are missing, instead of reinstalling all of
# them on every run; then attach each one.
pkgs <- c("carData", "mice", "VIM", "caret", "corrplot", "moments",
          "lattice", "ggplot2", "tidyverse", "dplyr")
missing_pkgs <- pkgs[!vapply(pkgs, requireNamespace, logical(1),
                             quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(lattice)
library(ggplot2)
library(tidyverse)
library(dplyr)    # original attached dplyr twice; once is enough
library(carData)
library(mice)
library(VIM)
library(caret)
library(corrplot)
library(moments)
# Initial exploration of the raw data set
mydata
dim(mydata)
str(mydata)
summary(mydata)
data() # To identify the built-in data sets available in RStudio
head(mydata) # first six rows
#Checking the missing values
sum(is.na(mydata)) # total number of NA values in the whole data set
colSums(is.na(mydata)) # NA counts per column
# mice::md.pattern -- tabulates/plots the missing-data patterns across rows
md.pattern(mydata, plot = TRUE, rotate.names = TRUE) #run it
# mice::md.pairs -- pairwise counts of observed/missing value combinations
md.pairs(mydata)
# Recoding the missing values
# NOTE(review): the original line was garbled (missing `$`, unbalanced
# brackets) and mixed up Generosity and Logged.GDP.per.capita; both columns
# are mean-imputed here -- confirm which was intended.
mydata$Generosity[is.na(mydata$Generosity)] <-
  mean(mydata$Generosity, na.rm = TRUE)
mydata$Logged.GDP.per.capita[is.na(mydata$Logged.GDP.per.capita)] <-
  mean(mydata$Logged.GDP.per.capita, na.rm = TRUE)
summary(mydata)
# Constant-impute missing upperwhisker values with 0.200
mydata$upperwhisker[is.na(mydata$upperwhisker)] <- 0.200
summary(mydata)
# Exclude missing values
mean(mydata$Healthy.life.expectancy, na.rm = TRUE)
mydata[complete.cases(mydata), ]   # rows with complete data
mydata[!complete.cases(mydata), ]  # rows with at least one missing value
dim(mydata)
dim(mydata[complete.cases(mydata), ])
dim(mydata[!complete.cases(mydata), ]) # complete + incomplete = total rows
#New data set without missing values (listwise deletion)
mydata_wo_misval<- na.omit(mydata)
dim(mydata_wo_misval)
summary(mydata_wo_misval)
#Impute the numeric columns (3:10) instead of dropping rows:
# m = 3 imputed data sets; seed = 123 makes the imputation reproducible
impute<-mice(mydata[,3:10], m=3, seed=123) # frequency, m=3 and in pattern 123 and 123 and ...
# complete(impute, 1) extracts the first of the m imputed data sets
mydata_imputed<- complete(impute,1)
summary(mydata_imputed)
dim(mydata_imputed)
# Exploring outliers
boxplot(mydata_imputed, plot = TRUE)
# Remove outliers from Ladder.score using the 1.5 * IQR fence rule.
# Fixes vs original: `$` was missing, `1.5IQR` is invalid syntax, and
# quantile(x) with defaults returns the 5-number summary, so quartiles[1]
# was the minimum and quartiles[2] was Q1 -- request Q1/Q3 explicitly.
quartiles <- quantile(mydata_imputed$Ladder.score, probs = c(0.25, 0.75))
iqr_value <- IQR(mydata_imputed$Ladder.score)
Lower <- quartiles[1] - 1.5 * iqr_value  # below Q1 - 1.5*IQR => outlier
Upper <- quartiles[2] + 1.5 * iqr_value  # above Q3 + 1.5*IQR => outlier
quartiles[1]
Upper
mydata_no_outlier <- subset(
  mydata_imputed,
  mydata_imputed$Ladder.score > Lower & mydata_imputed$Ladder.score < Upper
)
summary(mydata_no_outlier)
# Results after removing outliers
dim(mydata_no_outlier)
boxplot(mydata_no_outlier, plot = TRUE)
# Find and eliminate erroneous data
# Mislabeled variables: rename Country.name to a clearer label
mydata <- mydata %>% rename(Name_of_Country = Country.name)
summary(mydata)
# Faulty data types
class(mydata$Perceptions.of.corruption)  # `$` was missing in the original
str(mydata)
#mydata$Perceptions.of.corruption <- as.numeric(mydata$Perceptions.of.corruption) # Not required as data is numeric already
# Non-unique ID numbers
#duplicated()
#distinct()
# String inconsistencies
# NOTE(review): assumes a 'region' column exists in the CSV -- confirm
unique(mydata$region)
#mydata$region <- gsub("(?i)As|(?i)Asia", "1", mydata$region) # Not required as data string consistent
mydata$region
# Normalise the data: log transformation
mydata_log_ladder_score <- log(as.data.frame(mydata_imputed$Ladder.score))
summary(mydata_log_ladder_score)
summary(mydata_imputed$Ladder.score)
# Min-Max Scaling = (X - min(X)) / (max(X) - min(X))
# Fix: caret::preProcess with no `method` centers and scales (z-score),
# which contradicts the comment above; method = "range" rescales to [0, 1].
norm_process <- preProcess(as.data.frame(mydata_imputed$Ladder.score),
                           method = c("range"))
norm_process
mydata_norm_minmax <- predict(norm_process,
                              as.data.frame(mydata_imputed$Ladder.score))
summary(mydata_norm_minmax)
# Standard Scaling (Z-score) = (X - mu) / sigma
mydata_norm_std <- as.data.frame(scale(mydata_imputed$Ladder.score))
summary(mydata_norm_std)
# Correlation Analysis (assumes all columns of mydata_imputed are numeric)
mydata_descrCor <- cor(mydata_imputed)
corrplot(mydata_descrCor, order = "FPC", method = "color", type = "lower",
         tl.cex = 0.7, tl.col = rgb(0, 0, 0))
# Balance skewed data
# Find skewness in the data frame (moments::skewness is applied column-wise)
skewness(mydata_imputed)
# Fix: original line was missing `$` and a closing parenthesis
plot(density(mydata_imputed$Ladder.score))
# log transformation
#skewness(mydata_log_gdp)
#plot(density(mydata_log_gdp$`Logged GDP per capita`))
# square root transform
mydata_sqrt_gdp <- sqrt(mydata_imputed$Ladder.score)
skewness(mydata_sqrt_gdp)
plot(density(mydata_sqrt_gdp))
# Choose Normalisation Technique
library(ggplot2)
library(tidyverse)
# Fix: original referenced mydata_no_outlierLadder.score (missing `$`);
# inside aes() the bare column name is the idiomatic form (data masking).
ggplot(mydata_no_outlier, aes(log(Ladder.score))) +
  geom_density(fill = "blue")
mydata_norm_log <- mydata_no_outlier
mydata_norm_log
dim(mydata_norm_log)
# z-score standardisation; note scale() returns a matrix, not a data frame
mydata_norm_log <- scale(mydata_norm_log)
summary(mydata_norm_log)
dim(mydata_norm_log) # Dimensions
str(mydata_norm_log)
#Choose Variables
#Correlation among the standardised variables
library(corrplot)
mydata_descrCor<-cor(mydata_norm_log)
dim(mydata_descrCor)
corrplot(mydata_descrCor, order = "FPC", method = "color", type = "lower", tl.cex=0.7, tl.col=rgb(0,0,0))
mydata_imputed$Standard.error.of.ladder.score
#Drop Variables that are redundant with Ladder.score
# NOTE(review): mydata_norm_log is a matrix after scale(); subset.matrix
# does accept `select` with column names, but verify the three columns
# exist in this object -- TODO confirm
mydata_mlr<- subset(mydata_norm_log, select=-c(Standard.error.of.ladder.score,upperwhisker,lowerwhisker))
mydata_descrCor<- cor(mydata_mlr)
corrplot(mydata_descrCor, order="FPC", method = "color", type = "lower", tl.cex = 0.7, tl.col = rgb(0,0,0))
# Split the sample data: 80% train / 20% test
set.seed(1)
row.number <- sample(1:nrow(mydata_mlr), 0.8 * nrow(mydata_mlr))
train <- mydata_mlr[row.number, ]
test <- mydata_mlr[-row.number, ]
dim(train)
dim(test)
# Model Building
train_df <- as.data.frame(train)
# Fix: the original formula `train_df$Ladder.score ~ .` leaves Ladder.score
# among the `.` predictors on the right-hand side, producing a degenerate
# perfect fit; use the bare column name so lm() excludes the response.
model1 <- lm(Ladder.score ~ ., data = train_df)
summary(model1)
# Fix: the original `update(model1, ~.-Perceptions.of.corruption,-Generosity)`
# passed -Generosity as a stray extra argument instead of removing it from
# the formula; both terms must be dropped inside one formula.
model2 <- update(model1, ~ . - Perceptions.of.corruption - Generosity)
summary(model2)
#t >= |2|
#F-statistic should be greater than 1
#multiple r square and adjusted r square
#r-square = explained variation / total variation
par(mfrow = c(2, 2))
plot(model2)  # standard lm diagnostic plots (residuals, Q-Q, leverage)
# Predictions on the held-out test set
test_df <- as.data.frame(test)
pred1 <- predict(model2, newdata = test_df)
# Fix: original line was missing `$`, had unbalanced parentheses, used sum()
# instead of mean(), and applied exp() although the response was never
# log-transformed -- predictions are already on the Ladder.score scale.
rmse <- sqrt(mean((pred1 - test_df$Ladder.score)^2))
c(RMSE = rmse, R2 = summary(model2)$r.squared)
par(mfrow = c(1, 1))
plot(test_df$Ladder.score, pred1)  # predicted vs observed