# OneCompiler — R project (submission 281)
# Data cleaning and linear-regression analysis of the WHR 2023 dataset.

# Load the World Happiness Report 2023 dataset from a local CSV file.
csv_path <- "C:/Users/91626/Downloads/archive/WHR2023YEEEE.csv"
mydata <- read.csv(csv_path)
mydata

# Install (only if missing) and attach all required packages.
# Fixes in this block:
#  - dplyr was attached twice in the original (duplicate library(dplyr));
#  - install.packages() ran unconditionally on every execution — now guarded
#    with requireNamespace() so already-installed packages are not re-fetched.
pkgs <- c(
  "carData", "mice", "VIM", "caret", "corrplot",
  "moments", "lattice", "ggplot2", "tidyverse", "dplyr"
)
for (pkg in pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # character.only = TRUE lets library() take the package name from a variable
  library(pkg, character.only = TRUE)
}

# Initial exploration of the dataset
mydata            # print the full data frame
dim(mydata)       # number of rows and columns
str(mydata)       # column types and a preview of values
summary(mydata)   # per-column summary statistics (reveals NA counts)
data() # To identify different sets of data in Rstudio
head(mydata) #first six rows

#Checking the missing value
sum(is.na(mydata)) #no. of all value of na in the data set
colSums(is.na(mydata)) #Sum column wise
# mice::md.pattern tabulates and plots the missing-data patterns across columns
md.pattern(mydata, plot = TRUE, rotate.names = TRUE) #run it
# mice::md.pairs counts pairwise observed/missing combinations per variable pair
md.pairs(mydata)

#Recoding the missing values: replace NAs with the column mean.
# The original lines were garbled by a bad copy/paste (fragments of several
# statements fused together, missing `$` operators); reconstructed below.
mydata$Generosity[is.na(mydata$Generosity)] <-
  mean(mydata$Generosity, na.rm = TRUE)
mydata$Logged.GDP.per.capita[is.na(mydata$Logged.GDP.per.capita)] <-
  mean(mydata$Logged.GDP.per.capita, na.rm = TRUE)
summary(mydata)
# upperwhisker: impute the fixed value 0.200, as in the original script
mydata$upperwhisker[is.na(mydata$upperwhisker)] <- 0.200
summary(mydata)

#Exclude missing values from a single computation.
# The first line was garbled (three fused statements); reconstructed as the
# intended before/after comparison of mean() with and without na.rm.
mean(mydata$Healthy.life.expectancy)                # NA if any value is missing
mean(mydata$Healthy.life.expectancy, na.rm = TRUE)  # drop NAs before averaging
mydata[complete.cases(mydata), ]   # rows with complete data
mydata[!complete.cases(mydata), ]  # rows WITH missing data (comment corrected)
dim(mydata)
dim(mydata[complete.cases(mydata), ])
dim(mydata[!complete.cases(mydata), ]) #39+168=207

#New data set without missing values
# na.omit() drops every row containing at least one NA
mydata_wo_misval<- na.omit(mydata)
dim(mydata_wo_misval) # fewer rows than mydata if any NAs remained
summary(mydata_wo_misval)

#Impute missing values with mice (multiple imputation on numeric columns 3:10)
impute<-mice(mydata[,3:10], m=3, seed=123) # frequency, m=3 and in pattern 123 and 123 and ...
# m = 3 imputed datasets are generated; seed fixes the RNG for reproducibility
mydata_imputed<- complete(impute,1) # keep the first of the three completed datasets
summary(mydata_imputed)
dim(mydata_imputed)

#Exploring outliers
# One box per column; points beyond the whiskers are potential outliers
boxplot(mydata_imputed, plot = TRUE)

#Remove outliers
quartiles<- quantile(mydata_imputedLadder.score,probs=c(.25,.75),na.rm=FALSE)IQR<IQR(mydataimputedLadder.score, probs=c(.25, .75), na.rm = FALSE) IQR<- IQR(mydata_imputedLadder.score)
Lower<-quartiles[1]-1.5IQR
Upper<-quartiles[2]+ 1.5
IQR
quartiles[1]
Upper

#Keep only rows whose Ladder.score lies strictly inside the whisker bounds.
# Fixed: the original was missing the `$` in mydata_imputedLadder.score.
mydata_no_outlier <- subset(
  mydata_imputed,
  mydata_imputed$Ladder.score > Lower & mydata_imputed$Ladder.score < Upper
)
summary(mydata_no_outlier)
#Results after removing outlier
dim(mydata_no_outlier)
boxplot(mydata_no_outlier, plot = TRUE)

#Find and eliminate erroneous data
#Mislabeled variables: give Country.name a clearer label
mydata <- rename(mydata, Name_of_Country = Country.name)
summary(mydata)

#Faulty data types
# Fixed: two statements were fused onto one line and the `$` was missing.
class(mydata$Perceptions.of.corruption)
str(mydata)
#mydata$Perceptions.of.corruption <- as.numeric(mydata$Perceptions.of.corruption) #Not required as data is numeric already

#Non-unique ID numbers
#duplicated()
#distinct()

#String inconsistencies
# Fixed: the original line was garbled (missing `$`, commented code fused in).
# NOTE(review): WHR releases often name this column Regional.indicator —
# confirm `region` exists in this CSV before running.
unique(mydata$region)
#mydata$region <- gsub("(?i)As|(?i)Asia", "1", mydata$region) #Not required as data strings are consistent

#Normalise the data, log transformation
# Fixed: use `<-` for assignment (file convention) instead of `=`.
# log() is applied elementwise to the single-column data frame.
mydata_log_ladder_score <- log(as.data.frame(mydata_imputed$Ladder.score))

summary(mydata_log_ladder_score)
summary(mydata_imputed$Ladder.score)

#Min-Max Scaling = (X-min(X))/(max(X)-min(X))
# BUG FIX: caret::preProcess() defaults to method = c("center", "scale"),
# which produces z-scores, NOT min-max scaling. method = "range" rescales
# each column to [0, 1], matching the comment's stated intent.
norm_process <- preProcess(
  as.data.frame(mydata_imputed$Ladder.score),
  method = "range"
)

norm_process
mydata_norm_minmax <- predict(norm_process, as.data.frame(mydata_imputed$Ladder.score))
summary(mydata_norm_minmax) # Min should be 0 and Max 1 after scaling

#Standard Scaling (Z score) = (X-nu)/sigma
# scale() centers to mean 0 and divides by the standard deviation
mydata_norm_std<-as.data.frame(scale(mydata_imputed$Ladder.score))
summary(mydata_norm_std) # mean should be ~0 after standardisation

#Correlation Analysis
# Pairwise Pearson correlations across all (numeric) imputed columns
mydata_descrCor<-cor(mydata_imputed)
# Lower-triangle heat map, ordered by first principal component
corrplot(mydata_descrCor, order = "FPC", method = "color", type = "lower", tl.cex = 0.7, tl.col = rgb(0,0,0))

#Balanced skewed data
#Find skewness in the dataframe
# Fixed: the plot/hist line was three fused statements; split them out.
skewness(mydata_imputed) # moments::skewness, one value per column
plot(density(mydata_imputed$Ladder.score))
hist(mydata_imputed$Ladder.score)

#log transformation (left commented out; mydata_log_gdp is never created in this script)
#skewness(mydata_log_gdp)
#plot(density(mydata_log_gdp))

#square root transform
# Despite the "_gdp" name, this transforms Ladder.score
mydata_sqrt_gdp<- sqrt(mydata_imputed$Ladder.score)
skewness(mydata_sqrt_gdp)
plot(density(mydata_sqrt_gdp))

#Choose Normalisation Technique: compare log vs sqrt densities visually.
library(ggplot2)
library(tidyverse)

# Fixed: the original line was three ggplot calls fused together by a bad
# paste; reconstructed as one log-density plot and one sqrt-density plot.
ggplot(mydata_no_outlier, aes(log(mydata_no_outlier$Ladder.score))) +
  geom_density(fill = "blue")
ggplot(mydata_no_outlier, aes(sqrt(mydata_no_outlier$Ladder.score))) +
  geom_density(fill = "blue")

# Standardise the outlier-free data.
# NOTE(review): the name says "log" but scale() only centers/scales (z-scores);
# no log transform is applied here — confirm the name matches the intent.
mydata_norm_log<- mydata_no_outlier
mydata_norm_log
dim(mydata_norm_log)
mydata_norm_log<- scale(mydata_norm_log) # returns a numeric matrix of z-scores
summary(mydata_norm_log)
dim(mydata_norm_log) #Dimensions
str(mydata_norm_log)

#Choose Variables
#Correlation
library(corrplot)
mydata_descrCor<-cor(mydata_norm_log) # correlations on the standardised matrix
dim(mydata_descrCor)
corrplot(mydata_descrCor, order = "FPC", method = "color", type = "lower", tl.cex=0.7, tl.col=rgb(0,0,0))
# Inspect the near-redundant column before dropping it in the next step
mydata_imputed$Standard.error.of.ladder.score

#Drop Variable
# Remove columns that duplicate information in Ladder.score
# (its standard error and whisker bounds) before fitting the model
mydata_mlr<- subset(mydata_norm_log, select=-c(Standard.error.of.ladder.score,upperwhisker,lowerwhisker))
mydata_descrCor<- cor(mydata_mlr)
corrplot(mydata_descrCor, order="FPC", method = "color", type = "lower", tl.cex = 0.7, tl.col = rgb(0,0,0))

#Split the sample data
set.seed(1)
row.number<- sample(1:nrow(mydata_mlr), 0.8*nrow(mydata_mlr))
train= mydata_mlr[row.number,]
test= mydata_mlr[-row.number,]
dim(train)
dim(test)

#Model Building
train_df <- as.data.frame(train)
# BUG FIX: the original was lm(train_df$Ladder.score ~ ., data = train_df).
# With that form `.` expands to ALL columns of train_df — including
# Ladder.score itself — so the response regressed on itself (perfect,
# meaningless fit). Naming the response inside the formula excludes it
# from the `.` expansion.
model1 <- lm(Ladder.score ~ ., data = train_df)
summary(model1)

# BUG FIX: the original passed "-Generosity" as a stray extra argument to
# update() instead of a formula term; both predictors must be removed
# inside the updated formula.
model2 <- update(model1, ~ . - Perceptions.of.corruption - Generosity)
summary(model2)

#t>= |2|  (rule of thumb: |t| >= 2 suggests a coefficient is significant)
#F-statistics should be greater than 1

#multiple r square and adjusted r square
#r-square= explained variation/total variation

par(mfrow =c(2,2)) # 2x2 grid to show all four lm diagnostic plots at once
plot(model2) # residuals-vs-fitted, QQ, scale-location, residuals-vs-leverage

#Predictions on the held-out test set
test_df <- as.data.frame(test)
pred1 <- predict(model2, newdata = test_df)
# BUG FIXES:
#  - The RMSE line was garbled by a bad paste; reconstructed as
#    sqrt(mean of squared errors).
#  - The original applied exp() to the predictions, but the response was
#    standardised with scale() and never log-transformed, so predictions
#    are already on the same (z-score) scale as test_df$Ladder.score.
rmse <- sqrt(sum((pred1 - test_df$Ladder.score)^2) / length(test_df$Ladder.score))
c(RMSE = rmse, R2 = summary(model2)$r.squared)

par(mfrow = c(1, 1)) # restore single-plot layout
plot(test_df$Ladder.score, pred1) # actual vs predicted Ladder.score