# Load the data set from a local CSV file.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
whr_csv_path <- "C:/Users/91626/Downloads/archive/WHR2023YEEEE.csv"
mydata <- read.csv(file = whr_csv_path)
mydata # echo the raw data frame

# Packages used by this script, listed in the order they are attached below.
required_pkgs <- c(
  "lattice", "ggplot2", "tidyverse", "dplyr", "carData",
  "mice", "VIM", "caret", "corrplot", "moments"
)

# Install only what is missing instead of re-installing every package on
# each run (the unconditional installs were slow and needed network access).
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Attach the packages; the order is kept as in the original script so that
# function masking between packages stays the same.
library(lattice)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(carData)
library(mice)
library(VIM)
library(caret)
library(corrplot)
library(moments)

# First look at the data: full print-out, dimensions, structure and summary.
mydata
dim(mydata)
str(object = mydata)
summary(object = mydata)
data() # list the data sets that are available in the session
head(x = mydata, n = 6L) # first six rows

# Checking for missing values
na_mask <- is.na(mydata) # TRUE wherever a cell is NA
sum(na_mask) # total number of NA cells in the data set
colSums(na_mask) # number of NA cells per column
md.pattern(mydata, plot = TRUE, rotate.names = TRUE) # missing-data pattern (mice)
md.pairs(mydata) # pairwise missingness counts (mice)

# Recoding the missing values

# Helper: return `values` with every NA entry replaced by `fill`.
fill_missing <- function(values, fill) {
  values[is.na(values)] <- fill
  values
}

# Mean imputation for Generosity and Logged.GDP.per.capita.
mydata$Generosity <- fill_missing(
  mydata$Generosity,
  mean(mydata$Generosity, na.rm = TRUE)
)
mydata$Logged.GDP.per.capita <- fill_missing(
  mydata$Logged.GDP.per.capita,
  mean(mydata$Logged.GDP.per.capita, na.rm = TRUE)
)
summary(mydata)

# Constant imputation for upperwhisker.
# NOTE(review): 0.200 is a hard-coded fill value; its origin is not
# documented in this script - confirm it is the intended replacement.
mydata$upperwhisker <- fill_missing(mydata$upperwhisker, 0.200)
summary(mydata)

# Exclude missing values
mean(mydata$Healthy.life.expectancy) # NA if the column still contains an NA
mean(mydata$Healthy.life.expectancy, na.rm = TRUE) # mean of the observed values

is_complete <- complete.cases(mydata) # TRUE for rows without any NA
mydata[is_complete, ] # rows with complete data
mydata[!is_complete, ] # rows with at least one missing value
dim(mydata)
dim(mydata[is_complete, ])
dim(mydata[!is_complete, ]) # original note: 39 + 168 = 207

# New data set without missing values
mydata_wo_misval <- na.omit(mydata)
dim(mydata_wo_misval)
summary(mydata_wo_misval)

# Impute missing values with mice
# m = 3 requests three imputed data sets; seed = 123 fixes the random number
# generator so the imputation is reproducible.
# NOTE(review): only columns 3:10 are imputed and kept. Later code reads
# mydata_imputed$Ladder.score - confirm that this column is inside 3:10.
impute <- mice(data = mydata[, 3:10], m = 3, seed = 123)

# Use the first of the three completed data sets for the rest of the analysis.
mydata_imputed <- complete(impute, 1)
summary(mydata_imputed)
dim(mydata_imputed)

# Exploring outliers
boxplot(mydata_imputed, plot = TRUE)

# Remove outliers from Ladder.score using the 1.5 * IQR rule.
quartiles <- quantile(
  mydata_imputed$Ladder.score,
  probs = c(0.25, 0.75),
  na.rm = FALSE
)
# Stored as `ladder_iqr` (not `IQR`) so the stats::IQR function is not masked
# by a numeric variable of the same name.
ladder_iqr <- IQR(mydata_imputed$Ladder.score)
Lower <- quartiles[1] - 1.5 * ladder_iqr # lower fence
Upper <- quartiles[2] + 1.5 * ladder_iqr # upper fence
quartiles[1]
Upper

# Keep only the rows whose Ladder.score lies strictly between the fences.
mydata_no_outlier <- subset(
  mydata_imputed,
  Ladder.score > Lower & Ladder.score < Upper
)
summary(mydata_no_outlier)
# Results after removing outliers
dim(mydata_no_outlier)
boxplot(mydata_no_outlier, plot = TRUE)

# Find and eliminate erroneous data

# Mislabeled variables: give the country column a clearer name.
mydata <- rename(mydata, Name_of_Country = Country.name)
summary(mydata)

# Faulty data types
class(mydata$Perceptions.of.corruption)
str(mydata)
# Conversion not required as the data is numeric already:
# mydata$Perceptions.of.corruption <- as.numeric(mydata$Perceptions.of.corruption)

# Non-unique ID numbers (candidate functions, not run here)
# duplicated()
# distinct()

# String inconsistencies
unique(mydata$region)
# Recoding not required as the strings are consistent:
# mydata$region <- gsub("(?i)As|(?i)Asia", "1", mydata$region)
mydata$region

# Normalise the data: log transformation of Ladder.score
ladder_df <- as.data.frame(mydata_imputed$Ladder.score)
mydata_log_ladder_score <- log(ladder_df)

# Compare the distribution after and before the transformation.
summary(mydata_log_ladder_score)
summary(mydata_imputed$Ladder.score)

# Min-Max Scaling = (X - min(X)) / (max(X) - min(X))
# caret::preProcess() defaults to method = c("center", "scale"), which yields
# z-scores. The "range" method has to be requested explicitly to rescale the
# values to the [0, 1] interval.
norm_process <- preProcess(
  as.data.frame(mydata_imputed$Ladder.score),
  method = "range"
)

norm_process
mydata_norm_minmax <- predict(
  norm_process,
  as.data.frame(mydata_imputed$Ladder.score)
)
summary(mydata_norm_minmax)


# Standard scaling (z-score) = (X - mu) / sigma
ladder_z <- scale(mydata_imputed$Ladder.score)
mydata_norm_std <- as.data.frame(ladder_z)
summary(mydata_norm_std)


# Correlation analysis on the imputed data
mydata_descrCor <- cor(mydata_imputed)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)


# Balance skewed data
# Skewness of every column in the imputed data frame, followed by a density
# plot and a histogram of Ladder.score.
skewness(mydata_imputed)
plot(density(mydata_imputed$Ladder.score))
hist(mydata_imputed$Ladder.score)

# Log transformation (disabled: mydata_log_gdp is not created in this script)
#skewness(mydata_log_gdp)
#plot(density(mydata_log_gdp$`mydata_imputed$Logged GDP per capita`))

# Square-root transform
# NOTE(review): despite the "_gdp" suffix, this holds the square root of
# Ladder.score, not of a GDP column.
mydata_sqrt_gdp<- sqrt(mydata_imputed$Ladder.score)
skewness(mydata_sqrt_gdp)
plot(density(mydata_sqrt_gdp))



# Choose normalisation technique
library(ggplot2)
library(tidyverse)

# Density of the log- and square-root-transformed Ladder.score.
# Columns are referenced by bare name inside aes(); using `data$column` there
# bypasses the `data` argument and is discouraged by ggplot2.
ggplot(mydata_no_outlier, aes(x = log(Ladder.score))) +
  geom_density(fill = "blue")
ggplot(mydata_no_outlier, aes(x = sqrt(Ladder.score))) +
  geom_density(fill = "blue")

# Start from the outlier-free data and inspect it before scaling.
mydata_norm_log <- mydata_no_outlier
mydata_norm_log
dim(mydata_norm_log)

# Centre and scale every column (scale() defaults); the result is a matrix.
# NOTE(review): despite the "_log" suffix, no log transformation is applied here.
mydata_norm_log <- scale(mydata_norm_log, center = TRUE, scale = TRUE)
summary(mydata_norm_log)
dim(mydata_norm_log) # dimensions
str(mydata_norm_log)

# Choose variables
# Correlation matrix of the scaled data, drawn as a lower-triangle colour map.
library(corrplot)
mydata_descrCor <- cor(mydata_norm_log)
dim(mydata_descrCor)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)
mydata_imputed$Standard.error.of.ladder.score

# Drop variables
# Remove the standard-error and whisker columns, then redraw the correlations.
mydata_mlr <- subset(
  mydata_norm_log,
  select = -c(Standard.error.of.ladder.score, upperwhisker, lowerwhisker)
)
mydata_descrCor <- cor(mydata_mlr)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)

# Split the sample data: 80% of the rows for training, the rest for testing.
set.seed(1) # reproducible split
n_rows <- nrow(mydata_mlr)
# seq_len() is safe when there are no rows (1:0 would yield c(1, 0)).
row.number <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
# drop = FALSE keeps a matrix even if only a single row is selected.
train <- mydata_mlr[row.number, , drop = FALSE]
# setdiff() rather than negative indexing: an empty `row.number` would make
# `-row.number` select no rows at all.
test <- mydata_mlr[setdiff(seq_len(n_rows), row.number), , drop = FALSE]
dim(train)
dim(test)

# Model building
train_df <- as.data.frame(train)
# The response must be named by its bare column name. With
# `train_df$Ladder.score ~ .` the `.` still expands to every column of
# `train_df`, including Ladder.score itself, so the response would also be
# used as a predictor.
model1 <- lm(Ladder.score ~ ., data = train_df)
summary(model1)

# Drop both predictors inside one formula. Separating them with a comma
# passes `-Generosity` as an extra argument instead of a formula term.
model2 <- update(model1, . ~ . - Perceptions.of.corruption - Generosity)
summary(model2)

# Rules of thumb when reading the summaries:
# |t| >= 2 for a coefficient
# F-statistic should be greater than 1
#
# Multiple R-squared and adjusted R-squared:
# R-squared = explained variation / total variation

# Diagnostic plots of the reduced model on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(model2)

# Predictions on the held-out test rows
test_df <- as.data.frame(test)
pred1 <- predict(model2, newdata = test_df)

# Ladder.score was standardised with scale(), never log-transformed, so the
# predictions are already on the same scale as test_df$Ladder.score and must
# not be passed through exp().
rmse <- sqrt(mean((pred1 - test_df$Ladder.score)^2))
# R2 is taken from the model summary, i.e. it is the training-set R-squared.
c(RMSE = rmse, R2 = summary(model2)$r.squared)

# Observed versus predicted values, back on a single-panel layout.
par(mfrow = c(1, 1))
plot(test_df$Ladder.score, pred1)

R Language Online Compiler

Write, Run & Share R Language code online using OneCompiler's R Language online compiler for free. It's one of the robust, feature-rich online compilers for R language, running on the latest version 3.4. Getting started with the OneCompiler's R Language compiler is simple and pretty fast. The editor shows sample boilerplate code when you choose language as R and start coding.

About R

R is a very popular language for data analytics, created by Ross Ihaka and Robert Gentleman in 1993. Many big companies, such as Google, Facebook, and Airbnb, use it for data analytics. R is a good fit for software developers, statisticians and data miners.

Key Features

  • Interpreted programming language (no compilation required)
  • Provides highly extensible graphical techniques
  • Good community support
  • Free and open-source
  • Handles data very effectively

Syntax help

Data Types

Data type | Description | Usage
Numeric | To represent decimal values | x=1.84
Integer | To represent integer values, L tells to store the value as integer | x=10L
Complex | To represent complex values | x = 10+2i
Logical | To represent boolean values, true or false | x = TRUE
Character | To represent string values | x <- "One compiler"
raw | Holds raw bytes |

Variables

Variables can be assigned using any of the leftward, rightward or equal to operator. You can print the variables using either print or cat functions.

var-name = value
var-name <- value
value -> var-name

Loops

1. IF Family:

If, If-else, Nested-Ifs are used when you want to perform a certain set of operations based on conditional expressions.

If

if(conditional-expression){    
    #code    
} 

If-else

if(conditional-expression){  
    #code if condition is true  
} else {  
    #code if condition is false  
} 

Nested-If-else

if(condition-expression1) {  
    #code if condition-expression1 is true  
} else if(condition-expression2) {  
    #code if condition-expression2 is true  
} else if(condition-expression3) {  
    #code if condition-expression3 is true  
}  
...  
else {  
    #code if all the conditions are false  
}  

2. Switch:

Switch is used to execute one set of statement from multiple conditions.

switch(expression, case-1, case-2, case-3....)   

3. For:

A for loop is used to run a set of statements once for each element of a vector.

for (value in vector) {  
  # code  
} 

4. While:

While is also used to iterate a set of statements based on a condition. Usually while is preferred when the number of iterations is not known in advance.

while(condition) {  
 # code 
}  

5. Repeat:

Repeat is used to iterate a set of statements without any condition. You can write a user-defined condition to exit from the loop using IF.

repeat {   
   #code   
   if(condition-expression) {  
      break  
   }  
} 

Functions

A function is a sub-routine which contains a set of statements. Usually functions are written when multiple calls are required to the same set of statements, which increases reusability and modularity.

How to define a Function

func-name <- function(parameter_1, parameter_2, ...) {  
   #code for function body   
}  

How to call a Function

function_name (parameters)

Vectors

A vector is a basic data structure in which a sequence of data values share the same data type.

For example, the below statement assigns 1 to 10 values to x.
You can also use the seq() function to create vectors.

x <- 1:10
#using seq() function
 x <- seq(1, 10, by=2)

Printing x after the above statement gives the output [1] 1 3 5 7 9.