# World Happiness Report 2023: package setup, data load, missing-value
# inspection and imputation.
#
# Every statement is on its own line: R cannot parse several statements on
# one physical line without separators, and an inline `#` would otherwise
# comment out all code that follows it on that line.

# Packages ----
# Install only the packages that are missing instead of re-installing all of
# them on every run.
required_pkgs <- c(
  "carData", "mice", "VIM", "caret", "corrplot", "moments",
  "lattice", "ggplot2", "tidyverse", "dplyr"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

library(lattice)
library(ggplot2)
library(tidyverse)
library(dplyr)
library(carData)
library(mice)
library(VIM)
library(caret)
library(corrplot)
library(moments)

# Load the data ----
# NOTE(review): absolute, machine-specific path; adjust before running
# elsewhere.
mydata <- read.csv("C:/Users/91626/Downloads/archive/WHR2023YEEEE.csv")
mydata

# First look at the data ----
dim(mydata)
str(mydata)
summary(mydata)
data() # list the data sets available in the loaded packages
head(mydata) # first six rows

# Check for missing values ----
sum(is.na(mydata)) # total number of NA values in the data set
colSums(is.na(mydata)) # NA count per column
md.pattern(mydata, plot = TRUE, rotate.names = TRUE) # missing-data pattern
md.pairs(mydata) # pairwise observed/missing counts

# Recode missing values ----
# Replace NA with the column mean for Generosity and Logged GDP per capita.
mydata$Generosity[is.na(mydata$Generosity)] <-
  mean(mydata$Generosity, na.rm = TRUE)
mydata$Logged.GDP.per.capita[is.na(mydata$Logged.GDP.per.capita)] <-
  mean(mydata$Logged.GDP.per.capita, na.rm = TRUE)
summary(mydata)

# Replace NA in upperwhisker with the fixed value 0.200.
mydata$upperwhisker[is.na(mydata$upperwhisker)] <- 0.200
summary(mydata)

# Exclude missing values ----
# Without na.rm = TRUE the mean is NA as soon as one value is missing.
mean(mydata$Healthy.life.expectancy)
mean(mydata$Healthy.life.expectancy, na.rm = TRUE)

mydata[complete.cases(mydata), ] # rows with complete data
mydata[!complete.cases(mydata), ] # rows with at least one missing value
dim(mydata)
dim(mydata[complete.cases(mydata), ])
dim(mydata[!complete.cases(mydata), ]) # author's note: 39 + 168 = 207

# New data set without missing values ----
mydata_wo_misval <- na.omit(mydata)
dim(mydata_wo_misval)
summary(mydata_wo_misval)

# Impute ----
# Multiple imputation on columns 3 to 10: m = 3 creates three imputed data
# sets, seed = 123 fixes the random number generator for reproducibility.
impute <- mice(mydata[, 3:10], m = 3, seed = 123)
# Outlier handling, data-quality checks, scaling and correlation analysis on
# the imputed data. Every statement is on its own line so the script parses
# and inline comments no longer swallow the code that follows them.

# Use the first of the imputed data sets produced by mice().
mydata_imputed <- complete(impute, 1)
summary(mydata_imputed)
dim(mydata_imputed)

# Explore outliers ----
boxplot(mydata_imputed, plot = TRUE)

# Remove outliers ----
# Keep rows whose Ladder.score lies strictly inside the 1.5 * IQR fences.
quartiles <- quantile(
  mydata_imputed$Ladder.score,
  probs = c(.25, .75),
  na.rm = FALSE
)
# Named ladder_iqr so the variable does not shadow stats::IQR().
ladder_iqr <- IQR(mydata_imputed$Ladder.score)
Lower <- quartiles[1] - 1.5 * ladder_iqr
Upper <- quartiles[2] + 1.5 * ladder_iqr
quartiles[1]
Upper

mydata_no_outlier <- subset(
  mydata_imputed,
  Ladder.score > Lower & Ladder.score < Upper
)
summary(mydata_no_outlier) # results after removing outliers
dim(mydata_no_outlier)
boxplot(mydata_no_outlier, plot = TRUE)

# Find and eliminate erroneous data ----

# Mislabeled variables
mydata <- mydata %>%
  rename(Name_of_Country = Country.name)
summary(mydata)

# Faulty data types
class(mydata$Perceptions.of.corruption)
str(mydata)
# Conversion not required, the column is already numeric:
# mydata$Perceptions.of.corruption <-
#   as.numeric(mydata$Perceptions.of.corruption)

# Non-unique ID numbers
# duplicated()
# distinct()

# String inconsistencies
unique(mydata$region)
# Recoding not required, the strings are already consistent:
# mydata$region <- gsub("(?i)As|(?i)Asia", "1", mydata$region)
mydata$region

# Normalise the data ----

# Log transformation
mydata_log_ladder_score <- log(as.data.frame(mydata_imputed$Ladder.score))
summary(mydata_log_ladder_score)
summary(mydata_imputed$Ladder.score)

# Min-max scaling = (X - min(X)) / (max(X) - min(X))
# method = "range" is required: the preProcess() default is
# c("center", "scale"), which produces z-scores rather than min-max values.
norm_process <- preProcess(
  as.data.frame(mydata_imputed$Ladder.score),
  method = "range"
)
norm_process
mydata_norm_minmax <- predict(
  norm_process,
  as.data.frame(mydata_imputed$Ladder.score)
)
summary(mydata_norm_minmax)

# Standard scaling (z-score) = (X - mu) / sigma
mydata_norm_std <- as.data.frame(scale(mydata_imputed$Ladder.score))
summary(mydata_norm_std)

# Correlation analysis ----
mydata_descrCor <- cor(mydata_imputed)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)

# Skewed data ----
# Skewness of every column, then the density of Ladder.score.
skewness(mydata_imputed)
plot(density(mydata_imputed$Ladder.score))
# Transformation checks, variable selection, train/test split and a multiple
# linear regression on the standardised data. Every statement is on its own
# line so the script parses and inline comments no longer swallow code.

hist(mydata_imputed$Ladder.score)

# Log transformation (disabled by the author)
# skewness(mydata_log_gdp)
# plot(density(mydata_log_gdp$`mydata_imputed$Logged GDP per capita`))

# Square-root transform
# NOTE(review): despite its name, mydata_sqrt_gdp holds the square root of
# Ladder.score, not of GDP. The name is kept for compatibility.
mydata_sqrt_gdp <- sqrt(mydata_imputed$Ladder.score)
skewness(mydata_sqrt_gdp)
plot(density(mydata_sqrt_gdp))

# Choose normalisation technique ----
library(ggplot2)
library(tidyverse)

# Columns are referenced by name inside aes(); `$` inside aes() bypasses
# ggplot2's data masking and triggers a warning.
ggplot(mydata_no_outlier, aes(x = log(Ladder.score))) +
  geom_density(fill = "blue")
ggplot(mydata_no_outlier, aes(x = sqrt(Ladder.score))) +
  geom_density(fill = "blue")

# NOTE(review): despite its name, mydata_norm_log is z-scored with scale(),
# not log-transformed. scale() returns a numeric matrix.
mydata_norm_log <- mydata_no_outlier
mydata_norm_log
dim(mydata_norm_log)
mydata_norm_log <- scale(mydata_norm_log)
summary(mydata_norm_log)
dim(mydata_norm_log)
str(mydata_norm_log)

# Choose variables ----

# Correlation
library(corrplot)
mydata_descrCor <- cor(mydata_norm_log)
dim(mydata_descrCor)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)
mydata_imputed$Standard.error.of.ladder.score

# Drop variables
mydata_mlr <- subset(
  mydata_norm_log,
  select = -c(Standard.error.of.ladder.score, upperwhisker, lowerwhisker)
)
mydata_descrCor <- cor(mydata_mlr)
corrplot(
  mydata_descrCor,
  order = "FPC",
  method = "color",
  type = "lower",
  tl.cex = 0.7,
  tl.col = rgb(0, 0, 0)
)

# Split the sample data ----
# 80 % of the rows go to training, the rest to testing.
set.seed(1)
n_rows <- nrow(mydata_mlr)
row.number <- sample(seq_len(n_rows), floor(0.8 * n_rows))
train <- mydata_mlr[row.number, ]
test <- mydata_mlr[-row.number, ]
dim(train)
dim(test)

# Model building ----
train_df <- as.data.frame(train)

# The response is named by its bare column name. With
# `train_df$Ladder.score ~ .` the dot would expand to every column of
# train_df, including Ladder.score itself, giving a trivially perfect fit.
model1 <- lm(Ladder.score ~ ., data = train_df)
summary(model1)

# Both predictors are removed inside ONE formula. A second, comma-separated
# argument is not part of the formula, so Generosity was never dropped.
model2 <- update(model1, . ~ . - Perceptions.of.corruption - Generosity)
summary(model2)

# Reading the summary:
# - |t| >= 2 for a coefficient
# - F-statistic should be greater than 1
# - multiple R-squared and adjusted R-squared
# - R-squared = explained variation / total variation
par(mfrow = c(2, 2))
plot(model2)

# Predictions ----
test_df <- as.data.frame(test)
pred1 <- predict(model2, newdata = test_df)

# The response was standardised with scale(), not log-transformed, so the
# predictions are compared with the observed values directly (no exp()).
rmse <- sqrt(mean((pred1 - test_df$Ladder.score)^2))
c(RMSE = rmse, R2 = summary(model2)$r.squared)

par(mfrow = c(1, 1))
plot(test_df$Ladder.score, pred1)
Write, Run & Share R Language code online using OneCompiler's R Language online compiler for free. It's one of the robust, feature-rich online compilers for R language, running on the latest version 3.4. Getting started with the OneCompiler's R Language compiler is simple and pretty fast. The editor shows sample boilerplate code when you choose R as the language and start coding.
R, created by Ross Ihaka and Robert Gentleman in 1993, is very popular for data analytics. Many big companies like Google, Facebook, Airbnb etc. use this language for data analytics. R is good for software developers, statisticians and data miners.
Data type | Description | Usage |
---|---|---|
Numeric | To represent decimal values | x=1.84 |
Integer | To represent integer values, L tells to store the value as integer | x=10L |
Complex | To represent complex values | x = 10+2i |
Logical | To represent boolean values, true or false | x = TRUE |
Character | To represent string values | x <- "One compiler" |
Raw | To hold raw bytes | x <- charToRaw("A") |
Variables can be assigned using any of the leftward, rightward or equal to operator. You can print the variables using either print or cat functions.
var-name = value
var-name <- value
value -> var-name
If, If-else, Nested-Ifs are used when you want to perform a certain set of operations based on conditional expressions.
if(conditional-expression){
#code
}
if(conditional-expression){
#code if condition is true
} else {
#code if condition is false
}
if(condition-expression1) {
#code if above condition is true
} else if(condition-expression2){
#code if above condition is true
}
else if(condition-expression3) {
#code if above condition is true
}
...
else {
#code if all the conditions are false
}
Switch is used to execute one set of statements chosen from multiple cases.
switch(expression, case-1, case-2, case-3....)
For loop is used to iterate a set of statements based on a condition.
for (value in vector) {
# code
}
While is also used to iterate a set of statements based on a condition. Usually while is preferred when the number of iterations is not known in advance.
while(condition) {
# code
}
Repeat is used to iterate a set of statements without any condition. You can write a user-defined condition to exit from the loop using `if`.
repeat {
#code
if(condition-expression) {
break
}
}
A function is a sub-routine which contains a set of statements. Usually functions are written when multiple calls are required to the same set of statements, which increases reusability and modularity.
func-name <- function(parameter_1, parameter_2, ...) {
#code for function body
}
function_name (parameters)
A vector is a basic data structure where a sequence of data values share the same data type.
For example, the below statement assigns 1 to 10 values to x.
You can also use the seq() function to create vectors.
x <- 1:10
#using seq() function
x <- seq(1, 10, by=2)
After the above statement, printing x gives the output [1] 1 3 5 7 9.