#install.packages("GRange")
   library(titanic)    # loads titanic_train data frame
      library(caret)
      library(tidyverse)
      library(rpart)
      
      # Print numeric results with three significant digits.
      options(digits = 3)

      # Build the modelling table from `titanic_train` (loaded with the titanic
      # package): the outcome and the port of embarkation become factors, missing
      # ages are filled with the median of the observed ages, and FamilySize
      # counts the passenger plus siblings/spouses and parents/children aboard.
      titanic_clean <- titanic_train %>%
        mutate(
          Survived = factor(Survived),
          Embarked = factor(Embarked),
          Age = replace(Age, is.na(Age), median(Age, na.rm = TRUE)),
          FamilySize = SibSp + Parch + 1
        ) %>%
        # Keep only the nine columns used by the exercises, in this order.
        select(Survived, Sex, Pclass, Age, Fare, SibSp, Parch, FamilySize, Embarked)
# Q1: Training and test sets
      # Split titanic_clean into test and training sets - after running the setup
      # code it should have 891 rows and 9 variables.
      # Set the seed to 42, then use caret to create a 20% data partition based on
      # the Survived column: that partition is test_set, the remaining 80% is
      # train_set.
      # How many observations are in the training set?
      set.seed(42) # if R version >= 3.6, sample.kind = 'Rounding'
      test_index <- createDataPartition(
        y = titanic_clean$Survived,
        times = 1,
        p = 0.2,
        list = FALSE
      )
      test_set <- titanic_clean[test_index, ]
      train_set <- titanic_clean[-test_index, ]
      # Sizes of the two partitions.
      nrow(train_set)
      nrow(test_set)
      # Proportion of survivors in the training set.
      mean(train_set$Survived == 1)
# Q2: Baseline prediction by guessing the outcome
      # The simplest prediction method is randomly guessing the outcome without
      # using any predictor; it tells us whether a machine learning algorithm
      # performs better than chance.
      # Set the seed to 3. For each individual in the test set, randomly guess
      # whether that person survived by sampling from c(0, 1) (with the default
      # prob argument of sample()).
      # What is the accuracy of this guessing method?
      set.seed(3) # if R version >= 3.6, sample.kind = 'Rounding'
      guess_ <- sample(x = c(0, 1), size = nrow(test_set), replace = TRUE)
      # Accuracy = number of rows whose outcome equals the guess / all test rows.
      test_set[test_set$Survived == guess_, ] %>%
        summarise(n() / nrow(test_set))
      # Equivalent formulation, kept for reference:
      # guess <- sample(c(0,1), nrow(test_set), replace = TRUE)
      # mean(guess == test_set$Survived)
 # Q3a: Predicting survival by sex
      # Use the training set to determine whether members of a given sex were
      # more likely to survive or die, then apply this to the test set.
      # What proportion of training set females survived?
      train_set %>%
        group_by(Sex) %>%
        summarise(Survived = mean(Survived == 1))
# Q3b: Predicting survival by sex
      # Predict survival for every individual of a sex whose survival rate is
      # over 0.5 and death otherwise. The rule is written out directly here:
      # a prediction is correct for a surviving female or a non-surviving male.
      # What is the accuracy of this sex-based prediction method on the test set?
      test_set %>%
        summarise((sum(Sex == "female" & Survived == 1) + sum(Sex == "male" & Survived == 0)) / n())
# Q4a: Predicting survival by passenger class
      # In the training set, for which class(es) (Pclass) were passengers more
      # likely to survive than die?
      #
      # The per-class rule is learned from train_set only. Learning it from
      # titanic_clean would include the test rows and leak them into the
      # accuracy computed in Q4b. The threshold is strictly "> 0.5" to match
      # the stated rule (survival rate over 0.5).
      survival_class <- train_set %>%
        group_by(Pclass) %>%
        summarize(PredictingSurvival = as.numeric(mean(Survived == 1) > 0.5))
      survival_class
      # Survival rate per class in the training set.
      train_set %>%
        group_by(Pclass) %>%
        summarize(Survived = mean(Survived == 1))
# Q4b: Predicting survival by passenger class
      # Predict survival using passenger class on the test set: predict survival
      # if the survival rate for a class is over 0.5, otherwise predict death.
      # What is the accuracy of this class-based prediction method on the test set?
      # The join attaches each passenger's class-level prediction; accuracy is
      # the share of rows where it equals the observed outcome.
      test_set %>%
        inner_join(survival_class, by = "Pclass") %>%
        summarize(PredictingSurvival = mean(Survived == PredictingSurvival))
# Q4c: Predicting survival by passenger class
      # Use the training set to group passengers by both sex and passenger class.
      # Which sex and class combinations were more likely to survive than die
      # (i.e. >50% survival)?
      #
      # The lookup table is learned from train_set only, so the test rows do not
      # leak into the rule evaluated in Q4d. It is ungrouped afterwards so that
      # no grouping is carried into the join.
      survival_class <- train_set %>%
        group_by(Sex, Pclass) %>%
        summarize(PredictingSurvival = as.numeric(mean(Survived == 1) > 0.5)) %>%
        ungroup()
      survival_class
      # Option: list only the combinations with a survival rate above 0.5.
      train_set %>%
        group_by(Sex, Pclass) %>%
        summarize(Survived = mean(Survived == 1)) %>%
        filter(Survived > 0.5)
# Q4d: Predicting survival by passenger class
      # Predict survival using both sex and passenger class on the test set:
      # predict survival if the survival rate for a sex/class combination is over
      # 0.5, otherwise predict death.
      # What is the accuracy of this sex- and class-based prediction method on
      # the test set?
      test_set %>%
        inner_join(survival_class, by = c("Sex", "Pclass")) %>%
        summarize(PredictingSurvival = mean(Survived == PredictingSurvival))
# Option: the same rule written out by hand (females outside third class
# are predicted to survive).
      sex_class_model <- ifelse(test_set$Sex == "female" & test_set$Pclass != 3, 1, 0)
      mean(sex_class_model == test_set$Survived)
# Q5a: Confusion matrix
      # Use the confusionMatrix() function to create confusion matrices for the
      # sex model, class model, and combined sex and class model. Predictions and
      # survival status must be factors to use this function.
      # What is the "positive" class used to calculate confusion matrix metrics?
      #
      # Each model is a lookup table learned on train_set (predict 1 when the
      # group's training survival rate exceeds 0.5) that is joined onto test_set.
      # Predictions are converted to factors with the same levels as Survived, so
      # confusionMatrix() still works if a model never predicts one of the classes.
      # Metrics are read from the byClass element of the confusionMatrix object
      # rather than through tidy(), because library(tidyverse) does not attach
      # broom.
      cm_metrics <- c("Sensitivity", "Specificity", "Balanced Accuracy")

      # Confusion Matrix: sex model
      sex_model <- train_set %>%
        group_by(Sex) %>%
        summarize(Survived_predict = as.numeric(mean(Survived == 1) > 0.5))
      test_set1 <- test_set %>%
        inner_join(sex_model, by = "Sex")
      cm1 <- confusionMatrix(
        data = factor(test_set1$Survived_predict, levels = levels(test_set1$Survived)),
        reference = test_set1$Survived
      )
      cm1$positive          # class treated as "positive" by caret
      cm1$byClass[cm_metrics]

      # Confusion Matrix: class model
      class_model <- train_set %>%
        group_by(Pclass) %>%
        summarize(Survived_predict = as.numeric(mean(Survived == 1) > 0.5))
      test_set2 <- test_set %>%
        inner_join(class_model, by = "Pclass")
      cm2 <- confusionMatrix(
        data = factor(test_set2$Survived_predict, levels = levels(test_set2$Survived)),
        reference = test_set2$Survived
      )
      cm2$byClass[cm_metrics]

      # Confusion Matrix: sex and class model
      sex_class_model <- train_set %>%
        group_by(Sex, Pclass) %>%
        summarize(Survived_predict = as.numeric(mean(Survived == 1) > 0.5)) %>%
        ungroup()
      test_set3 <- test_set %>%
        inner_join(sex_class_model, by = c("Sex", "Pclass"))
      cm3 <- confusionMatrix(
        data = factor(test_set3$Survived_predict, levels = levels(test_set3$Survived)),
        reference = test_set3$Survived
      )
      cm3$byClass[cm_metrics]
# Q5b: Confusion matrix
      # What is the maximum value of balanced accuracy from Q5a?
      #
      # sex_model, class_model and sex_class_model are summary tables at this
      # point, not per-passenger prediction vectors, so they cannot be passed to
      # confusionMatrix(). The confusion matrices cm1, cm2 and cm3 built in Q5a
      # are reused instead.
      cm1
      cm2
      cm3
      balanced_accuracy <- c(
        sex = cm1$byClass[["Balanced Accuracy"]],
        class = cm2$byClass[["Balanced Accuracy"]],
        sex_class = cm3$byClass[["Balanced Accuracy"]]
      )
      balanced_accuracy
      max(balanced_accuracy)
# Q6: F1 scores
      # Use the F_meas() function to calculate F1 scores for the sex model, class
      # model, and combined sex and class model. Predictions must be converted to
      # factors to use this function.
      # Which model has the highest F1 score?
      #
      # `data` takes the predictions and `reference` the observed outcome.
      # The predictions come from the Survived_predict columns joined onto the
      # test set in Q5a (test_set1, test_set2, test_set3); the *_model objects
      # themselves are summary tables and cannot be used as predictions.
      # Fixing the factor levels keeps F_meas() working if a model never
      # predicts one of the classes.
      F_meas(
        data = factor(test_set1$Survived_predict, levels = levels(test_set1$Survived)),
        reference = test_set1$Survived
      )
      F_meas(
        data = factor(test_set2$Survived_predict, levels = levels(test_set2$Survived)),
        reference = test_set2$Survived
      )
      F_meas(
        data = factor(test_set3$Survived_predict, levels = levels(test_set3$Survived)),
        reference = test_set3$Survived
      )
# Titanic Exercises, part 2
     # Q7: Survival by fare - LDA and QDA
      # Set the seed to 1. Train a linear discriminant analysis (LDA) model with
      # the caret lda method using fare as the only predictor.
      # What is the accuracy on the test set for the LDA model?
      fit_lda <- train(Survived ~ Fare, method = "lda", data = train_set)
      Survived_hat <- predict(fit_lda, newdata = test_set)
      mean(Survived_hat == test_set$Survived)
# Q7: Train a quadratic discriminant analysis (QDA) model with the caret qda
# method using fare as the only predictor. What is the accuracy on the test set
# for the QDA model?
      fit_qda <- train(Survived ~ Fare, method = "qda", data = train_set)
      Survived_hat <- predict(fit_qda, newdata = test_set)
      mean(Survived_hat == test_set$Survived)
# Option: the same two fits, each preceded by the seed the exercise asks for.
      # set.seed(1) # if using R 3.5 or earlier
      set.seed(1) # if using R 3.6 or later, sample.kind = "Rounding"
      train_lda <- train(Survived ~ Fare, data = train_set, method = "lda")
      lda_preds <- predict(train_lda, newdata = test_set)
      mean(test_set$Survived == lda_preds)
      # set.seed(1) # if using R 3.5 or earlier
      set.seed(1) # if using R 3.6 or later, sample.kind = "Rounding"
      train_qda <- train(Survived ~ Fare, data = train_set, method = "qda")
      qda_preds <- predict(train_qda, newdata = test_set)
      mean(test_set$Survived == qda_preds)
  # Note: when training models for Titanic Exercises Part 2, please use the S3
  # method for class formula rather than the default S3 method of caret train()
  # (see ?caret::train for details).
# Q8: Logistic regression models
      # Set the seed to 1. Train a logistic regression model with the caret glm
      # method using age as the only predictor.
      # What is the accuracy of your model (using age as the only predictor) on
      # the test set?
      #
      # The three fits below call glm() directly. predict() on a glm returns the
      # linear predictor (log-odds) by default, so ">= 0" is the same decision
      # rule as "predicted probability >= 0.5".
      fit_logreg_a <- glm(Survived ~ Age, data = train_set, family = "binomial")
      survived_hat_a <- ifelse(predict(fit_logreg_a, test_set) >= 0, 1, 0)
      mean(survived_hat_a == test_set$Survived)
# Set the seed to 1. Train a logistic regression model with the caret glm method
# using four predictors: sex, class, fare, and age.
      # What is the accuracy of your model (using these four predictors) on the
      # test set?
      fit_logreg_b <- glm(Survived ~ Sex + Pclass + Fare + Age, data = train_set, family = "binomial")
      survived_hat_b <- ifelse(predict(fit_logreg_b, test_set) >= 0, 1, 0)
      mean(survived_hat_b == test_set$Survived)
# Set the seed to 1. Train a logistic regression model with the caret glm method
# using all predictors. Ignore warnings about rank-deficient fit.
      # What is the accuracy of your model (using all predictors) on the test set?
      str(train_set)
      fit_logreg_c <- glm(Survived ~ ., data = train_set, family = "binomial")
      survived_hat_c <- ifelse(predict(fit_logreg_c, test_set) >= 0, 1, 0)
      mean(survived_hat_c == test_set$Survived)
# Option: the same models fitted through caret::train().
      # set.seed(1) # if using R 3.5 or earlier
      set.seed(1, sample.kind = "Rounding") # if using R 3.6 or later
      train_glm <- train(Survived ~ Sex + Pclass + Fare + Age, method = "glm", data = train_set)
      glm_preds <- predict(train_glm, test_set)
      mean(glm_preds == test_set$Survived)
      # set.seed(1) # if using R 3.5 or earlier
      # The trailing note is now a comment; without the "#" this line did not parse.
      set.seed(1, sample.kind = "Rounding") # if using R 3.6 or later
      train_glm_all <- train(Survived ~ ., method = "glm", data = train_set)
      glm_all_preds <- predict(train_glm_all, test_set)
      mean(glm_all_preds == test_set$Survived)
# Q9a: kNN model
      # Set the seed to 6. Train a kNN model on the training set using the caret
      # train function. Try tuning with k = seq(3, 51, 2).
      # What is the optimal value of the number of neighbors k?
      #
      # The seed call is written the same way as the one in the option below, so
      # both fits start from the same RNG state. The previous form
      # `set.seed(6), sample.kind = "Rounding"` did not parse. To reproduce the
      # R 3.5 sampling on R >= 3.6, use set.seed(6, sample.kind = "Rounding").
      set.seed(6) # if using R 3.6 or later, sample.kind = "Rounding"
      # Method below doesn't give same result as EdX (though it is correct)
      # ks <- seq(3,51,2)
      # res_knn9a <- sapply(ks, function(k) {
      #     fit_knn9a <- knn3(Survived ~ ., data = train_set, k = k)
      #     survived_hat <- predict(fit_knn9a, train_set, type = "class") %>% factor(levels = levels(train_set$Survived))
      #     cm_test <- confusionMatrix(data = survived_hat, reference = train_set$Survived)
      #     cm_test$overall["Accuracy"]
      # })
      # ks[which.max(res_knn9a)]
      # Other method using train function: the tuning grid needs a column named k.
      k <- seq(3, 51, 2)
      fit_knn9a <- train(Survived ~ ., data = train_set, method = "knn", tuneGrid = data.frame(k = k))
      fit_knn9a$bestTune
# Option: same fit with the grid built inline.
      # set.seed(6)
      set.seed(6) # if using R 3.6 or later, sample.kind = "Rounding"
      train_knn <- train(Survived ~ .,
                         method = "knn",
                         data = train_set,
                         tuneGrid = data.frame(k = seq(3, 51, 2)))
      train_knn$bestTune
# Q9b: kNN model
      # Plot the kNN model to investigate the relationship between the number of
      # neighbors and accuracy on the training set.
      # Of these values of k, which yields the highest accuracy?
      ggplot(train_knn)
# Q9c: kNN model
      # What is the accuracy of the kNN model on the test set?
      # First via a confusion matrix on the fit_knn9a predictions ...
      survived_hat <- factor(
        predict(fit_knn9a, test_set),
        levels = levels(test_set$Survived)
      )
      cm_test <- confusionMatrix(data = survived_hat, reference = test_set$Survived)
      cm_test$overall["Accuracy"]
      # ... then directly as the share of matching predictions from train_knn.
      knn_preds <- predict(train_knn, test_set)
      mean(test_set$Survived == knn_preds)
# Q10: Cross-validation
      # Set the seed to 8 and train a new kNN model. Instead of the default
      # training control, use 10-fold cross-validation where each partition
      # consists of 10% of the total. Try tuning with k = seq(3, 51, 2).
      # What is the optimal value of k using cross-validation?
      # set.seed(8)
      set.seed(8)    # simulate R 3.5, sample.kind = "Rounding"
      train_knn_cv <- train(
        Survived ~ .,
        data = train_set,
        method = "knn",
        trControl = trainControl(method = "cv", number = 10, p = 0.9),
        tuneGrid = data.frame(k = seq(3, 51, 2))
      )
      train_knn_cv$bestTune
      # Accuracy of the cross-validated model on the test set.
      knn_cv_preds <- predict(train_knn_cv, test_set)
      mean(test_set$Survived == knn_cv_preds)
# Q11a: Classification tree model
      # Set the seed to 10. Use caret to train a decision tree with the rpart
      # method. Tune the complexity parameter with cp = seq(0, 0.05, 0.002).
      # What is the optimal value of the complexity parameter (cp)?
      # set.seed(10)
      set.seed(10)    # simulate R 3.5, sample.kind = "Rounding"
      train_rpart <- train(
        Survived ~ .,
        data = train_set,
        method = "rpart",
        tuneGrid = data.frame(cp = seq(0, 0.05, 0.002))
      )
      train_rpart$bestTune
# Accuracy of the tuned tree on the test set.
      rpart_preds <- predict(train_rpart, test_set)
      mean(test_set$Survived == rpart_preds)
# Q11b: Classification tree model
      # Inspect the final model and plot the decision tree.
      # Which variables are used in the decision tree?
      # Select ALL that apply.
      train_rpart$finalModel # print the splits of the final model

      # Draw the tree, then label its nodes.
      plot(train_rpart$finalModel, margin = 0.1)
      text(train_rpart$finalModel)
#Q11c: Classification tree model
      #Using the decision rules generated by the final model, predict whether the following individuals would survive.     
# Q12: Random forest model
      # Set the seed to 14. Use the caret train() function with the rf method to
      # train a random forest. Test values of mtry = seq(1:7). Set ntree to 100.
      # What mtry value maximizes accuracy?
      #
      # The seed call previously read `set.seed(14), sample.kind = 'Rounding'`,
      # which does not parse. It now follows the form used for the other models;
      # to reproduce the R 3.5 sampling on R >= 3.6, use
      # set.seed(14, sample.kind = "Rounding").
      set.seed(14) # if using R 3.6 or later, sample.kind = "Rounding"
      fit12_rf <- train(Survived ~ .,
                        data = train_set,
                        method = "rf",
                        tuneGrid = data.frame(mtry = seq(1, 7)),
                        ntree = 100)
      fit12_rf$bestTune
      # Accuracy of the tuned forest on the test set.
      survived_hat <- predict(fit12_rf, test_set)
      mean(survived_hat == test_set$Survived)
      # Variable importance of the fitted forest.
      varImp(fit12_rf)

R Language Online Compiler

Write, Run & Share R Language code online using OneCompiler's R Language online compiler for free. It's one of the robust, feature-rich online compilers for R language, running on the latest version 3.4. Getting started with the OneCompiler's R Language compiler is simple and pretty fast. The editor shows sample boilerplate code when you choose language as R and start coding.

About R

R is a very popular language for data analytics, created by Ross Ihaka and Robert Gentleman in 1993. Many big companies such as Google, Facebook and Airbnb use it for data analytics. R is well suited to software developers, statisticians and data miners.

Key Features

  • Interpreted programming language(no compilation required)
  • provides highly extensible graphical techniques.
  • Good community support
  • Free and open-source
  • Handles data very effectively.

Syntax help

Data Types

| Data type | Description | Usage |
|-----------|-------------|-------|
| Numeric | To represent decimal values | x=1.84 |
| Integer | To represent integer values, L tells to store the value as integer | x=10L |
| Complex | To represent complex values | x = 10+2i |
| Logical | To represent boolean values, true or false | x = TRUE |
| Character | To represent string values | x <- "One compiler" |
| raw | Holds raw bytes | |

Variables

Variables can be assigned using any of the leftward, rightward or equal to operator. You can print the variables using either print or cat functions.

var-name = value
var-name <- value
value -> var-name

Loops

1. IF Family:

If, If-else, Nested-Ifs are used when you want to perform a certain set of operations based on conditional expressions.

If

if(conditional-expression){    
    #code    
} 

If-else

if(conditional-expression){  
    #code if condition is true  
} else {  
    #code if condition is false  
} 

Nested-If-else

if(condition-expression1) {  
    #code if above condition is true  
} else if(condition-expression2){  
    #code if above condition is true  
} else if(condition-expression3) {  
    #code if above condition is true  
...  
} else {  
    #code if all the conditions are false  
}  

2. Switch:

Switch is used to execute one set of statement from multiple conditions.

switch(expression, case-1, case-2, case-3....)   

3. For:

For loop is used to iterate a set of statements based on a condition.

for (value in vector) {  
  # code  
} 

4. While:

While is also used to iterate a set of statements based on a condition. Usually while is preferred when number of iterations are not known in advance.

while(condition) {  
 # code 
}  

5. Repeat:

Repeat is used to iterate a set of statements without any condition. You can write a user-defined condition to exit from the loop using IF.

repeat {   
   #code   
   if(condition-expression) {  
      break  
   }  
} 

Functions

A function is a sub-routine which contains a set of statements. Functions are usually written when the same set of statements has to be called multiple times, which increases reusability and modularity.

How to define a Function

func-name <- function(parameter_1, parameter_2, ...) {  
   #code for function body   
}  

How to call a Function

function_name (parameters)

Vectors

A vector is a basic data structure in which a sequence of data values share the same data type.

For example, the below statement assigns 1 to 10 values to x.
You can also use the seq() function to create vectors.

x <- 1:10
#using seq() function
 x <- seq(1, 10, by=2)

After the seq() statement above, printing x gives [1] 1 3 5 7 9.