16 Rcpp

16.1 Getting started with C++

  1. Q: With the basics of C++ in hand, it’s now a great time to practice by reading and writing some simple C++ functions. For each of the following functions, read the code and figure out what the corresponding base R function is. You might not understand every part of the code yet, but you should be able to figure out the basics of what the function does.

    double f1(NumericVector x) {
      int n = x.size();
      double y = 0;
    
      for(int i = 0; i < n; ++i) {
        y += x[i] / n;
      }
      return y;
    }
    
    NumericVector f2(NumericVector x) {
      int n = x.size();
      NumericVector out(n);
    
      out[0] = x[0];
      for(int i = 1; i < n; ++i) {
        out[i] = out[i - 1] + x[i];
      }
      return out;
    }
    
    bool f3(LogicalVector x) {
      int n = x.size();
    
      for(int i = 0; i < n; ++i) {
        if (x[i]) return true;
      }
      return false;
    }
    
    int f4(Function pred, List x) {
      int n = x.size();
    
      for(int i = 0; i < n; ++i) {
        LogicalVector res = pred(x[i]);
        if (res[0]) return i + 1;
      }
      return 0;
    }
    
    NumericVector f5(NumericVector x, NumericVector y) {
      int n = std::max(x.size(), y.size());
      NumericVector x1 = rep_len(x, n);
      NumericVector y1 = rep_len(y, n);
    
      NumericVector out(n);
    
      for (int i = 0; i < n; ++i) {
        out[i] = std::min(x1[i], y1[i]);
      }
    
      return out;
    }

    A: The R equivalents are:

    • f1: mean()
    • f2: cumsum()
    • f3: any()
    • f4: Position()
    • f5: pmin()
  2. Q: To practice your function writing skills, convert the following functions into C++. For now, assume the inputs have no missing values.

    1. all()
    bool allC(LogicalVector x) {
      int n = x.size();
    
      for(int i = 0; i < n; ++i) {
        if (!x[i]) return false;
      }
      return true;
    }
    1. cumprod(), cummin(), cummax().
    NumericVector cumprodC(NumericVector x) {
    
      int n = x.size();
      NumericVector out(n);
    
      out[0] = x[0];
      for(int i = 1; i < n; ++i) {
        out[i]  = out[i - 1] * x[i];
      }
      return out;
    }
    
    NumericVector cumminC(NumericVector x) {
      int n = x.size();
      NumericVector out(n);
    
      out[0] = x[0];
      for(int i = 1; i < n; ++i) {
        out[i]  = std::min(out[i - 1], x[i]);
      }
      return out;
    }
    
    NumericVector cummaxC(NumericVector x) {
      int n = x.size();
      NumericVector out(n);
    
      out[0] = x[0];
      for(int i = 1; i < n; ++i) {
        out[i]  = std::max(out[i - 1], x[i]);
      }
    return out;
    }
    1. diff(). Start by assuming lag 1, and then generalise for lag n.
    NumericVector diffC(NumericVector x){
      int n = x.size();
      NumericVector out(n - 1);
    
      for(int i = 1; i < n; i++){
          out[i - 1] = x[i] - x[i - 1];
      }
      return out ;
    }
    
    NumericVector difflagC(NumericVector x, int lag){
      int n = x.size();
      NumericVector out(n - lag);
    
      for(int i = lag; i < n; i++){
          out[i - lag] = x[i] - x[i - lag];
      }
      return out;
    }'
    1. range.
    NumericVector rangeC(NumericVector x){
      double omin, omax;  
      int n = x.size();
      NumericVector out(2);
    
      omin = x[0];
      omax = x[0];
    
      for(int i = 1; i < n; i++){
          omin = std::min(x[i], omin);
          omax = std::max(x[i], omax);
      }
    
      out[0] = omin;
      out[1] = omax;
      return out;
    1. var. Read about the approaches you can take on wikipedia. Whenever implementing a numerical algorithm, it’s always good to check what is already known about the problem.

16.2 Missing values

  1. Q: Rewrite any of the functions from the first exercise to deal with missing values. If na.rm is true, ignore the missing values. If na.rm is false, return a missing value if the input contains any missing values. Some good functions to practice with are min(), max(), range(), mean(), and var().

    A:

    NumericVector minC(NumericVector x, bool narm){
      int n = x.size();
      LogicalVector index = is_na(x);
      NumericVector omin(1);
      bool na_check = false;
      bool na_check_all = true;
    
      for(int i = 0; i<n; i++){
          if (index[i]) na_check = true;
      }
    
      for(int i = 0; i<n; i++){
          if (!index[i]) na_check_all = false;
      }
    
      if (narm) {
          for(int i = n-1; i >= 0; i--){
              if (!index[i]) omin[0] = x[i];
          }
    
          for(int i = 1; i < n; i++) {
              if (!index[i]) omin[0] = std::min(x[i], omin[0]);
          }
    
          if (na_check_all) {
              omin[0] = NA_REAL;
          }
      } else if (na_check) {
          omin[0] = NA_REAL;
      } else {
          omin[0] = x[0];
          for(int i = 1; i < n; i++){
              omin = std::min(x[i], omin[0]);
          }
      }
    
      return omin;
    }
    
    NumericVector maxC(NumericVector x, bool narm){
      int n = x.size();
      LogicalVector index = is_na(x);
      NumericVector omax(1);
      bool na_check = false;
      bool na_check_all = true;
    
      for(int i = 0; i<n; i++){
          if (index[i]) na_check = true;
      }
    
      for(int i = 0; i<n; i++){
          if (!index[i]) na_check_all = false;
      }
    
      if (narm) {
          for(int i = n-1; i >= 0; i--){
              if (!index[i]) omax[0] = x[i];
          }
    
          for(int i = 1; i < n; i++) {
              if (!index[i]) omax[0] = std::max(x[i], omax[0]);
          }
    
          if (na_check_all) {
              omax[0] = NA_REAL;
          }
      } else if (na_check) {
          omax[0] = NA_REAL;
      } else {
          omax[0] = x[0];
          for(int i = 1; i < n; i++){
              omax = std::max(x[i], omax[0]);
          }
      }
    
      return omax;
    }
    
    NumericVector rangeC(NumericVector x, bool narm){
      NumericVector out(2);
      NumericVector omin(1);
      NumericVector omax(1);
    
      omin = minC(x, narm);
      omax = maxC(x, narm);
    
      out[0] = omin[0];
      out[1] = omax[0];
      return out;
      }
    }
    
    NumericVector meanC(NumericVector x, bool narm){
      int n = x.size();
      LogicalVector index = is_na(x);
      bool na_check = false;
      bool na_check_all = true;
      int n_corrected = 0;
      NumericVector out(1);
    
      for(int i = 0; i<n; i++){
          if (index[i]) na_check = true;
      }
    
      for(int i = 0; i<n; i++){
          if (!index[i]) na_check_all = false;
      }
    
      for(int i = 0; i<n; i++){
          if (!index[i]) n_corrected++;
      }
    
      /* narm = T */
      if (narm){
          if (na_check_all) {
              out[0] = NA_REAL;
          }  else {
              out[0] = 0;
              for(int i = 0; i < n; ++i) {
                  if (!index[i]) out[0] += x[i] / n_corrected;
              }
          }
      }
    
      /* narm = F */
      if (!narm){
          if (na_check) {
              out[0] = NA_REAL;
          } else {
              for(int i = 0; i < n; ++i) {
                  out[0] += x[i] / n;
                }
          }
      }
    
      return out;
    }
  2. Q: Rewrite cumsum() and diff() so they can handle missing values. Note that these functions have slightly more complicated behaviour.

    A:

    NumericVector cumsumC(NumericVector x) {
      int n = x.size();
      NumericVector out(n);
      LogicalVector index = is_na(x);
    
      out[0] = x[0];
      for(int i = 1; i < n; ++i) {
        if (index[i - 1]) {
          out[i] = NA_REAL;
      } else{
          out[i] = out[i - 1] + x[i];
        }
      }
    
      return out;
    }
    
    NumericVector difflagC(NumericVector x, int lag){
      int n = x.size();
      NumericVector out(n - lag);
      LogicalVector index = is_na(x);
    
      for(int i = lag; i < n; i++){
          if ((index[i]) || (index[i - lag])) {
              out[i - lag] = NA_REAL;
          } else {
              out[i - lag] = x[i] - x[i - lag];
          }
      }
      return out;
    }

16.3 The STL

To practice using the STL algorithms and data structures, implement the following using R functions in C++, using the hints provided:

  1. median.default() using partial_sort.

    #include <algorithm>
    #include <Rcpp.h>
    using namespace Rcpp;
    
    // [[Rcpp::export]]
    double medianC(NumericVector x) {
      int n = x.size();
      double out;
      if (n % 2 == 0){
        std::partial_sort (x.begin(), x.begin() + n / 2 + 1, x.end());
        out = (x[n / 2 - 1] + x[n / 2]) / 2;
      } else {
        std::partial_sort (x.begin(), x.begin() + (n + 1) / 2, x.end());
        out = x[(n + 1) / 2 - 1];
      }
    
      return out;
    }
  2. %in% using unordered_set and the find() or count() methods.

  3. unique() using an unordered_set (challenge: do it in one line!).

    // [[Rcpp::plugins(cpp11)]]
    #include <Rcpp.h>
    #include <unordered_set>
    using namespace Rcpp;
    
    // [[Rcpp::export]]
    NumericVector uniqueC(NumericVector x) {
      std::unordered_set<int> seen;
      int n = x.size();
      std::vector<double> out;
    
      for (int i = 0; i < n; ++i) {
        if (seen.insert(x[i]).second) out.push_back(x[i]);
        }
    
      return wrap(out);
    }
  4. min() using std::min(), or max() using std::max().

    #include <Rcpp.h>
    using namespace Rcpp;
    
    // [[Rcpp::export]]
    double minC(NumericVector x){
      int n = x.size();
      double out = x[0];
    
      for (int i = 0; i < n; i++){
        out = std::min(out, x[i]);
      }
    
      return out;
    }
  5. which.min() using min_element, or which.max() using max_element.

    #include <Rcpp.h>
    #include <algorithm>
    #include <iterator>
    
    using namespace Rcpp;
    
    // [[Rcpp::export]]
    double which_minC(NumericVector x){
      int out;
      out = std::distance(x.begin(),std::min_element(x.begin(),x.end()));
      out++;
    
      return out;
    }
  6. setdiff(), union(), and intersect() for integers using sorted ranges and set_union, set_intersection and set_difference.