From 92ae47514b9b71929c2548e0d6a7fc7afbb6caac Mon Sep 17 00:00:00 2001
From: Bernhard Meindl <bernhard.meindl@statistik.gv.at>
Date: Wed, 20 Jul 2022 09:58:09 +0200
Subject: [PATCH 1/3] check that carry_along variables are at household-level

---
 DESCRIPTION                   |   4 +-
 NEWS                          |   2 +
 R/createDat.R                 |  43 +++--
 R/recordSwap.R                | 178 +++++++++++--------
 man/recordSwap.Rd             |   5 +-
 src/recordSwap/recordSwap.cpp | 326 ++++++++++++++++++----------------
 vignettes/recordSwapping.Rmd  |   2 +-
 7 files changed, 317 insertions(+), 243 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3ce646ff..0f35d7d8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -2,8 +2,8 @@ Package: sdcMicro
 Type: Package
 Title: Statistical Disclosure Control Methods for Anonymization of Data and
     Risk Estimation
-Version: 5.7.1
-Date: 2022-07-05
+Version: 5.7.1.99
+Date: 2022-07-20
 Authors@R: c(
   person("Matthias", "Templ", email="matthias.templ@gmail.com", role = c("aut", "cre"), comment=c(ORCID="0000-0002-8638-5276")),
   person("Bernhard", "Meindl", email = "Bernhard.Meindl@statistik.gv.at", role = c("aut")),
diff --git a/NEWS b/NEWS
index 2794a614..8c550562 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,8 @@
 - Fix warnings with clang-devel related targeted record swapping
 - Fix a note in vignette title for TRS
 - Remove travis and use Github workflows to check the package
+- Small fix for `createDat()` that removes a possible warning
+- Only allow household-level variables in argument `carry_along` for TRS
 
 # 5.7.1
 - Bugfix in `extractManipData()` with only a single categorical variable, thx @tamertemizer for reporting
diff --git a/R/createDat.R b/R/createDat.R
index 2e27de1d..95b2e1fc 100644
--- a/R/createDat.R
+++ b/R/createDat.R
@@ -12,7 +12,7 @@
 #' @return `data.table` containing dummy data
 #' @rdname recordSwap
 #' @export
-createDat <- function(N = 10000) {
+createDat <- function(N=10000) {
   stopifnot(is.numeric(N))
   stopifnot(N > 1)
   N <- ceiling(N)
@@ -23,19 +23,32 @@ createDat <- function(N = 10000) {
   hsize <- sample(1:6, N, replace = TRUE)
   htype <- sample(1:10, N, replace = TRUE)
   hincome <- sample(1:10, N, replace = TRUE)
-  dat <- data.table(
-    nuts1 = rep(nuts1, times = hsize),
-    nuts2 = rep(nuts2, times = hsize),
-    nuts3 = rep(nuts3, times = hsize),
-    lau2 = rep(lau2, times = hsize),
-    hid = rep(1:length(hsize), times = hsize),
-    hsize = rep(hsize, times = hsize),
-    ageGroup = sample(1:7, length(hsize), replace = TRUE),
-    gender = sample(c(1, 2), length(hsize), replace = TRUE),
-    national = sample(1:5, length(hsize), replace = TRUE),
-    htype = rep(htype, times = hsize),
-    hincome = rep(hincome, times = hsize)
-  )
+
+  # replicate
+  hid <- rep(1:length(hsize), times = hsize)
+  nuts1 <- rep(nuts1, times = hsize)
+  nuts2 <- rep(nuts2, times = hsize)
+  nuts3 <- rep(nuts3, times = hsize)
+  lau2 <- rep(lau2, times = hsize)
+  htype <- rep(htype, times = hsize)
+  hincome <- rep(hincome, times = hsize)
+  hsize <- rep(hsize, times = hsize)
+  gender <- sample(c(1, 2), length(hsize), replace = TRUE)
+  ageGroup <- sample(1:7, length(hsize), replace = TRUE)
+  national <- sample(1:5, length(hsize), replace = TRUE)
+
+  # create data.table
+  dat <- data.table(nuts1,
+                    nuts2,
+                    nuts3,
+                    lau2,
+                    hid,
+                    hsize,
+                    ageGroup,
+                    gender,
+                    national,
+                    htype,
+                    hincome)
 
   # hierarchy for regional variables
   help_0 <- c("", "0", "00", "000")
@@ -43,5 +56,5 @@ createDat <- function(N = 10000) {
   dat[, nuts3 := paste0(nuts2, help_0[3 - nchar(nuts3)], nuts3)]
   dat[, lau2 := paste0(nuts3, help_0[5 - nchar(nuts3)], lau2)]
   dat[, colnames(dat) := lapply(.SD, as.integer)]
-  return(dat)
+  return(dat[])
 }
diff --git a/R/recordSwap.R b/R/recordSwap.R
index 901ea9ec..c9cba6f6 100644
--- a/R/recordSwap.R
+++ b/R/recordSwap.R
@@ -101,7 +101,8 @@
 #' besides to hierarchy variables. These variables do not interfere with the
 #' procedure of finding a record to swap with or calculating risk. This
 #' parameter is only used at the end of the procedure when swapping the
-#' hierarchies.
+#' hierarchies. However, any variables specified need to be at household
+#' level which means that only identical values within `hid` are allowed.
 #' @param return_swapped_id, boolean if `TRUE` the output includes an
 #' additional column showing the `hid` with which a record was swapped with.
 #' The new column will have the name `paste0(hid,"_swapped")`.
@@ -121,7 +122,7 @@
 #' seed <- 2021
 #' set.seed(seed)
 #' nhid <- 10000
-#' dat <- sdcMicro::createDat(nhid)
+#' dat <- createDat(nhid)
 #'
 #' # define paramters for swapping
 #' k_anonymity <- 1
@@ -256,19 +257,20 @@ recordSwap.default <- function(data, hid, hierarchy, similar,
     stop("return_swapped_id must be logical of length 1")
   }
 
-  if(return_swapped_id==TRUE){
-    orig_id <- cnames[hid+1]
-    swapped_id <- paste0(orig_id,"_swapped")
-    data[,c(swapped_id):=get(orig_id)]
+  if (return_swapped_id == TRUE) {
+    orig_id <- cnames[hid + 1]
+    swapped_id <- paste0(orig_id, "_swapped")
+    data[, c(swapped_id) := get(orig_id)]
     cnames <- copy(colnames(data))
-
-    swapped_id <- checkIndexString(swapped_id,cnames,
-                                   matchLength = 1)
-    carry_along <- c(carry_along,swapped_id)
+    swapped_id <- checkIndexString(swapped_id, cnames, matchLength = 1)
+    carry_along <- c(carry_along, swapped_id)
   }
 
   # check k_anonymity
-  if(!all((!is.null(risk_variables))&checkInteger(k_anonymity)&length(k_anonymity)==1&k_anonymity>=0)){
+  if (!all((!is.null(risk_variables)) &
+           checkInteger(k_anonymity) &
+           length(k_anonymity) == 1 &
+           k_anonymity >= 0)) {
     stop("k_anonymity must be a positiv single integer!")
   }
 
@@ -285,19 +287,22 @@ recordSwap.default <- function(data, hid, hierarchy, similar,
   }
 
   # check risk
-  if(is.null(risk)){
+  if (is.null(risk)) {
     risk <- data.table()
     risk_threshold <- 0
   }
-  if(is.vector(risk)){
-    if(length(risk)!=length(hierarchy)){
+  if (is.vector(risk)) {
+    if (length(risk) != length(hierarchy)) {
       stop("risk and hierarchy need to address the same number of columns!")
     }
-    risk <- checkIndexString(risk,cnames,minLength = 1)
-    risk <- data[,c(risk+1)]
-  }else{
-    if(all(!class(risk)%in%c("data.table","data.frame","matrix"))){
-      stop("If risk is not a vector containing column indices or column names in data then risk must be either a data.table, data.frame or matrix!")
+    risk <- checkIndexString(risk, cnames, minLength = 1)
+    risk <- data[, c(risk + 1)]
+  } else {
+    if (all(!class(risk) %in% c("data.table", "data.frame", "matrix"))) {
+      stop(
+        "If risk is not a vector containing column indices or column names ",
+        "in data then risk must be either a data.table, data.frame or matrix!"
+      )
     }
   }
 
@@ -310,16 +315,20 @@ recordSwap.default <- function(data, hid, hierarchy, similar,
   cnamesrisk <- copy(colnames(risk))
   risk <- data.table(risk)
 
-  if(nrow(risk)>0){
-    if(is.null(cnamesrisk)){
-      message("risk does not contain column names; the first column in risk will be used for the first hierarchy level, e.g ",cnames[hierarchy[1]+1]," and so on.")
-    }else{
-      if(!any(cnamesrisk)%in%cnames[hierarchy+1]){
+  if (nrow(risk) > 0) {
+    if (is.null(cnamesrisk)) {
+      message(
+        "risk does not contain column names; the first column in risk will be ",
+        "used for the first hierarchy level, e.g ",
+        cnames[hierarchy[1] + 1], " and so on."
+      )
+    } else {
+      if (!any(cnamesrisk) %in% cnames[hierarchy + 1]) {
         stop("the columnnames of risk do not appear in data")
       }
     }
 
-    if(any(risk<0)||any(!is.numeric(risk))){
+    if (any(risk < 0) || any(!is.numeric(risk))) {
       stop("risk must contain positive real values only!")
     }
   }
@@ -328,80 +337,93 @@ recordSwap.default <- function(data, hid, hierarchy, similar,
   # if(is.character(seed)){
   #   stop("seed must be a single positive integer!")
   # }
-  if(is.null(seed) | any(is.na(seed))){
-    seed <- sample(1e5,1)
+  if (is.null(seed) | any(is.na(seed))) {
+    seed <- sample(1e5, 1)
   }
-  if(!(is.numeric(seed)&&length(seed)==1&&seed%%1==0&&seed>0)){
+  if (!(is.numeric(seed) && length(seed) == 1 && seed %% 1 == 0 && seed > 0)) {
     stop("seed must be a single positive integer!")
   }
-
   ##########################
   # setup data and inputs for c++ function
 
   # order data
-  setkeyv(data,cnames[hid+1])
+  setkeyv(data, cnames[hid + 1])
   # take sub data
-  data[,helpVariableforMergingAfterTRS:=.I]
+  data[, helpVariableforMergingAfterTRS := .I]
   sim_vars <- sort(unique(unlist(similar)))
-  original_cols <- unique(c(hid,hierarchy,risk_variables,sim_vars,carry_along))
-  select_cols <- unique(c(original_cols+1,ncol(data)))
-  data_sw <- copy(data[,.SD,.SDcols=c(select_cols)])
+  original_cols <- unique(c(hid, hierarchy, risk_variables, sim_vars, carry_along))
+  select_cols <- unique(c(original_cols + 1, ncol(data)))
+  data_sw <- copy(data[, .SD, .SDcols = c(select_cols)])
   cnames_sw <- colnames(data_sw) # save column names for later use
   # remove columns from original data except help variable for merging
   drop_cols <- cnames_sw[-length(cnames_sw)]
-  data[,c(drop_cols):=NULL]
+  data[, c(drop_cols) := NULL]
 
   # remap column indices
-  hid <- which(hid %in% original_cols)-1
-  hierarchy <- sapply(hierarchy,function(z){
-    which(original_cols %in% z) -1
+  hid <- which(hid %in% original_cols) - 1
+  hierarchy <- sapply(hierarchy, function(z) {
+    which(original_cols %in% z) - 1
   })
 
-  if(length(similar)>0){
+  if (length(similar) > 0) {
     # remap all similarity variables
-    similar <- lapply(similar,function(z){
-      sapply(z,function(z.s){
-        which(original_cols %in% z) -1
+    similar <- lapply(similar, function(z) {
+      sapply(z, function(z.s) {
+        which(original_cols %in% z) - 1
       })
     })
   }
-  if(length(risk_variables)>0){
-    risk_variables <- sapply(risk_variables,function(z){
-      which(original_cols %in% z) -1
+  if (length(risk_variables) > 0) {
+    risk_variables <- sapply(risk_variables, function(z) {
+      which(original_cols %in% z) - 1
     })
   }
-  if(length(carry_along)>0){
-    carry_along <- sapply(carry_along,function(z){
-      which(original_cols %in% z) -1
+  if (length(carry_along) > 0) {
+    carry_along <- sapply(carry_along, function(z) {
+      which(original_cols %in% z) - 1
     })
   }
 
   # check if any non numeric values are present in data
   if(any(!unlist(apply(data_sw,2,is.numeric)))){
-    stop("Columns specified in hid, hierarchy, similar and carry_along must contain only integer values at this point")
+    stop(
+      "Columns specified in hid, hierarchy, similar ",
+      "and carry_along must contain only integer values at this point"
+    )
   }
 
   # check if any values with NA values are present in data
-  NAOccured <- apply(data_sw,2,function(z){any(is.na(z))})
-  if(any(NAOccured)){
-    stop("data must contain only integer values. \nColumn(s)\n    ",paste( names(which(NAOccured)),collapse=", "),"\ncontain(s) NA values")
+  NAOccured <- apply(data_sw, 2, function(z) {
+    any(is.na(z))
+  })
+  if (any(NAOccured)) {
+    stop(
+      "data must contain only integer values. \nColumn(s)\n    ",
+      paste(names(which(NAOccured)), collapse = ", "),
+      "\ncontain(s) NA values"
+    )
   }
 
   # check if any values with decimal values are present in data
-  decOccured <- apply(data_sw,2,function(z){any((z%%1)!=0)})
-  if(any(decOccured)){
+  decOccured <- apply(data_sw, 2, function(z) {
+    any((z %% 1) != 0)
+  })
+  if (any(decOccured)) {
     decOccured <- names(decOccured)[decOccured]
-    stop("data must contain only integer values.\nColumn(s)\n    ",paste(decOccured,collapse=", "),"\ncontain(s) decimal numbers")
+    stop(
+      "data must contain only integer values.\nColumn(s)\n    ",
+      paste(decOccured, collapse = ", "),
+      "\ncontain(s) decimal numbers"
+    )
   }
 
-
   # transpose data for cpp function
   data_sw <- transpose(data_sw)
 
   # transpose risk
-  if(nrow(risk)>0){
+  if (nrow(risk) > 0) {
     risk <- transpose(risk)
-  }else{
+  } else{
     risk <- numeric(0)
   }
   risk <- numeric(0) # drop this if risk was tested enough
@@ -409,29 +431,39 @@ recordSwap.default <- function(data, hid, hierarchy, similar,
   # take time before starting swapping
   start_time <- Sys.time()
 
-  data_sw <- recordSwap_cpp(data=data_sw, similar_cpp=similar, hierarchy=hierarchy,
-                            risk_variables=risk_variables, hid=hid, k_anonymity=k_anonymity,
-                            swaprate=swaprate,
-                            risk_threshold=risk_threshold, risk=risk,
-                            carry_along = carry_along,
-                            log_file_name = log_file_name,
-                            seed=seed)
+  data_sw <- recordSwap_cpp(
+    data = data_sw,
+    similar_cpp = similar,
+    hierarchy = hierarchy,
+    risk_variables = risk_variables,
+    hid = hid,
+    k_anonymity = k_anonymity,
+    swaprate = swaprate,
+    risk_threshold = risk_threshold,
+    risk = risk,
+    carry_along = carry_along,
+    log_file_name = log_file_name,
+    seed = seed
+  )
 
   # check if swapping was successful
-  if(file.exists(log_file_name) && file.mtime(log_file_name)>start_time){
-    message("Donor household was not found in ",length(readLines(log_file_name))-2," case(s).\nSee ",log_file_name," for a detailed list")
-  }else{
+  if (file.exists(log_file_name) && file.mtime(log_file_name) > start_time) {
+    message(
+      "Donor household was not found in ",
+      length(readLines(log_file_name)) - 2,
+      " case(s).\nSee ", log_file_name, " for a detailed list"
+    )
+  } else{
     message("Recordswapping was successful!\n")
   }
 
   setDT(data_sw)
   data_sw <- transpose(data_sw)
-  setnames(data_sw,colnames(data_sw),cnames_sw)
-  data[data_sw,c(drop_cols):=mget(drop_cols),on=.(helpVariableforMergingAfterTRS)]
+  setnames(data_sw, colnames(data_sw), cnames_sw)
+  data[data_sw, c(drop_cols) := mget(drop_cols), on = .(helpVariableforMergingAfterTRS)]
   rm(data_sw)
-  setcolorder(data,cnames)
-  data[,helpVariableforMergingAfterTRS:=NULL]
-
+  setcolorder(data, cnames)
+  data[, helpVariableforMergingAfterTRS := NULL]
   return(data)
 }
 
diff --git a/man/recordSwap.Rd b/man/recordSwap.Rd
index 385cb816..8fed5312 100644
--- a/man/recordSwap.Rd
+++ b/man/recordSwap.Rd
@@ -77,7 +77,8 @@ rule is applied.}
 besides to hierarchy variables. These variables do not interfere with the
 procedure of finding a record to swap with or calculating risk. This
 parameter is only used at the end of the procedure when swapping the
-hierarchies.}
+hierarchies. However, any variables specified need to be at household
+level which means that only identical values within `hid` are allowed.}
 
 \item{return_swapped_id, }{boolean if `TRUE` the output includes an
 additional column showing the `hid` with which a record was swapped with.
@@ -177,7 +178,7 @@ library(data.table)
 seed <- 2021
 set.seed(seed)
 nhid <- 10000
-dat <- sdcMicro::createDat(nhid)
+dat <- createDat(nhid)
 
 # define paramters for swapping
 k_anonymity <- 1
diff --git a/src/recordSwap/recordSwap.cpp b/src/recordSwap/recordSwap.cpp
index 108d48d7..8d992222 100644
--- a/src/recordSwap/recordSwap.cpp
+++ b/src/recordSwap/recordSwap.cpp
@@ -3,7 +3,7 @@
  * Version: 1.0.1
  */
 
-#include <iostream>     
+#include <iostream>
 #include <algorithm>    // std::count
 #include <vector>       // std::vector
 #include <random>
@@ -17,22 +17,22 @@
 using namespace std;
 
 /*
- * Function to reorder data-set given one index vector 
+ * Function to reorder data-set given one index vector
  */
 std::vector< std::vector<int> > orderData(std::vector< std::vector<int> > &data, int orderIndex){
-  
+
   // initialise ordering vector
   std::vector<int> orderVec(data.size());
   std::iota(orderVec.begin(),orderVec.end(),0);
-  
+
   // order this vector by order of data[orderIndex]
   std::sort(orderVec.begin(),orderVec.end(),
             [&](int a, int b) { return data[a][orderIndex] < data[b][orderIndex]; }
   );
-  
+
   // reorder data without copying it
   for(std::size_t i = 0;i<orderVec.size();i++){
-    // while orderVec[i] is not yet in place 
+    // while orderVec[i] is not yet in place
     // every swap places at least one element in it's proper place
     while(orderVec[i] !=   orderVec[orderVec[i]] ){
       // swap every "row" of data
@@ -43,27 +43,27 @@ std::vector< std::vector<int> > orderData(std::vector< std::vector<int> > &data,
       swap( orderVec[i], orderVec[orderVec[i]] );
     }
   }
-  
+
   return(data);
 }
 
 
 /*
- * Function to define levels 
+ * Function to define levels
  * this function returns the hierarchy level over which a unit/household needs to be swapped
  * 0 meaning the highest hierarchy level, 1 the second highest hierarchy level, and so on....
  */
 std::vector<int> setLevels(std::vector< std::vector<double> > &risk, double risk_threshold) {
-  
+
   // risk: data containing the risk for each hierarchy level and each unit. risk[0] returns the vector of risks for the first unit over all hierarchy levels
   // risk_threshold: double defining the risk threshold beyond which a record/household needs to be swapped. This is understood as risk>risk_threshhold.
-  
+
   // initialise parameters
   int n=risk.size();
   int p=risk[0].size();
   std::vector<int> data_level(n);
   std::fill(data_level.begin(),data_level.end(),p);
-  
+
   for(int i=0;i<n;i++){
     for(int j=0; j<p; j++){
       if(risk[i][j]>risk_threshold){ // risk[i][j]>risk_threshold
@@ -72,23 +72,23 @@ std::vector<int> setLevels(std::vector< std::vector<double> > &risk, double risk
       }
     }
   }
-  
+
   return data_level;
 }
 
 
 /*
- * Function to set the risk for each individual 
+ * Function to set the risk for each individual
  * in each hierarchy level
  * this is then used as sampling probability
  */
 std::vector< std::vector<double> > setRisk(std::vector<std::vector<int> > &data, std::vector<int> &hierarchy, std::vector<int> &risk_variables, int &hid){
-  
+
   // data: data input
   // hierarchy: column indices in data corresponding to geo hierarchy of data read left to right (left highest level - right lowest level)
   // risk_variables: column indices in data corresponding to risk variables which will be considered for estimating counts in the population
   // hid: int correspondig to column index in data which holds the household ID
-  
+
   // initialise parameters
   int n = data.size();
   int nhier = hierarchy.size();
@@ -103,67 +103,67 @@ std::vector< std::vector<double> > setRisk(std::vector<std::vector<int> > &data,
   // prob[1] ~ risk of second record for each hierarchy level
   // and so on ...
   std::vector< std::vector<double> > prob(n,vector<double>(nhier));
-  
+
   //
   std::vector<int> loop_index = risk_variables;
   loop_index.insert(loop_index.end(),hierarchy.begin(),hierarchy.end());
   int loop_n = loop_index.size();
   std::vector<int> groups(loop_n);
-  
+
   // initialise counts for groups in every hierarchy
-  std::map<std::vector<int>,int> group_count; 
+  std::map<std::vector<int>,int> group_count;
   for(int i=0;i<n;i++){
-    
+
     for(int j=0;j<loop_n;j++){
       // ... define group for each hierarchy level
       // risk_variable + hierarchy levels
       groups[j] = data[i][loop_index[j]];
     }
-    
+
     for(int index_hier=0;index_hier<nhier;index_hier++){
       std::vector<int> groups_help(&groups[0],&groups[nrisk+index_hier+1]); // +1 needed here!
       // ... count number for each group using std::map
       group_count[groups_help]++;
     }
   }
-  
+
   // loop over data again and fill with risks
   int i=0;
   int h=0;
   while(i<n){
-    
+
     current_ID = data[i][hid];
     while(i+h<n&&current_ID==data[i+h][hid]){
-      
+
       for(int j=0;j<loop_n;j++){
         // ... define group for each hierarchy level
         // risk_variable + hierarchy levels
         groups[j] = data[i+h][loop_index[j]];
       }
-      
+
       for(int index_hier=0;index_hier<nhier;index_hier++){
         // get each grouping ~ risk_variables + hierarchy level 0-nhier
         std::vector<int> groups_help(&groups[0],&groups[nrisk+index_hier+1]); //+1 needed here
         // select highest risk for hid in each hierarchy level
         risk_value[index_hier] = max(risk_value[index_hier],1.0/group_count[groups_help]);
       }
-      
+
       hsize++;
       h++;
     }
-    
+
     // assign highst risk and revers risk to all household members
     for(int j=0;j<hsize;j++){
       prob[i+j] = risk_value;
     }
-    
+
     // reset parameters
     i = i+hsize;
     hsize=0;
     std::fill(risk_value.begin(),risk_value.end(),0.0);
     h =0;
   }
-  
+
   return prob;
 }
 
@@ -176,7 +176,7 @@ std::vector< std::vector<double> > setRisk(std::vector<std::vector<int> > &data,
  */
 std::vector<int> randSample(std::unordered_set<int> &ID, int N, std::vector<double> &prob, std::mt19937 &mersenne_engine,
                             std::vector<int> &IDused, std::unordered_set<int> &mustSwap){
-  
+
   // initialise parameters
   std::exponential_distribution<double> exp_dist(1.0); // initialise lambda para for exp distribution
   /*
@@ -186,8 +186,8 @@ std::vector<int> randSample(std::unordered_set<int> &ID, int N, std::vector<doub
    * ~ -Exp(1) / prob
    * ~ prob / Exp(1)
    * Here, ~ means "doesn't change order statistics".
-   */  
-  
+   */
+
   // generate random numbers -> prob[i]/exp_dist(mersenne_engine)
   // and store them in priority queue
   // get index of N largest elements in randVal
@@ -195,7 +195,7 @@ std::vector<int> randSample(std::unordered_set<int> &ID, int N, std::vector<doub
   // use priority_queue
   std::priority_queue<std::pair<double, int> > q;
   std::vector<int> sampleID(ID.size());
-  
+
   int z = 0;
   for(auto s : ID){
     if(IDused[s]==0){
@@ -207,11 +207,11 @@ std::vector<int> randSample(std::unordered_set<int> &ID, int N, std::vector<doub
       }
     }
   }
-  // resize sampling vector 
+  // resize sampling vector
   // z values are now in Vector + N are still to come
   N = max(0,min<int>(q.size(),N-z));
   sampleID.resize(N+z);
-  
+
   // build output vector
   if(N>0){
     // select index of top elements from priority_queue
@@ -227,43 +227,43 @@ std::vector<int> randSample(std::unordered_set<int> &ID, int N, std::vector<doub
 // help function to randomly distribute number of units to draw from
 std::map<std::vector<int>,int> distributeRandom(std::map<std::vector<int>,double> &ratioDraws, int &totalDraws,
                                                 std::mt19937 &mersenne_engine){
-  
+
   // ratioDraws map containing ratio of units to draw for each map entry
   // totalDraws integer containing total number of units to draw
   // mersenne_engine random number generator engine
-  
+
   // output
   std::map<std::vector<int>,int> numberDraws;
-  
+
   ///////////////
   // distribute draws at lowest level hierarchy
   double draw_excess_help = 0;
   double x_excess = 0;
   for(auto const&x : ratioDraws){
-    
+
     x_excess = ratioDraws[x.first]*(double)totalDraws;
-    
+
     numberDraws[x.first] = floor(x_excess);
-    
+
     x_excess = x_excess-floor(x_excess);
     draw_excess_help = draw_excess_help + x_excess;
-    
+
   }
-  
+
   int draw_excess = std::round(draw_excess_help);
-  
+
   if(draw_excess==0){
     // return output of nothing need to be distributed
     return numberDraws;
   }
-  
+
   // randomly shuffeld index vector
   // this is similar to randomly round up and down in each group so on average swaprate will be reached
   std::vector<int> add_extra(ratioDraws.size());
   std::iota(add_extra.begin(),add_extra.end(),0);
   std::shuffle(add_extra.begin(),add_extra.end(),mersenne_engine);
   std::sort(add_extra.begin(),add_extra.begin()+draw_excess); // sort first draw_excess elemets in vector
-  
+
   // pick first draw_excess values and add one to them
   int z = 0;
   int v = 0;
@@ -278,7 +278,7 @@ std::map<std::vector<int>,int> distributeRandom(std::map<std::vector<int>,double
     }
     z++;
   }
-  
+
   // return output
   return numberDraws;
 }
@@ -289,12 +289,12 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
                                                 int &nhid, double &swaprate,
                                                 std::uniform_int_distribution<std::mt19937::result_type> &runif01,
                                                 std::mt19937 &mersenne_engine){
-  
+
   // group_hier map which contains all household indices per hierarchy level (only all hierarchy levels are used atm)
   // nhid int containing number of households in total
   // swaprate double containing the swaprate
   // runif01 & mersenne_engine for sampling procedures
-  
+
   ///////////////
   // define total number of swaps according to swaprate
   // swaprate/2 ensures that in the end this percentage of households is swapped
@@ -305,7 +305,7 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
   }else{
     totalDraws = floor(nhid*(swaprate/2));
   }
-  
+
   ///////////////
   // define number of units to swap at lowest level hierarchy
   // loop through all hierarchies
@@ -313,10 +313,10 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
   std::map<std::vector<int>,double > ratioRisk; // get ratio of numbers to draw in lowest level hierarchy
   std::map<std::vector<int>, std::vector<double> > sumRisk; // sum of Risk in each hierarchy level
   int nhier = risk[0].size(); // number of hierarchies
-  
+
   // calcualte sum of risk in each hierarchy level
   for(auto const&x : group_hier){
-    
+
     std::vector<double> sumRisk_help(nhier); // help vector to fill sumRisk
     for(int h=nhier;--h >=0;){
       //std::cout<<"h= "<<h<<"\n";
@@ -324,13 +324,13 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
         sumRisk_help[h] += risk[indexI][h];
       }
     }
-    
+
     sumRisk[x.first] = sumRisk_help;
-    
+
     // get draw ratio ~ percentage of units to draw in each lowest level hierarchy
     ratioRisk[x.first] = sumRisk[x.first].back();
   }
-  
+
   //normalize ratioRisk
   double sum_ratioRisk = 0.0;
   for(auto const&x : ratioRisk){
@@ -339,24 +339,24 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
   for(auto const&x : ratioRisk){
     ratioRisk[x.first] = x.second/sum_ratioRisk;
   }
-  
+
   // calculate number of draws on lowest level hierarchy
   std::map<std::vector<int>,int> numberDraws = distributeRandom(ratioRisk, totalDraws,mersenne_engine);
-  
-  
+
+
   // loop over hierarchy and distribute each entry in numberDraws
   // over the hierarchies vertically
   double helpSum = 0.0;
   std::vector<int> hl;
-  
+
   for(auto const&x : group_hier){
-    
+
     // sum over risks vertically
     helpSum = 0.0;
     for (auto& r : sumRisk[x.first]){
       helpSum += r;
     }
-    
+
     std::map<std::vector<int>,double > ratioHelp;
     hl = x.first;
     hl.push_back(1);
@@ -364,11 +364,11 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
       hl.back() = h+1;
       ratioHelp[hl] = sumRisk[x.first][h]/helpSum;
     }
-    
+
     std::map<std::vector<int>,int> helpDist = distributeRandom(ratioHelp, numberDraws[x.first],mersenne_engine);
-    
+
     for(auto const&x : helpDist){
-      
+
       std::vector<int> help_hier(x.first.begin(),x.first.begin() + x.first.back());
       if(nhier!=x.first.back()){
         // for higher hierarchies
@@ -378,10 +378,10 @@ std::map<std::vector<int>,int> distributeDraws2(std::map<std::vector<int>,std::u
         // for lowest hierarchy overwrite value
         numberDraws[help_hier] = x.second;
       }
-      
+
     }
   }
-  
+
   return numberDraws;
 }
 
@@ -400,11 +400,11 @@ std::map<std::vector<int>,std::pair<int,int>> distributeDraws(std::map<std::vect
   // nhid int containing number of households in total
   // swaprate double containing the swaprate
   // runif01 & mersenne_engine for sampling procedures
-  
-  
+
+
   // swaprate/2 ensures that in the end this percentage of households is swapped
   // so 1 swap is counted double since with each swap 2 households are swapped
-  int total_swaps = 0; 
+  int total_swaps = 0;
   if(runif01(mersenne_engine)==0){
     total_swaps = ceil(nhid*(swaprate/2));
   }else{
@@ -419,15 +419,15 @@ std::map<std::vector<int>,std::pair<int,int>> distributeDraws(std::map<std::vect
   double x_excess = 0;
   for(auto const&x : group_hier){
     draw_group[x.first].first = x.second.size(); // this is needed later on
-    
+
     x_excess = (double)x.second.size()/(double)nhid*(double)total_swaps;
-    
+
     draw_group[x.first].second = floor(x_excess);
-    
+
     x_excess = x_excess-floor(x_excess);
     draw_excess_help = draw_excess_help + x_excess;
   }
-  
+
   int draw_excess = std::round(draw_excess_help);
   // randomly shuffeld index vector
   // this is similar to randomly round up and down in each group so on average swaprate will be reached
@@ -435,7 +435,7 @@ std::map<std::vector<int>,std::pair<int,int>> distributeDraws(std::map<std::vect
   std::iota(add_extra.begin(),add_extra.end(),0);
   std::shuffle(add_extra.begin(),add_extra.end(),mersenne_engine);
   std::sort(add_extra.begin(),add_extra.begin()+draw_excess); // sort first draw_excess elemets in vector
-  
+
   // pick first draw_excess values and add one to them
   int z = 0;
   int v = 0;
@@ -452,7 +452,7 @@ std::map<std::vector<int>,std::pair<int,int>> distributeDraws(std::map<std::vect
     }
     z++;
   }
-  
+
   return draw_group;
 }
 
@@ -464,21 +464,21 @@ std::map<std::vector<int>,std::pair<int,int>> distributeDraws(std::map<std::vect
 std::vector<int> sampleDonor(std::vector< std::vector<int> > &data, std::vector<std::vector<int>> &similar,
                              std::vector<int> &IDswap, std::unordered_set<int> &IDswap_pool,
                              std::map<double,int> &IDdonor_pool, std::vector<int> &IDused, int &hid){
-  
+
   // data: data input data.size() ~ number of records - data.[0].size ~ number of varaibles per record
   // similar: column indices in data corresponding to variables (household/personal) which should be considered when swapping,
-  // e.g. swapping onlys household with same houshoeld size 
+  // e.g. swapping onlys household with same houshoeld size
   // IDswap: vector containing household IDs to be swapped
   // IDswap_pool: unordered set containing sampling pool from which IDswap was drawn
   // IDdono_pool: map containing every possible donor ID (ordered by sampling probability in ascending order)
   // IDused: integer vector which takes on 1 if ID was sampled
-  
+
   // define parameter
   std::vector<int> IDdonor(IDswap.size(),-1); // output initialize with -1
   // if value stays -1 then no donor was found for corresponding value in IDswap
-  bool similar_true=true; 
+  bool similar_true=true;
   int index_donor = 0;
-  
+
   // select donor based on similarity constrains
   // iterate over both unordered sets
   // iterate over IDdonor_pool in reverse order since it is sorted in ascending order by risk
@@ -486,7 +486,7 @@ std::vector<int> sampleDonor(std::vector< std::vector<int> > &data, std::vector<
     // find donor for index_samp
     // iterate over similarity profiles
     for(std::size_t profile=0;profile<similar.size();profile++){
-      
+
       // iterate over complete donor set in reverse order
       for( auto it = IDdonor_pool.end();it!=IDdonor_pool.begin(); ){
         // it->second access the value
@@ -522,8 +522,8 @@ std::vector<int> sampleDonor(std::vector< std::vector<int> > &data, std::vector<
     next_index_samp:
       ;
   }
-  
-  return IDdonor;  
+
+  return IDdonor;
 }
 
 
@@ -532,54 +532,80 @@ std::vector<int> sampleDonor(std::vector< std::vector<int> > &data, std::vector<
  * Function to perform record swapping
  */
 std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data, int hid,
-                                           std::vector<int> hierarchy, 
+                                           std::vector<int> hierarchy,
                                            std::vector< std::vector<int> > similar,
                                            double swaprate,
                                            std::vector< std::vector<double> > risk, double risk_threshold,
-                                           int k_anonymity, std::vector<int> risk_variables,  
+                                           int k_anonymity, std::vector<int> risk_variables,
                                            std::vector<int> carry_along,
                                            int &count_swapped_records,
                                            int &count_swapped_hid,
                                            std::string log_file_name,
                                            int seed = 123456){
-  
-  // data: data input data.size() ~ number of records - data.[0].size ~ number of varaibles per record
-  // hid: int correspondig to column index in data which holds the household ID
+
+  // data: input data; data.size() ~ number of records, data[0].size() ~ number of variables per record
+  // hid: int corresponding to column index in data which holds the household ID
   // hierarchy: column indices in data corresponding to geo hierarchy of data read left to right (left highest level - right lowest level)
   // similar: column indices in data corresponding to variables (household/personal) which should be considered when swapping,
-  // e.g. swapping onlys household with same houshoeld size 
+  // e.g. swapping only households with same household size
   // swaprate: double defining the ratio of households to be swapped
   // risk: double vector of vectors containing the risk for each individual in each record - risk_record[0] risk for first record an each hierarchy level
-  // risk_threshold: double cutoff for defining highrisk households. if risk>risk_threshold then household is high risk is will definitely be swapped
+  // risk_threshold: double cutoff for defining high-risk households. If risk>risk_threshold then the household is high risk and will definitely be swapped
   // k_anonymity: int defining a threshold, each group with counts lower than the threshold will automatically be swapped.
   // risk_variables: column indices in data corresponding to risk variables which will be considered for estimating counts in the population
-  // carry_along: swap additional variables in addition to hierarchy variable. These variables do not interfere with the procedure of 
+  // carry_along: swap additional variables in addition to hierarchy variable. These variables do not interfere with the procedure of
   // finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies.
   // count_swapped_records, count_swapped_hid: count number of households and records swapped.
   // seed: integer seed for random number generator.
   // log_file_name: name of file to save HIDs of non-swapped households.
-  
-  // initialise parameters
-  int n = data.size(); // number of obesrvations
+
+  // initialize parameters
+  int n = data.size(); // number of observations
   int nhier = hierarchy.size(); // number of hierarchy levels
   std::unordered_set<int> IDnotUsed;
   // needed for running random number generator and
   // set random seed according to input parameter
   std::mt19937 mersenne_engine;
   mersenne_engine.seed(seed);
-  
-  // initialise random number generator for exponential distribution
+
+  // initialize random number generator for exponential distribution
   std::exponential_distribution<double> exp_dist(1.0);
   // initialize random number generator for uniform distribution
   std::uniform_int_distribution<std::mt19937::result_type> runif01(0,1);
-  
+
   ////////////////////////////////////////////////////
-  // order data by hid 
+  // order data by hid
   // not needed at the moment -> order data outside function
   // orderData(data,hid);
   ////////////////////////////////////////////////////
-  
-  
+
+
+  // we order by hid and check that carry_along variables
+  // are at household level
+  int nr_carry_along = carry_along.size();
+  int nr_records = data.size();
+  if (nr_carry_along > 0) {
+    orderData(data, hid);
+    int cur_hid, cur_val, chk_val;
+    for (int j = 0; j < nr_carry_along; j++) {
+      int cur_carryalong = carry_along[j];
+      int prev_hid = -1;
+      for (int i = 0; i < nr_records; i++) {
+        cur_hid = data[i][hid];
+        cur_val = data[i][cur_carryalong];
+        bool new_hh = cur_hid != prev_hid;
+        if (new_hh) {
+          chk_val = cur_val;
+        } else {
+          if (chk_val != cur_val) {
+            throw std::runtime_error("carry_along-variables must have identical values within (household) ids");
+          }
+        }
+        prev_hid = cur_hid;
+      }
+    }
+  }
+
   ////////////////////////////////////////////////////
   // define risk data if not supplied by user
   // using risk_variables and 1/counts
@@ -590,8 +616,8 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
     prob = risk;
   }
   ////////////////////////////////////////////////////
-  
-  
+
+
   ////////////////////////////////////////////////////
   // define minimum swap level for each household
   if(risk_threshold==0){
@@ -602,9 +628,9 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
     }
   }
   std::vector<int> levels = setLevels(prob,risk_threshold);
-  
-  ////////////////////////////////////////////////////  
-  
+
+  ////////////////////////////////////////////////////
+
   ////////////////////////////////////////////////////
   // get household size for each household ID
   // initialise map for household size
@@ -615,14 +641,14 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
     map_hsize[data[i][hid]]++;
   }
   ////////////////////////////////////////////////////
-  
-  
+
+
   ////////////////////////////////////////////////////
   // apply swapping algorithm
   // go from highest to lowest level
   // swapp at each higher level the number of households that have to be swapped at that level according to "k_anonymity" (see setLevels())
   // at lowest level swap remaining number of households (according to swap) if not enough households have been swapped
-  // every household can only be swapped once 
+  // every household can only be swapped once
   std::map<std::vector<int>,std::unordered_set<int> > group_hier; //
   std::map<std::vector<int>,int> countSwaps; // count swaps already done in each hierarchy
   std::unordered_map<int,std::unordered_set<int> > group_levels; // map containing all IDs which must be swapped at a certain level (~key of map)
@@ -633,56 +659,56 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
   std::map<int,std::map<double,int>> samp_order_donor;
   int z=0; // counter used for while() ect...
   int nhid = 0;
-  
+
   /////////////////////////////
   // create map containing subgroups according to hierarchy
   // and IDs of each subgroup
   // use hhsize for this to speed things up
   while(z<n){
-    
+
     // ... define hierarchy group
     for(int j=0;j<nhier;j++){
       hier_help[j] = data[z][hierarchy[j]];
     }
-    
+
     // supply new household index to each group
     // use only indices to speed up construction of output data
     group_hier[hier_help].insert(z);
-    
+
     // create map for levels
     if(levels[z]<nhier){
       group_levels[levels[z]].insert(z);
     }
-    
+
     // create set of IDs for quick lookup
     IDdonor_all.insert(z);
-    
+
     // create map for random numbers (ordered)
     // makes sampling in each iteration obsolete
     // look up in these maps instead
     for(int j=0;j<nhier;j++){
       samp_order_donor[j][prob[z][j]/exp_dist(mersenne_engine)] = z;
     }
-    
+
     // count number of households
     nhid++;
     // skip all other household member, only need first one
     z += map_hsize[data[z][hid]];
-    
+
   }
   /////////////////////////////
-  
+
   /////////////////////////////
   // get number of households to be swapped at the lowest level hierarchy
   // this is only used at lowest hierarchy level
   // draw_group[].first -> number of households in lowest level hierarchy
   // draw_group[].second -> number of swaps in lowest level hierarchy
-  std::map<std::vector<int>,std::pair<int,int>> draw_group =  distributeDraws(group_hier, nhid, swaprate, 
+  std::map<std::vector<int>,std::pair<int,int>> draw_group =  distributeDraws(group_hier, nhid, swaprate,
                                                                               runif01, mersenne_engine);
   //std::map<std::vector<int>,int> draw_group =  distributeDraws2(group_hier, risk, nhid, swaprate,
   //                                                              runif01, mersenne_engine);
   /////////////////////////////
-  
+
   /////////
   // this is needed only for the lowest hierarchy
   // will be changed for the final version
@@ -690,32 +716,32 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
   for(int i=0;i<n;i++){
     prob_help[i] = prob[i][nhier-1];
   }
-  
+
   /////////////////////////////
   // Procedure for swapping starts here:
-  // loop over hierarchies 
+  // loop over hierarchies
   // start at highest hierarchy
   for(int h=0;h<nhier;h++){
     // int h=3;
     // values of map element that must be swapped at current stage
     // if no elements need to be swapped than skip this step
-    
+
     std::unordered_set<int> mustSwap;
-    
+
     if(group_levels.find(h)!=group_levels.end()){
       mustSwap = group_levels[h];
     }
-    
+
     std::map<std::vector<int>,unordered_set<int> > group_hier_help;
     hier_help.resize(h+1);
-    
+
     /////////////////
     // get combined map for hierarchy h
     for(auto const&x : group_hier){
-      
+
       // get higher hierarchy
       std::copy(x.first.begin(),x.first.begin()+h+1,hier_help.begin());
-      
+
       // discard every index that has already been used
       // more efficient to do this at this step then later on in the code
       for(auto s : x.second){
@@ -729,8 +755,8 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
       }
     }
     /////////////////
-    
-    
+
+
     /////////////////
     // int sampSize=0;
     // int countUsed=0;
@@ -738,13 +764,13 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
     /////////////////
     // loop over levels of hierarchy
     for(auto &x : group_hier_help){
-      
+
       // std::vector<int> xfirst = group_hier_help.begin()->first;
       // std::vector<int> xsecond = group_hier_help.begin()->second;
       // get values that need to be swapped at this hierarchy level and which are
       // in this hierarchy stage
       std::vector<int> IDswap(x.second.size());
-      
+
       if(h<(nhier-1)){
         // in all but the last hierarchy level do the follwing:
         if(mustSwap.size()>0){
@@ -766,48 +792,48 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
         // if at lowest level get number of households that need to be swapped
         // according to swap and check if this number was already reached
         // by previous swappings
-        
+
         // not enough households have been swapped
         // when checking at lowest level
         // Number of IDs that need to be swapped - already swapped IDs - IDs that have to be swapped at lowest level:
         countRest = draw_group[x.first].second - countSwaps[x.first];
         countRest = std::max(0,countRest);
-        
+
         std::unordered_set<int> IDswap_draw = x.second;
-        
+
         // apply sampling here -> should still be quick because IDswap_draw will not be extremely large
         // in randSample households that must be swapped are automatically choosen
         std::vector<int> IDswap_help = randSample(IDswap_draw,countRest,prob_help,mersenne_engine,IDused,mustSwap);
         IDswap.resize(IDswap_help.size());
-        IDswap = IDswap_help; 
+        IDswap = IDswap_help;
       }
-      
-      
+
+
       // if any IDs need to be swapped:
       if(IDswap.size()>0){
-        
+
         // get donor set
         // if IDdonor is -1 at a position ==> no donor for IDswap at same position
         std::vector<int> IDdonor = sampleDonor(data, similar, IDswap, x.second,
                                                samp_order_donor[h], IDused, hid);
-        
+
         // set Index to used
         for(std::size_t i=0;i<IDdonor.size();i++){
           if(IDdonor[i]>-1){
             IDused[IDdonor[i]]=1;
             IDused[IDswap[i]]=1;
-            // store results from sampling in swappedIndex 
+            // store results from sampling in swappedIndex
             swappedIndex[IDswap[i]] = IDdonor[i];
           }else{
             IDnotUsed.insert(IDswap[i]);
           }
         }
-        
+
       }
       /////////////////
     }
   }
-  
+
   ////////////////////////////////////////////////////
   // Create output using swappedIndex
   carry_along.insert( carry_along.end(), hierarchy.begin(), hierarchy.end() );
@@ -818,15 +844,15 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
   for(auto const&x : swappedIndex){
     hsize = map_hsize[data[x.first][hid]];
     hsizewith = map_hsize[data[x.second][hid]];
-    
+
     count_swapped_records = count_swapped_records + hsize + hsizewith; // count how many records are swapped
-    
+
     // erase elements if they have been used during the procedure
     // donor was not found on highest hierarchy
     // but donor was found on lowest...this might actually be a bug...
     // IDnotUsed.erase(x.first);
     // IDnotUsed.erase(x.second);
-    
+
     // loop over variables to swapp
     for(int j=0;j<nvalues;j++){
       swap_value = data[x.first][carry_along[j]];
@@ -843,10 +869,10 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
       }
     }
   }
-  
-  // save number of swaped hids 
+
+  // save number of swapped hids
   count_swapped_hid = swappedIndex.size()*2;
-  
+
   if(IDnotUsed.size()>0){
 //    cout<<"Recordswapping was successful!"<<endl;
 //  }else{
@@ -859,7 +885,7 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
     }
     fclose(pFile);
   }
-  
+
   return data;
-  
-}
\ No newline at end of file
+
+}
diff --git a/vignettes/recordSwapping.Rmd b/vignettes/recordSwapping.Rmd
index 430a7151..064df7e0 100644
--- a/vignettes/recordSwapping.Rmd
+++ b/vignettes/recordSwapping.Rmd
@@ -53,7 +53,7 @@ recordSwap(std::vector< std::vector<int> > data, int hid,
 Should be ignored, for now, it is not fully tested yet.
 + **k_anonymity** integer defining the threshold of high risk households (k-anonymity). A record is not at risk if `k_anonymity > counts`.
 + **risk_variables** column indices of variables in \code{data} which will be considered for estimating the risk. This is only used if `risk` was not supplied.
-+ **carry_along** column indices of variables in \code{data} which are additionally swapped. These variables do not interfere with the procedure of finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies.
++ **carry_along** column indices of variables in \code{data} which are additionally swapped. These variables do not interfere with the procedure of finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies. However, these variables need to be at household level (identical values within each **hid**).
 + **count_swapped_records**, **count_swapped_hid** count number of households and records swapped
 + **log_file_name** path for writing a log file. The log file contains a list of household IDs (`hid`) that could not have been swapped and is only created if any such households exist.
 + **seed** integer defining the seed for the random number generator, for reproducibility.

From 911d622963f5c6c89985a3d9bcb7657cc705c18d Mon Sep 17 00:00:00 2001
From: Bernhard Meindl <bernhard.meindl@statistik.gv.at>
Date: Thu, 21 Jul 2022 07:48:08 +0200
Subject: [PATCH 2/3] add test for record-swapping: carry_along variables must
 be at household-level

---
 tests/testthat/test_recordSwap_inputs.R | 66 ++++++++++++++-----------
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/tests/testthat/test_recordSwap_inputs.R b/tests/testthat/test_recordSwap_inputs.R
index 68ac9c18..35df32e3 100644
--- a/tests/testthat/test_recordSwap_inputs.R
+++ b/tests/testthat/test_recordSwap_inputs.R
@@ -19,7 +19,7 @@ hid <- "hid"
 
 # test input parameter
 test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_along",{
-  
+
   #################################
   # data
   dat_wrong <- 1:10
@@ -31,7 +31,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           return_swapped_id = TRUE,
                           seed=seed),
                "data must be either a data.table, data.frame")
-  
+
   expect_error(recordSwap(data = as.data.frame(dat), hid = hid, hierarchy = hier,
                           similar = similar, swaprate = swaprate,
                           k_anonymity = k_anonymity,
@@ -40,9 +40,9 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           return_swapped_id = TRUE,
                           seed=seed),
                NA)
-  
+
   #################################
-  
+
   #################################
   # hid
   expect_error(recordSwap(data = dat, hid = -1, hierarchy = hier,
@@ -52,7 +52,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column indices cannot be negative")
-  
+
   expect_error(recordSwap(data = dat, hid = "hhids", hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -60,7 +60,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column name\\(s\\) in hid are not found in data")
-  
+
   expect_error(recordSwap(data = dat, hid = c("HHID","hhids"), hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -69,7 +69,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             return_swapped_id = TRUE,
                             seed=seed),"hid must be an integer \\(column index\\) or character \\(column name\\) of length 1")
   #################################
-  
+
   #################################
   # hierarchy
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = c(10:100),
@@ -79,7 +79,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column index in hierarchy exceeds number of columns in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = c("GEM","BDL","GKZ"),
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -87,7 +87,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column name\\(s\\) in hierarchy are not found in data")
-  
+
   dat[,h_extra:=runif(.N)]
   dat[,h_extra2:=sample(LETTERS,.N,replace=TRUE)]
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = c("h_extra","h_extra2"),
@@ -99,7 +99,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
              seed=seed),"Columns specified in hid, hierarchy, similar and carry\\_along must contain only integer values at this point")
   dat[,c("h_extra","h_extra2"):=NULL]
   #################################
-  
+
   #################################
   # similarity
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -109,7 +109,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"Column indices cannot be negative")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = c("hsize","hstatus"), swaprate = swaprate,
                           k_anonymity = k_anonymity,
@@ -117,7 +117,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"Column name\\(s\\) in X\\[\\[i\\]\\] are not found in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = list(c("hsize","hstatus"),"hsize"), swaprate = swaprate,
                           k_anonymity = k_anonymity,
@@ -125,7 +125,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"Column name\\(s\\) in X\\[\\[i\\]\\] are not found in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = list(c("hsize","htype"),"hsize"), swaprate = swaprate,
                           k_anonymity = k_anonymity,
@@ -133,9 +133,9 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),NA)
-  
+
   #################################
-  
+
   #################################
   # risk_variables
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -145,7 +145,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column index in risk\\_variables exceeds number of columns in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -153,7 +153,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = NULL,
                             return_swapped_id = TRUE,
                             seed=seed),"Column name\\(s\\) in risk\\_variables are not found in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -164,7 +164,15 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
   #################################
 
   #################################
-  # carry_along
+  # carry_along must be at household-level
+  expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
+                          similar = similar, swaprate = swaprate,
+                          k_anonymity = k_anonymity,
+                          risk_variables = risk_variables,
+                          carry_along = "ageGroup",
+                          return_swapped_id = TRUE,
+                          seed=seed),"carry\\_along-variables must have identical values within \\(household\\) ids")
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -172,7 +180,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = -1,
                             return_swapped_id = TRUE,
                             seed=seed),"Column indices cannot be negative")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -180,7 +188,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = c("carry_along"),
                             return_swapped_id = TRUE,
                             seed=seed),"Column name\\(s\\) in carry_along are not found in data")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                             similar = similar, swaprate = swaprate,
                             k_anonymity = k_anonymity,
@@ -188,12 +196,12 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon
                             carry_along = c("lau2"),
                             return_swapped_id = TRUE,
                             seed=seed),NA)
-  
+
 })
 
 
 test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
-  
+
   #################################
   # swaprate
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -203,7 +211,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"swaprate must be a single number between 0 and 1!")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = similar, swaprate = c(0.1,0.5),
                           k_anonymity = k_anonymity,
@@ -211,7 +219,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"swaprate must be a single number between 0 and 1!")
-  
+
   #################################
   # k_anonymity
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -221,7 +229,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"k_anonymity must be a positiv single integer!")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = similar, swaprate = swaprate,
                           k_anonymity = c(1,6),
@@ -229,7 +237,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=seed),"k_anonymity must be a positiv single integer!")
-  
+
   ##################################
   # return_swapped_id
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -239,7 +247,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = c(TRUE,TRUE),
                           seed=seed),"return\\_swapped_id must be logical of length 1")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = similar, swaprate = swaprate,
                           k_anonymity = k_anonymity,
@@ -247,7 +255,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = "HID",
                           seed=seed),"return\\_swapped_id must be logical of length 1")
-  
+
   ##################################
   # seed
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
@@ -257,7 +265,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{
                           carry_along = NULL,
                           return_swapped_id = TRUE,
                           seed=1.5),"seed must be a single positive integer!")
-  
+
   expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier,
                           similar = similar, swaprate = swaprate,
                           k_anonymity = k_anonymity,

From 99215b2690b843c81a6980e8e6fea777f9e24428 Mon Sep 17 00:00:00 2001
From: Bernhard Meindl <bernhard.meindl@statistik.gv.at>
Date: Thu, 21 Jul 2022 12:08:17 +0200
Subject: [PATCH 3/3] bump algorithm-version to 1.0.2 after adding checks for
 carry_along variables

---
 src/recordSwap/recordSwap.cpp |  2 +-
 src/recordSwap/recordSwap.h   | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/recordSwap/recordSwap.cpp b/src/recordSwap/recordSwap.cpp
index 8d992222..c5bbca0f 100644
--- a/src/recordSwap/recordSwap.cpp
+++ b/src/recordSwap/recordSwap.cpp
@@ -1,6 +1,6 @@
 /*
  * Algorithm for targeted record swapping
- * Version: 1.0.1
+ * Version: 1.0.2
  */
 
 #include <iostream>
diff --git a/src/recordSwap/recordSwap.h b/src/recordSwap/recordSwap.h
index a53e63fa..9343c6f5 100644
--- a/src/recordSwap/recordSwap.h
+++ b/src/recordSwap/recordSwap.h
@@ -1,27 +1,27 @@
 /*
  * Open Source Software to apply Statistical Disclosure Control techniques
- * 
- * This program is free software; you can redistribute it and/or 
- * modify it under the terms of the European Union Public Licence 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the European Union Public Licence
  * (EUPL) version 1.1, as published by the European Commission.
- * 
+ *
  * You can find the text of the EUPL v1.1 on
  * https://joinup.ec.europa.eu/software/page/eupl/licence-eupl
- * 
- * This software is distributed on an "AS IS" basis without 
+ *
+ * This software is distributed on an "AS IS" basis without
  * warranties or conditions of any kind, either express or implied.
  */
 
 /*
- * Version: 1.0.1
+ * Version: 1.0.2
  */
 
-/* 
+/*
  * Header file for shared library recordSwap.dll
  * with source code recordSwap.cpp
  * to perform Targeted Record Swapping
  */
-#include <iostream>     
+#include <iostream>
 #include <algorithm>    // std::count
 #include <vector>       // std::vector
 #include <random>
@@ -41,11 +41,11 @@
  * Function to perform record swapping
  */
 std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data, int hid,
-                                           std::vector<int> hierarchy, 
+                                           std::vector<int> hierarchy,
                                            std::vector< std::vector<int> > similar,
                                            double swaprate,
                                            std::vector< std::vector<double> > risk, double risk_threshold,
-                                           int k_anonymity, std::vector<int> risk_variables,  
+                                           int k_anonymity, std::vector<int> risk_variables,
                                            std::vector<int> carry_along,
                                            int &count_swapped_records,
                                            int &count_swapped_hid,
@@ -55,17 +55,17 @@ std::vector< std::vector<int> > recordSwap(std::vector< std::vector<int> > data,
 
 //private:
 /*
- * Function to reorder data-set given one index vector 
+ * Function to reorder data-set given one index vector
  */
 std::vector< std::vector<int> > orderData(std::vector< std::vector<int> > &data, int orderIndex);
 
 /*
- * Function to define levels 
+ * Function to define levels
  */
 std::vector<int> setLevels(std::vector< std::vector<double> > &risk, double risk_threshold);
 
 /*
- * Function to set sampling probability 
+ * Function to set sampling probability
  * and reverse sampling probability (for donor sets)
  */
 std::vector< std::vector<double> > setRisk(std::vector<std::vector<int> > &data, std::vector<int> &hierarchy, std::vector<int> &risk_variables, int &hid);
@@ -85,7 +85,7 @@ std::vector<int> sampleDonor(std::vector< std::vector<int> > &data, std::vector<
                              std::vector<int> &IDswap, std::unordered_set<int> &IDswap_pool,
                              std::map<double,int> &IDdonor_pool, std::vector<int> &IDused, int &hid);
 
-/* 
+/*
  * help function to randomly distribute number of units to draw from
  */
 std::map<std::vector<int>,int> distributeRandom(std::map<std::vector<int>,double> &ratioDraws, int &totalDraws,