From 92ae47514b9b71929c2548e0d6a7fc7afbb6caac Mon Sep 17 00:00:00 2001 From: Bernhard Meindl Date: Wed, 20 Jul 2022 09:58:09 +0200 Subject: [PATCH 1/3] check that carry_along variables are at household-level --- DESCRIPTION | 4 +- NEWS | 2 + R/createDat.R | 43 +++-- R/recordSwap.R | 178 +++++++++++-------- man/recordSwap.Rd | 5 +- src/recordSwap/recordSwap.cpp | 326 ++++++++++++++++++---------------- vignettes/recordSwapping.Rmd | 2 +- 7 files changed, 317 insertions(+), 243 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3ce646ff..0f35d7d8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: sdcMicro Type: Package Title: Statistical Disclosure Control Methods for Anonymization of Data and Risk Estimation -Version: 5.7.1 -Date: 2022-07-05 +Version: 5.7.1.99 +Date: 2022-07-20 Authors@R: c( person("Matthias", "Templ", email="matthias.templ@gmail.com", role = c("aut", "cre"), comment=c(ORCID="0000-0002-8638-5276")), person("Bernhard", "Meindl", email = "Bernhard.Meindl@statistik.gv.at", role = c("aut")), diff --git a/NEWS b/NEWS index 2794a614..8c550562 100644 --- a/NEWS +++ b/NEWS @@ -3,6 +3,8 @@ - Fix warnings with clang-devel related targeted record swapping - Fix a note in vignette title for TRS - Remove travis and use Github workflows to check the package +- small fix for `createDat()` removing a possible warning +- only allow household-level variables for argument `carry_along` for TRS # 5.7.1 - Bugfix in `extractManipData()` with only a single categorical variable, thx @tamertemizer for reporting diff --git a/R/createDat.R b/R/createDat.R index 2e27de1d..95b2e1fc 100644 --- a/R/createDat.R +++ b/R/createDat.R @@ -12,7 +12,7 @@ #' @return `data.table` containing dummy data #' @rdname recordSwap #' @export -createDat <- function(N = 10000) { +createDat <- function(N=10000) { stopifnot(is.numeric(N)) stopifnot(N > 1) N <- ceiling(N) @@ -23,19 +23,32 @@ createDat <- function(N = 10000) { hsize <- sample(1:6, N, replace = TRUE) htype <- sample(1:10, N, replace = TRUE) hincome <- sample(1:10, N, replace = TRUE) - dat <- data.table( - nuts1 = rep(nuts1, times = hsize), - nuts2 = rep(nuts2, times = hsize), - nuts3 = rep(nuts3, times = hsize), - lau2 = rep(lau2, times = hsize), - hid = rep(1:length(hsize), times = hsize), - hsize = rep(hsize, times = hsize), - ageGroup = sample(1:7, length(hsize), replace = TRUE), - gender = sample(c(1, 2), length(hsize), replace = TRUE), - national = sample(1:5, length(hsize), replace = TRUE), - htype = rep(htype, times = hsize), - hincome = rep(hincome, times = hsize) - ) + + # replicate + hid <- rep(1:length(hsize), times = hsize) + nuts1 <- rep(nuts1, times = hsize) + nuts2 <- rep(nuts2, times = hsize) + nuts3 <- rep(nuts3, times = hsize) + lau2 <- rep(lau2, times = hsize) + htype <- rep(htype, times = hsize) + hincome <- rep(hincome, times = hsize) + hsize <- rep(hsize, times = hsize) + gender <- sample(c(1, 2), length(hsize), replace = TRUE) + ageGroup <- sample(1:7, length(hsize), replace = TRUE) + national <- sample(1:5, length(hsize), replace = TRUE) + + # create data.table + dat <- data.table(nuts1, + nuts2, + nuts3, + lau2, + hid, + hsize, + ageGroup, + gender, + national, + htype, + hincome) # hierarchy for regional variables help_0 <- c("", "0", "00", "000") @@ -43,5 +56,5 @@ createDat <- function(N = 10000) { dat[, nuts3 := paste0(nuts2, help_0[3 - nchar(nuts3)], nuts3)] dat[, lau2 := paste0(nuts3, help_0[5 - nchar(nuts3)], lau2)] dat[, colnames(dat) := lapply(.SD, as.integer)] - return(dat) + return(dat[]) } diff --git a/R/recordSwap.R b/R/recordSwap.R index 901ea9ec..c9cba6f6 100644 --- a/R/recordSwap.R +++ b/R/recordSwap.R @@ -101,7 +101,8 @@ #' besides to hierarchy variables. These variables do not interfere with the #' procedure of finding a record to swap with or calculating risk. This #' parameter is only used at the end of the procedure when swapping the -#' hierarchies. +#' hierarchies. However, any variables specified need to be at household +#' level which means that only identical values within `hid` are allowed. #' @param return_swapped_id, boolean if `TRUE` the output includes an #' additional column showing the `hid` with which a record was swapped with. #' The new column will have the name `paste0(hid,"_swapped")`. @@ -121,7 +122,7 @@ #' seed <- 2021 #' set.seed(seed) #' nhid <- 10000 -#' dat <- sdcMicro::createDat(nhid) +#' dat <- createDat(nhid) #' #' # define paramters for swapping #' k_anonymity <- 1 @@ -256,19 +257,20 @@ recordSwap.default <- function(data, hid, hierarchy, similar, stop("return_swapped_id must be logical of length 1") } - if(return_swapped_id==TRUE){ - orig_id <- cnames[hid+1] - swapped_id <- paste0(orig_id,"_swapped") - data[,c(swapped_id):=get(orig_id)] + if (return_swapped_id == TRUE) { + orig_id <- cnames[hid + 1] + swapped_id <- paste0(orig_id, "_swapped") + data[, c(swapped_id) := get(orig_id)] cnames <- copy(colnames(data)) - - swapped_id <- checkIndexString(swapped_id,cnames, - matchLength = 1) - carry_along <- c(carry_along,swapped_id) + swapped_id <- checkIndexString(swapped_id, cnames, matchLength = 1) + carry_along <- c(carry_along, swapped_id) } # check k_anonymity - if(!all((!is.null(risk_variables))&checkInteger(k_anonymity)&length(k_anonymity)==1&k_anonymity>=0)){ + if (!all((!is.null(risk_variables)) & + checkInteger(k_anonymity) & + length(k_anonymity) == 1 & + k_anonymity >= 0)) { stop("k_anonymity must be a positiv single integer!") } @@ -285,19 +287,22 @@ recordSwap.default <- function(data, hid, hierarchy, similar, } # check risk - if(is.null(risk)){ + if (is.null(risk)) { risk <- data.table() risk_threshold <- 0 } - if(is.vector(risk)){ - if(length(risk)!=length(hierarchy)){ + if (is.vector(risk)) { + if (length(risk) != length(hierarchy)) { stop("risk and hierarchy need to address the same number of columns!") } - risk <- checkIndexString(risk,cnames,minLength = 1) - risk <- data[,c(risk+1)] - }else{ - if(all(!class(risk)%in%c("data.table","data.frame","matrix"))){ - stop("If risk is not a vector containing column indices or column names in data then risk must be either a data.table, data.frame or matrix!") + risk <- checkIndexString(risk, cnames, minLength = 1) + risk <- data[, c(risk + 1)] + } else { + if (all(!class(risk) %in% c("data.table", "data.frame", "matrix"))) { + stop( + "If risk is not a vector containing column indices or column names", + "in data then risk must be either a data.table, data.frame or matrix!" + ) } } @@ -310,16 +315,20 @@ recordSwap.default <- function(data, hid, hierarchy, similar, cnamesrisk <- copy(colnames(risk)) risk <- data.table(risk) - if(nrow(risk)>0){ - if(is.null(cnamesrisk)){ - message("risk does not contain column names; the first column in risk will be used for the first hierarchy level, e.g ",cnames[hierarchy[1]+1]," and so on.") - }else{ - if(!any(cnamesrisk)%in%cnames[hierarchy+1]){ + if (nrow(risk) > 0) { + if (is.null(cnamesrisk)) { + message( + "risk does not contain column names; the first column in risk will be ", + "used for the first hierarchy level, e.g ", + cnames[hierarchy[1] + 1], " and so on." + ) + } else { + if (!any(cnamesrisk) %in% cnames[hierarchy + 1]) { stop("the columnnames of risk do not appear in data") } } - if(any(risk<0)||any(!is.numeric(risk))){ + if (any(risk < 0) || any(!is.numeric(risk))) { stop("risk must contain positive real values only!") } } @@ -328,80 +337,93 @@ recordSwap.default <- function(data, hid, hierarchy, similar, # if(is.character(seed)){ # stop("seed must be a single positive integer!") # } - if(is.null(seed) | any(is.na(seed))){ - seed <- sample(1e5,1) + if (is.null(seed) | any(is.na(seed))) { + seed <- sample(1e5, 1) } - if(!(is.numeric(seed)&&length(seed)==1&&seed%%1==0&&seed>0)){ + if (!(is.numeric(seed) && length(seed) == 1 && seed %% 1 == 0 && seed > 0)) { stop("seed must be a single positive integer!") } - ########################## # setup data and inputs for c++ function # order data - setkeyv(data,cnames[hid+1]) + setkeyv(data, cnames[hid + 1]) # take sub data - data[,helpVariableforMergingAfterTRS:=.I] + data[, helpVariableforMergingAfterTRS := .I] sim_vars <- sort(unique(unlist(similar))) - original_cols <- unique(c(hid,hierarchy,risk_variables,sim_vars,carry_along)) - select_cols <- unique(c(original_cols+1,ncol(data))) - data_sw <- copy(data[,.SD,.SDcols=c(select_cols)]) + original_cols <- unique(c(hid, hierarchy, risk_variables, sim_vars, carry_along)) + select_cols <- unique(c(original_cols + 1, ncol(data))) + data_sw <- copy(data[, .SD, .SDcols = c(select_cols)]) cnames_sw <- colnames(data_sw) # save column names for later use # remove columns from original data except help variable for merging drop_cols <- cnames_sw[-length(cnames_sw)] - data[,c(drop_cols):=NULL] + data[, c(drop_cols) := NULL] # remap column indices - hid <- which(hid %in% original_cols)-1 - hierarchy <- sapply(hierarchy,function(z){ - which(original_cols %in% z) -1 + hid <- which(hid %in% original_cols) - 1 + hierarchy <- sapply(hierarchy, function(z) { + which(original_cols %in% z) - 1 }) - if(length(similar)>0){ + if (length(similar) > 0) { # remap all similarity variables - similar <- lapply(similar,function(z){ - sapply(z,function(z.s){ - which(original_cols %in% z) -1 + similar <- lapply(similar, function(z) { + sapply(z, function(z.s) { + which(original_cols %in% z) - 1 }) }) } - if(length(risk_variables)>0){ - risk_variables <- sapply(risk_variables,function(z){ - which(original_cols %in% z) -1 + if (length(risk_variables) > 0) { + risk_variables <- sapply(risk_variables, function(z) { + which(original_cols %in% z) - 1 }) } - if(length(carry_along)>0){ - carry_along <- sapply(carry_along,function(z){ - which(original_cols %in% z) -1 + if (length(carry_along) > 0) { + carry_along <- sapply(carry_along, function(z) { + which(original_cols %in% z) - 1 }) } # check if any non numeric values are present in data if(any(!unlist(apply(data_sw,2,is.numeric)))){ - stop("Columns specified in hid, hierarchy, similar and carry_along must contain only integer values at this point") + stop( + "Columns specified in hid, hierarchy, similar ", + "and carry_along must contain only integer values at this point" + ) } # check if any values with NA values are present in data - NAOccured <- apply(data_sw,2,function(z){any(is.na(z))}) - if(any(NAOccured)){ - stop("data must contain only integer values. \nColumn(s)\n ",paste( names(which(NAOccured)),collapse=", "),"\ncontain(s) NA values") + NAOccured <- apply(data_sw, 2, function(z) { + any(is.na(z)) + }) + if (any(NAOccured)) { + stop( + "data must contain only integer values. \nColumn(s)\n ", + paste(names(which(NAOccured)), collapse = ", "), + "\ncontain(s) NA values" + ) } # check if any values with decimal values are present in data - decOccured <- apply(data_sw,2,function(z){any((z%%1)!=0)}) - if(any(decOccured)){ + decOccured <- apply(data_sw, 2, function(z) { + any((z %% 1) != 0) + }) + if (any(decOccured)) { decOccured <- names(decOccured)[decOccured] - stop("data must contain only integer values.\nColumn(s)\n ",paste(decOccured,collapse=", "),"\ncontain(s) decimal numbers") + stop( + "data must contain only integer values.\nColumn(s)\n ", + paste(decOccured, collapse = ", "), + "\ncontain(s) decimal numbers" + ) } - # transpose data for cpp function data_sw <- transpose(data_sw) # transpose risk - if(nrow(risk)>0){ + if (nrow(risk) > 0) { risk <- transpose(risk) - }else{ + } else{ risk <- numeric(0) } risk <- numeric(0) # drop this if risk was tested enough @@ -409,29 +431,39 @@ recordSwap.default <- function(data, hid, hierarchy, similar, # take time before starting swapping start_time <- Sys.time() - data_sw <- recordSwap_cpp(data=data_sw, similar_cpp=similar, hierarchy=hierarchy, - risk_variables=risk_variables, hid=hid, k_anonymity=k_anonymity, - swaprate=swaprate, - risk_threshold=risk_threshold, risk=risk, - carry_along = carry_along, - log_file_name = log_file_name, - seed=seed) + data_sw <- recordSwap_cpp( + data = data_sw, + similar_cpp = similar, + hierarchy = hierarchy, + risk_variables = risk_variables, + hid = hid, + k_anonymity = k_anonymity, + swaprate = swaprate, + risk_threshold = risk_threshold, + risk = risk, + carry_along = carry_along, + log_file_name = log_file_name, + seed = seed + ) # check if swapping was successful - if(file.exists(log_file_name) && file.mtime(log_file_name)>start_time){ - message("Donor household was not found in ",length(readLines(log_file_name))-2," case(s).\nSee ",log_file_name," for a detailed list") - }else{ + if (file.exists(log_file_name) && file.mtime(log_file_name) > start_time) { + message( + "Donor household was not found in ", + length(readLines(log_file_name)) - 2, + " case(s).\nSee ", log_file_name, " for a detailed list" + ) + } else{ message("Recordswapping was successful!\n") } setDT(data_sw) data_sw <- transpose(data_sw) - setnames(data_sw,colnames(data_sw),cnames_sw) - data[data_sw,c(drop_cols):=mget(drop_cols),on=.(helpVariableforMergingAfterTRS)] + setnames(data_sw, colnames(data_sw), cnames_sw) + data[data_sw, c(drop_cols) := mget(drop_cols), on = .(helpVariableforMergingAfterTRS)] rm(data_sw) - setcolorder(data,cnames) - data[,helpVariableforMergingAfterTRS:=NULL] - + setcolorder(data, cnames) + data[, helpVariableforMergingAfterTRS := NULL] return(data) } diff --git a/man/recordSwap.Rd b/man/recordSwap.Rd index 385cb816..8fed5312 100644 --- a/man/recordSwap.Rd +++ b/man/recordSwap.Rd @@ -77,7 +77,8 @@ rule is applied.} besides to hierarchy variables. These variables do not interfere with the procedure of finding a record to swap with or calculating risk. This parameter is only used at the end of the procedure when swapping the -hierarchies.} +hierarchies. However, any variables specified need to be at household +level which means that only identical values within `hid` are allowed.} \item{return_swapped_id, }{boolean if `TRUE` the output includes an additional column showing the `hid` with which a record was swapped with. @@ -177,7 +178,7 @@ library(data.table) seed <- 2021 set.seed(seed) nhid <- 10000 -dat <- sdcMicro::createDat(nhid) +dat <- createDat(nhid) # define paramters for swapping k_anonymity <- 1 diff --git a/src/recordSwap/recordSwap.cpp b/src/recordSwap/recordSwap.cpp index 108d48d7..8d992222 100644 --- a/src/recordSwap/recordSwap.cpp +++ b/src/recordSwap/recordSwap.cpp @@ -3,7 +3,7 @@ * Version: 1.0.1 */ -#include +#include #include // std::count #include // std::vector #include @@ -17,22 +17,22 @@ using namespace std; /* - * Function to reorder data-set given one index vector + * Function to reorder data-set given one index vector */ std::vector< std::vector > orderData(std::vector< std::vector > &data, int orderIndex){ - + // initialise ordering vector std::vector orderVec(data.size()); std::iota(orderVec.begin(),orderVec.end(),0); - + // order this vector by order of data[orderIndex] std::sort(orderVec.begin(),orderVec.end(), [&](int a, int b) { return data[a][orderIndex] < data[b][orderIndex]; } ); - + // reorder data without copying it for(std::size_t i = 0;i > orderData(std::vector< std::vector > &data, swap( orderVec[i], orderVec[orderVec[i]] ); } } - + return(data); } /* - * Function to define levels + * Function to define levels * this function returns the hierarchy level over which a unit/household needs to be swapped * 0 meaning the highest hierarchy level, 1 the second highest hierarchy level, and so on.... */ std::vector setLevels(std::vector< std::vector > &risk, double risk_threshold) { - + // risk: data containing the risk for each hierarchy level and each unit. risk[0] returns the vector of risks for the first unit over all hierarchy levels // risk_threshold: double defining the risk threshold beyond which a record/household needs to be swapped. This is understood as risk>risk_threshhold. - + // initialise parameters int n=risk.size(); int p=risk[0].size(); std::vector data_level(n); std::fill(data_level.begin(),data_level.end(),p); - + for(int i=0;irisk_threshold){ // risk[i][j]>risk_threshold @@ -72,23 +72,23 @@ std::vector setLevels(std::vector< std::vector > &risk, double risk } } } - + return data_level; } /* - * Function to set the risk for each individual + * Function to set the risk for each individual * in each hierarchy level * this is then used as sampling probability */ std::vector< std::vector > setRisk(std::vector > &data, std::vector &hierarchy, std::vector &risk_variables, int &hid){ - + // data: data input // hierarchy: column indices in data corresponding to geo hierarchy of data read left to right (left highest level - right lowest level) // risk_variables: column indices in data corresponding to risk variables which will be considered for estimating counts in the population // hid: int correspondig to column index in data which holds the household ID - + // initialise parameters int n = data.size(); int nhier = hierarchy.size(); @@ -103,67 +103,67 @@ std::vector< std::vector > setRisk(std::vector > &data, // prob[1] ~ risk of second record for each hierarchy level // and so on ... std::vector< std::vector > prob(n,vector(nhier)); - + // std::vector loop_index = risk_variables; loop_index.insert(loop_index.end(),hierarchy.begin(),hierarchy.end()); int loop_n = loop_index.size(); std::vector groups(loop_n); - + // initialise counts for groups in every hierarchy - std::map,int> group_count; + std::map,int> group_count; for(int i=0;i groups_help(&groups[0],&groups[nrisk+index_hier+1]); // +1 needed here! // ... count number for each group using std::map group_count[groups_help]++; } } - + // loop over data again and fill with risks int i=0; int h=0; while(i groups_help(&groups[0],&groups[nrisk+index_hier+1]); //+1 needed here // select highest risk for hid in each hierarchy level risk_value[index_hier] = max(risk_value[index_hier],1.0/group_count[groups_help]); } - + hsize++; h++; } - + // assign highst risk and revers risk to all household members for(int j=0;j > setRisk(std::vector > &data, */ std::vector randSample(std::unordered_set &ID, int N, std::vector &prob, std::mt19937 &mersenne_engine, std::vector &IDused, std::unordered_set &mustSwap){ - + // initialise parameters std::exponential_distribution exp_dist(1.0); // initialise lambda para for exp distribution /* @@ -186,8 +186,8 @@ std::vector randSample(std::unordered_set &ID, int N, std::vector prob[i]/exp_dist(mersenne_engine) // and store them in priority queue // get index of N largest elements in randVal @@ -195,7 +195,7 @@ std::vector randSample(std::unordered_set &ID, int N, std::vector > q; std::vector sampleID(ID.size()); - + int z = 0; for(auto s : ID){ if(IDused[s]==0){ @@ -207,11 +207,11 @@ std::vector randSample(std::unordered_set &ID, int N, std::vector(q.size(),N-z)); sampleID.resize(N+z); - + // build output vector if(N>0){ // select index of top elements from priority_queue @@ -227,43 +227,43 @@ std::vector randSample(std::unordered_set &ID, int N, std::vector,int> distributeRandom(std::map,double> &ratioDraws, int &totalDraws, std::mt19937 &mersenne_engine){ - + // ratioDraws map containing ratio of units to draw for each map entry // totalDraws integer containing total number of units to draw // mersenne_engine random number generator engine - + // output std::map,int> numberDraws; - + /////////////// // distribute draws at lowest level hierarchy double draw_excess_help = 0; double x_excess = 0; for(auto const&x : ratioDraws){ - + x_excess = ratioDraws[x.first]*(double)totalDraws; - + numberDraws[x.first] = floor(x_excess); - + x_excess = x_excess-floor(x_excess); draw_excess_help = draw_excess_help + x_excess; - + } - + int draw_excess = std::round(draw_excess_help); - + if(draw_excess==0){ // return output of nothing need to be distributed return numberDraws; } - + // randomly shuffeld index vector // this is similar to randomly round up and down in each group so on average swaprate will be reached std::vector add_extra(ratioDraws.size()); std::iota(add_extra.begin(),add_extra.end(),0); std::shuffle(add_extra.begin(),add_extra.end(),mersenne_engine); std::sort(add_extra.begin(),add_extra.begin()+draw_excess); // sort first draw_excess elemets in vector - + // pick first draw_excess values and add one to them int z = 0; int v = 0; @@ -278,7 +278,7 @@ std::map,int> distributeRandom(std::map,double } z++; } - + // return output return numberDraws; } @@ -289,12 +289,12 @@ std::map,int> distributeDraws2(std::map,std::u int &nhid, double &swaprate, std::uniform_int_distribution &runif01, std::mt19937 &mersenne_engine){ - + // group_hier map which contains all household indices per hierarchy level (only all hierarchy levels are used atm) // nhid int containing number of households in total // swaprate double containing the swaprate // runif01 & mersenne_engine for sampling procedures - + /////////////// // define total number of swaps according to swaprate // swaprate/2 ensures that in the end this percentage of households is swapped @@ -305,7 +305,7 @@ std::map,int> distributeDraws2(std::map,std::u }else{ totalDraws = floor(nhid*(swaprate/2)); } - + /////////////// // define number of units to swap at lowest level hierarchy // loop through all hierarchies @@ -313,10 +313,10 @@ std::map,int> distributeDraws2(std::map,std::u std::map,double > ratioRisk; // get ratio of numbers to draw in lowest level hierarchy std::map, std::vector > sumRisk; // sum of Risk in each hierarchy level int nhier = risk[0].size(); // number of hierarchies - + // calcualte sum of risk in each hierarchy level for(auto const&x : group_hier){ - + std::vector sumRisk_help(nhier); // help vector to fill sumRisk for(int h=nhier;--h >=0;){ //std::cout<<"h= "<,int> distributeDraws2(std::map,std::u sumRisk_help[h] += risk[indexI][h]; } } - + sumRisk[x.first] = sumRisk_help; - + // get draw ratio ~ percentage of units to draw in each lowest level hierarchy ratioRisk[x.first] = sumRisk[x.first].back(); } - + //normalize ratioRisk double sum_ratioRisk = 0.0; for(auto const&x : ratioRisk){ @@ -339,24 +339,24 @@ std::map,int> distributeDraws2(std::map,std::u for(auto const&x : ratioRisk){ ratioRisk[x.first] = x.second/sum_ratioRisk; } - + // calculate number of draws on lowest level hierarchy std::map,int> numberDraws = distributeRandom(ratioRisk, totalDraws,mersenne_engine); - - + + // loop over hierarchy and distribute each entry in numberDraws // over the hierarchies vertically double helpSum = 0.0; std::vector hl; - + for(auto const&x : group_hier){ - + // sum over risks vertically helpSum = 0.0; for (auto& r : sumRisk[x.first]){ helpSum += r; } - + std::map,double > ratioHelp; hl = x.first; hl.push_back(1); @@ -364,11 +364,11 @@ std::map,int> distributeDraws2(std::map,std::u hl.back() = h+1; ratioHelp[hl] = sumRisk[x.first][h]/helpSum; } - + std::map,int> helpDist = distributeRandom(ratioHelp, numberDraws[x.first],mersenne_engine); - + for(auto const&x : helpDist){ - + std::vector help_hier(x.first.begin(),x.first.begin() + x.first.back()); if(nhier!=x.first.back()){ // for higher hierarchies @@ -378,10 +378,10 @@ std::map,int> distributeDraws2(std::map,std::u // for lowest hierarchy overwrite value numberDraws[help_hier] = x.second; } - + } } - + return numberDraws; } @@ -400,11 +400,11 @@ std::map,std::pair> distributeDraws(std::map,std::pair> distributeDraws(std::map,std::pair> distributeDraws(std::map,std::pair> distributeDraws(std::map,std::pair> distributeDraws(std::map sampleDonor(std::vector< std::vector > &data, std::vector> &similar, std::vector &IDswap, std::unordered_set &IDswap_pool, std::map &IDdonor_pool, std::vector &IDused, int &hid){ - + // data: data input data.size() ~ number of records - data.[0].size ~ number of varaibles per record // similar: column indices in data corresponding to variables (household/personal) which should be considered when swapping, - // e.g. swapping onlys household with same houshoeld size + // e.g. swapping onlys household with same houshoeld size // IDswap: vector containing household IDs to be swapped // IDswap_pool: unordered set containing sampling pool from which IDswap was drawn // IDdono_pool: map containing every possible donor ID (ordered by sampling probability in ascending order) // IDused: integer vector which takes on 1 if ID was sampled - + // define parameter std::vector IDdonor(IDswap.size(),-1); // output initialize with -1 // if value stays -1 then no donor was found for corresponding value in IDswap - bool similar_true=true; + bool similar_true=true; int index_donor = 0; - + // select donor based on similarity constrains // iterate over both unordered sets // iterate over IDdonor_pool in reverse order since it is sorted in ascending order by risk @@ -486,7 +486,7 @@ std::vector sampleDonor(std::vector< std::vector > &data, std::vector< // find donor for index_samp // iterate over similarity profiles for(std::size_t profile=0;profilesecond access the value @@ -522,8 +522,8 @@ std::vector sampleDonor(std::vector< std::vector > &data, std::vector< next_index_samp: ; } - - return IDdonor; + + return IDdonor; } @@ -532,54 +532,80 @@ std::vector sampleDonor(std::vector< std::vector > &data, std::vector< * Function to perform record swapping */ std::vector< std::vector > recordSwap(std::vector< std::vector > data, int hid, - std::vector hierarchy, + std::vector hierarchy, std::vector< std::vector > similar, double swaprate, std::vector< std::vector > risk, double risk_threshold, - int k_anonymity, std::vector risk_variables, + int k_anonymity, std::vector risk_variables, std::vector carry_along, int &count_swapped_records, int &count_swapped_hid, std::string log_file_name, int seed = 123456){ - - // data: data input data.size() ~ number of records - data.[0].size ~ number of varaibles per record - // hid: int correspondig to column index in data which holds the household ID + + // data: data input data.size() ~ number of records - data.[0].size ~ number of variables per record + // hid: int corresponding to column index in data which holds the household ID // hierarchy: column indices in data corresponding to geo hierarchy of data read left to right (left highest level - right lowest level) // similar: column indices in data corresponding to variables (household/personal) which should be considered when swapping, - // e.g. swapping onlys household with same houshoeld size + // e.g. swapping only households with same household size // swaprate: double defining the ratio of households to be swapped // risk: double vector of vectors containing the risk for each individual in each record - risk_record[0] risk for first record an each hierarchy level - // risk_threshold: double cutoff for defining highrisk households. if risk>risk_threshold then household is high risk is will definitely be swapped + // risk_threshold: double cutoff for defining high-risk households. if risk>risk_threshold then household is high risk is will definitely be swapped // k_anonymity: int defining a threshold, each group with counts lower than the threshold will automatically be swapped. // risk_variables: column indices in data corresponding to risk variables which will be considered for estimating counts in the population - // carry_along: swap additional variables in addition to hierarchy variable. These variables do not interfere with the procedure of + // carry_along: swap additional variables in addition to hierarchy variable. These variables do not interfere with the procedure of // finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies. // count_swapped_records, count_swapped_hid: count number of households and records swapped. // seed: integer seed for random number generator. // log_file_name: name of file to save HIDs of non-swapped households. - - // initialise parameters - int n = data.size(); // number of obesrvations + + // initialize parameters + int n = data.size(); // number of observations int nhier = hierarchy.size(); // number of hierarchy levels std::unordered_set IDnotUsed; // needed for running random number generator and // set random seed according to input parameter std::mt19937 mersenne_engine; mersenne_engine.seed(seed); - - // initialise random number generator for exponential distribution + + // initialize random number generator for exponential distribution std::exponential_distribution exp_dist(1.0); // initialize random number generator for uniform distribution std::uniform_int_distribution runif01(0,1); - + //////////////////////////////////////////////////// - // order data by hid + // order data by hid // not needed at the moment -> order data outside function // orderData(data,hid); //////////////////////////////////////////////////// - - + + + // we order by hid and check that carry_along variables + // are at household level + int nr_carry_along = carry_along.size(); + int nr_records = data.size(); + if (nr_carry_along > 0) { + orderData(data, hid); + int cur_hid, cur_val, chk_val; + for (int j = 0; j < nr_carry_along; j++) { + int cur_carryalong = carry_along[j]; + int prev_hid = -1; + for (int i = 0; i < nr_records; i++) { + cur_hid = data[i][hid]; + cur_val = data[i][cur_carryalong]; + bool new_hh = cur_hid != prev_hid; + if (new_hh) { + chk_val = cur_val; + } else { + if (chk_val != cur_val) { + throw std::runtime_error("carry_along-variables must have identical values within (household) ids"); + } + } + prev_hid = cur_hid; + } + } + } + //////////////////////////////////////////////////// // define risk data if not supplied by user // using risk_variables and 1/counts @@ -590,8 +616,8 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, prob = risk; } //////////////////////////////////////////////////// - - + + //////////////////////////////////////////////////// // define minimum swap level for each household if(risk_threshold==0){ @@ -602,9 +628,9 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, } } std::vector levels = setLevels(prob,risk_threshold); - - //////////////////////////////////////////////////// - + + //////////////////////////////////////////////////// + //////////////////////////////////////////////////// // get household size for each household ID // initialise map for household size @@ -615,14 +641,14 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, map_hsize[data[i][hid]]++; } //////////////////////////////////////////////////// - - + + //////////////////////////////////////////////////// // apply swapping algorithm // go from highest to lowest level // swapp at each higher level the number of households that have to be swapped at that level according to "k_anonymity" (see setLevels()) // at lowest level swap remaining number of households (according to swap) if not enough households have been swapped - // every household can only be swapped once + // every household can only be swapped once std::map,std::unordered_set > group_hier; // std::map,int> countSwaps; // count swaps already done in each hierarchy std::unordered_map > group_levels; // map containing all IDs which must be swapped at a certain level (~key of map) @@ -633,56 +659,56 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, std::map> samp_order_donor; int z=0; // counter used for while() ect... int nhid = 0; - + ///////////////////////////// // create map containing subgroups according to hierarchy // and IDs of each subgroup // use hhsize for this to speed things up while(z number of households in lowest level hierarchy // draw_group[].second -> number of swaps in lowest level hierarchy - std::map,std::pair> draw_group = distributeDraws(group_hier, nhid, swaprate, + std::map,std::pair> draw_group = distributeDraws(group_hier, nhid, swaprate, runif01, mersenne_engine); //std::map,int> draw_group = distributeDraws2(group_hier, risk, nhid, swaprate, // runif01, mersenne_engine); ///////////////////////////// - + ///////// // this is needed only for the lowest hierarchy // will be changed for the final version @@ -690,32 +716,32 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, for(int i=0;i mustSwap; - + if(group_levels.find(h)!=group_levels.end()){ mustSwap = group_levels[h]; } - + std::map,unordered_set > group_hier_help; hier_help.resize(h+1); - + ///////////////// // get combined map for hierarchy h for(auto const&x : group_hier){ - + // get higher hierarchy std::copy(x.first.begin(),x.first.begin()+h+1,hier_help.begin()); - + // discard every index that has already been used // more efficient to do this at this step then later on in the code for(auto s : x.second){ @@ -729,8 +755,8 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, } } ///////////////// - - + + ///////////////// // int sampSize=0; // int countUsed=0; @@ -738,13 +764,13 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, ///////////////// // loop over levels of hierarchy for(auto &x : group_hier_help){ - + // std::vector xfirst = group_hier_help.begin()->first; // std::vector xsecond = group_hier_help.begin()->second; // get values that need to be swapped at this hierarchy level and which are // in this hierarchy stage std::vector IDswap(x.second.size()); - + if(h<(nhier-1)){ // in all but the last hierarchy level do the follwing: if(mustSwap.size()>0){ @@ -766,48 +792,48 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, // if at lowest level get number of households that need to be swapped // according to swap and check if this number was already reached // by previous swappings - + // not enough households have been swapped // when checking at lowest level // Number of IDs that need to be swapped - already swapped IDs - IDs that have to be swapped at lowest level: countRest = draw_group[x.first].second - countSwaps[x.first]; countRest = std::max(0,countRest); - + std::unordered_set IDswap_draw = x.second; - + // apply sampling here -> should still be quick because IDswap_draw will not be extremely large // in randSample households that must be swapped are automatically choosen std::vector IDswap_help = randSample(IDswap_draw,countRest,prob_help,mersenne_engine,IDused,mustSwap); IDswap.resize(IDswap_help.size()); - IDswap = IDswap_help; + IDswap = IDswap_help; } - - + + // if any IDs need to be swapped: if(IDswap.size()>0){ - + // get donor set // if IDdonor is -1 at a position ==> no donor for IDswap at same position std::vector IDdonor = sampleDonor(data, similar, IDswap, x.second, samp_order_donor[h], IDused, hid); - + // set Index to used for(std::size_t i=0;i-1){ IDused[IDdonor[i]]=1; IDused[IDswap[i]]=1; - // store results from sampling in swappedIndex + // store results from sampling in swappedIndex swappedIndex[IDswap[i]] = IDdonor[i]; }else{ IDnotUsed.insert(IDswap[i]); } } - + } ///////////////// } } - + //////////////////////////////////////////////////// // Create output using swappedIndex carry_along.insert( carry_along.end(), hierarchy.begin(), hierarchy.end() ); @@ -818,15 +844,15 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, for(auto const&x : swappedIndex){ hsize = map_hsize[data[x.first][hid]]; hsizewith = map_hsize[data[x.second][hid]]; - + count_swapped_records = count_swapped_records + hsize + hsizewith; // count how many records are swapped - + // erase elements if they have been used during the procedure // donor was not found on highest hierarchy // but donor was found on lowest...this might actually be a bug... // IDnotUsed.erase(x.first); // IDnotUsed.erase(x.second); - + // loop over variables to swapp for(int j=0;j > recordSwap(std::vector< std::vector > data, } } } - - // save number of swaped hids + + // save number of swaped hids count_swapped_hid = swappedIndex.size()*2; - + if(IDnotUsed.size()>0){ // cout<<"Recordswapping was successful!"< > recordSwap(std::vector< std::vector > data, } fclose(pFile); } - + return data; - -} \ No newline at end of file + +} diff --git a/vignettes/recordSwapping.Rmd b/vignettes/recordSwapping.Rmd index 430a7151..064df7e0 100644 --- a/vignettes/recordSwapping.Rmd +++ b/vignettes/recordSwapping.Rmd @@ -53,7 +53,7 @@ recordSwap(std::vector< std::vector > data, int hid, Should be ignored, for now, it is not fully tested yet. + **k_anonymity** integer defining the threshold of high risk households (k-anonymity). A record is not at risk if `k_anonymity > counts`. + **risk_variables** column indices of variables in \code{data} which will be considered for estimating the risk. This is only used if `risk` was not supplied. -+ **carry_along** column indices of variables in \code{data} which are additionally swapped. These variables do not interfere with the procedure of finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies. ++ **carry_along** column indices of variables in \code{data} which are additionally swapped. These variables do not interfere with the procedure of finding a record to swap with. This parameter is only used at the end of the procedure when swapping the hierarchies. However, the variables need to be at household-level (identical value within **hid**) + **count_swapped_records**, **count_swapped_hid** count number of households and records swapped + **log_file_name** path for writing a log file. The log file contains a list of household IDs (`hid`) that could not have been swapped and is only created if any such households exist. + **seed** integer defining the seed for the random number generator, for reproducibility. From 911d622963f5c6c89985a3d9bcb7657cc705c18d Mon Sep 17 00:00:00 2001 From: Bernhard Meindl Date: Thu, 21 Jul 2022 07:48:08 +0200 Subject: [PATCH 2/3] add test for record-swapping: carry_along variables must be at household-level --- tests/testthat/test_recordSwap_inputs.R | 66 ++++++++++++++----------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/tests/testthat/test_recordSwap_inputs.R b/tests/testthat/test_recordSwap_inputs.R index 68ac9c18..35df32e3 100644 --- a/tests/testthat/test_recordSwap_inputs.R +++ b/tests/testthat/test_recordSwap_inputs.R @@ -19,7 +19,7 @@ hid <- "hid" # test input parameter test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_along",{ - + ################################# # data dat_wrong <- 1:10 @@ -31,7 +31,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon return_swapped_id = TRUE, seed=seed), "data must be either a data.table, data.frame") - + expect_error(recordSwap(data = as.data.frame(dat), hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -40,9 +40,9 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon return_swapped_id = TRUE, seed=seed), NA) - + ################################# - + ################################# # hid expect_error(recordSwap(data = dat, hid = -1, hierarchy = hier, @@ -52,7 +52,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column indices cannot be negative") - + expect_error(recordSwap(data = dat, hid = "hhids", hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -60,7 +60,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in hid are not found in data") - + expect_error(recordSwap(data = dat, hid = c("HHID","hhids"), hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -69,7 +69,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon return_swapped_id = TRUE, seed=seed),"hid must be an integer \\(column index\\) or character \\(column name\\) of length 1") ################################# - + ################################# # hierarchy expect_error(recordSwap(data = dat, hid = hid, hierarchy = c(10:100), @@ -79,7 +79,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column index in hierarchy exceeds number of columns in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = c("GEM","BDL","GKZ"), similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -87,7 +87,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in hierarchy are not found in data") - + dat[,h_extra:=runif(.N)] dat[,h_extra2:=sample(LETTERS,.N,replace=TRUE)] expect_error(recordSwap(data = dat, hid = hid, hierarchy = c("h_extra","h_extra2"), @@ -99,7 +99,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon seed=seed),"Columns specified in hid, hierarchy, similar and carry\\_along must contain only integer values at this point") dat[,c("h_extra","h_extra2"):=NULL] ################################# - + ################################# # similarity expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -109,7 +109,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column indices cannot be negative") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = c("hsize","hstatus"), swaprate = swaprate, k_anonymity = k_anonymity, @@ -117,7 +117,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in X\\[\\[i\\]\\] are not found in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = list(c("hsize","hstatus"),"hsize"), swaprate = swaprate, k_anonymity = k_anonymity, @@ -125,7 +125,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in X\\[\\[i\\]\\] are not found in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = list(c("hsize","htype"),"hsize"), swaprate = swaprate, k_anonymity = k_anonymity, @@ -133,9 +133,9 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),NA) - + ################################# - + ################################# # risk_variables expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -145,7 +145,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column index in risk\\_variables exceeds number of columns in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -153,7 +153,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = NULL, return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in risk\\_variables are not found in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -164,7 +164,15 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon ################################# ################################# - # carry_along + # carry_along must be at household-level + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, + similar = similar, swaprate = swaprate, + k_anonymity = k_anonymity, + risk_variables = risk_variables, + carry_along = "ageGroup", + return_swapped_id = TRUE, + seed=seed),"carry\\_along-variables must have identical values within \\(household\\) ids") + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -172,7 +180,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = -1, return_swapped_id = TRUE, seed=seed),"Column indices cannot be negative") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -180,7 +188,7 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = c("carry_along"), return_swapped_id = TRUE, seed=seed),"Column name\\(s\\) in carry_along are not found in data") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -188,12 +196,12 @@ test_that("test para - data, hid, hierarchy, similar, risk_variables, carry_alon carry_along = c("lau2"), return_swapped_id = TRUE, seed=seed),NA) - + }) test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ - + ################################# # swaprate expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -203,7 +211,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = TRUE, seed=seed),"swaprate must be a single number between 0 and 1!") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = c(0.1,0.5), k_anonymity = k_anonymity, @@ -211,7 +219,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = TRUE, seed=seed),"swaprate must be a single number between 0 and 1!") - + ################################# # k_anonymity expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -221,7 +229,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = TRUE, seed=seed),"k_anonymity must be a positiv single integer!") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = c(1,6), @@ -229,7 +237,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = TRUE, seed=seed),"k_anonymity must be a positiv single integer!") - + ################################## # return_swapped_id expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -239,7 +247,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = c(TRUE,TRUE), seed=seed),"return\\_swapped_id must be logical of length 1") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, @@ -247,7 +255,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = "HID", seed=seed),"return\\_swapped_id must be logical of length 1") - + ################################## # seed expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, @@ -257,7 +265,7 @@ test_that("test para - swaprate, k_anonymity, return_swapped_id, seed",{ carry_along = NULL, return_swapped_id = TRUE, seed=1.5),"seed must be a single positive integer!") - + expect_error(recordSwap(data = dat, hid = hid, hierarchy = hier, similar = similar, swaprate = swaprate, k_anonymity = k_anonymity, From 99215b2690b843c81a6980e8e6fea777f9e24428 Mon Sep 17 00:00:00 2001 From: Bernhard Meindl Date: Thu, 21 Jul 2022 12:08:17 +0200 Subject: [PATCH 3/3] bump algorithm-version to 1.0.2 after adding checks for carry_along variables --- src/recordSwap/recordSwap.cpp | 2 +- src/recordSwap/recordSwap.h | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/recordSwap/recordSwap.cpp b/src/recordSwap/recordSwap.cpp index 8d992222..c5bbca0f 100644 --- a/src/recordSwap/recordSwap.cpp +++ b/src/recordSwap/recordSwap.cpp @@ -1,6 +1,6 @@ /* * Algorithm for targeted record swapping - * Version: 1.0.1 + * Version: 1.0.2 */ #include diff --git a/src/recordSwap/recordSwap.h b/src/recordSwap/recordSwap.h index a53e63fa..9343c6f5 100644 --- a/src/recordSwap/recordSwap.h +++ b/src/recordSwap/recordSwap.h @@ -1,27 +1,27 @@ /* * Open Source Software to apply Statistical Disclosure Control techniques - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the European Union Public Licence + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the European Union Public Licence * (EUPL) version 1.1, as published by the European Commission. - * + * * You can find the text of the EUPL v1.1 on * https://joinup.ec.europa.eu/software/page/eupl/licence-eupl - * - * This software is distributed on an "AS IS" basis without + * + * This software is distributed on an "AS IS" basis without * warranties or conditions of any kind, either express or implied. */ /* - * Version: 1.0.1 + * Version: 1.0.2 */ -/* +/* * Header file for shared library recordSwap.dll * with source code recordSwap.cpp * to perform Targeted Record Swapping */ -#include +#include #include // std::count #include // std::vector #include @@ -41,11 +41,11 @@ * Function to perform record swapping */ std::vector< std::vector > recordSwap(std::vector< std::vector > data, int hid, - std::vector hierarchy, + std::vector hierarchy, std::vector< std::vector > similar, double swaprate, std::vector< std::vector > risk, double risk_threshold, - int k_anonymity, std::vector risk_variables, + int k_anonymity, std::vector risk_variables, std::vector carry_along, int &count_swapped_records, int &count_swapped_hid, @@ -55,17 +55,17 @@ std::vector< std::vector > recordSwap(std::vector< std::vector > data, //private: /* - * Function to reorder data-set given one index vector + * Function to reorder data-set given one index vector */ std::vector< std::vector > orderData(std::vector< std::vector > &data, int orderIndex); /* - * Function to define levels + * Function to define levels */ std::vector setLevels(std::vector< std::vector > &risk, double risk_threshold); /* - * Function to set sampling probability + * Function to set sampling probability * and reverse sampling probability (for donor sets) */ std::vector< std::vector > setRisk(std::vector > &data, std::vector &hierarchy, std::vector &risk_variables, int &hid); @@ -85,7 +85,7 @@ std::vector sampleDonor(std::vector< std::vector > &data, std::vector< std::vector &IDswap, std::unordered_set &IDswap_pool, std::map &IDdonor_pool, std::vector &IDused, int &hid); -/* +/* * help function to randomly distribute number of units to draw from */ std::map,int> distributeRandom(std::map,double> &ratioDraws, int &totalDraws,