diff --git a/cpp/bench/sg/fil.cu b/cpp/bench/sg/fil.cu
index 7c9f0b6f7d..4444dd95ee 100644
--- a/cpp/bench/sg/fil.cu
+++ b/cpp/bench/sg/fil.cu
@@ -78,7 +78,7 @@ class FIL : public RegressionFixture<float> {
     ML::RandomForestRegressorF rf_model;
     auto* mPtr         = &rf_model;
     size_t train_nrows = std::min(params.nrows, 1000);
-    fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf);
+    fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf, nullptr);
     handle->sync_stream(stream);
 
     ML::build_treelite_forest(&model, &rf_model, params.ncols);
@@ -166,8 +166,7 @@ std::vector<Params> getInputs()
                  8, /* n_streams */
                  128 /* max_batch_size */,
                  0, /* minTreesPerGroupFold */
-                 0, /* foldGroupSize */
-                 -1 /* group_col_idx */
+                 0 /* foldGroupSize */
  );
 
  using ML::fil::algo_t;
diff --git a/cpp/bench/sg/filex.cu b/cpp/bench/sg/filex.cu
index ffc0d2b157..b64b35b7af 100644
--- a/cpp/bench/sg/filex.cu
+++ b/cpp/bench/sg/filex.cu
@@ -61,7 +61,7 @@ class FILEX : public RegressionFixture<float> {
     ML::RandomForestRegressorF rf_model;
     auto* mPtr       = &rf_model;
     auto train_nrows = std::min(params.nrows, 1000);
-    fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf);
+    fit(*handle, mPtr, data.X.data(), train_nrows, params.ncols, data.y.data(), p_rest.rf, nullptr);
     handle->sync_stream(stream);
 
     ML::build_treelite_forest(&model, &rf_model, params.ncols);
@@ -266,8 +266,7 @@ std::vector<Params> getInputs()
                  8,   /* n_streams */
                  128, /* max_batch_size */
                  0,   /* minTreesPerGroupFold */
-                 0,   /* foldGroupSize */
-                 -1   /* group_col_idx */
+                 0    /* foldGroupSize */
  );
 
  using ML::fil::algo_t;
diff --git a/cpp/bench/sg/rf_classifier.cu b/cpp/bench/sg/rf_classifier.cu
index f0936ee31e..8d466b4bb7 100644
--- a/cpp/bench/sg/rf_classifier.cu
+++ b/cpp/bench/sg/rf_classifier.cu
@@ -66,7 +66,8 @@ class RFClassifier : public BlobsFixture<D> {
             this->params.ncols,
             this->data.y.data(),
             this->params.nclasses,
-            rfParams);
+            rfParams,
+            nullptr /* groups */);
         this->handle->sync_stream(this->stream);
       });
   }
@@ -110,8 +111,7 @@ std::vector<Params> getInputs()
                  8,   /* n_streams */
                  128, /* max_batch_size */
                  0,   /* minTreesPerGroupFold */
-                 0,   /* foldGroupSize */
-                 -1   /* group_col_idx */
+                 0    /* foldGroupSize */
  );
 
  std::vector rowcols = {
diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index d978339891..e3bf4caff8 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -136,12 +136,6 @@ struct RF_params {
    * entire fold of groups left out.
    */
   int foldGroupSize;
-
-  /**
-   * group_col_idx
-   * The numeric index of the column to be used for group processing
-   */
-  int group_col_idx;
 
   /**
    * Decision tree training hyper parameter struct.
@@ -196,6 +190,12 @@ void build_treelite_forest(ModelHandle* model,
                            const RandomForestMetaData<T, L>* forest,
                            int num_features);
 
+template <class T, class L>
+int get_tree_row_meta_info(
+  int ix_tree,
+  int ix_sample,
+  const RandomForestMetaData<T, L>* forest);
+
 ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles);
 
 void compare_concat_forest_to_subforests(ModelHandle concat_tree_handle,
@@ -213,6 +213,7 @@ void fit(const raft::handle_t& user_handle,
          int* labels,
          int n_unique_labels,
          RF_params rf_params,
+         int* groups    = nullptr,
          int verbosity  = CUML_LEVEL_INFO);
 void fit(const raft::handle_t& user_handle,
          RandomForestClassifierD*& forest,
@@ -222,6 +223,7 @@ void fit(const raft::handle_t& user_handle,
          int* labels,
          int n_unique_labels,
          RF_params rf_params,
+         int* groups    = nullptr,
          int verbosity  = CUML_LEVEL_INFO);
 
 void predict(const raft::handle_t& user_handle,
@@ -271,8 +273,7 @@ RF_params set_rf_params(int max_depth,
                         int cfg_n_streams,
                         int max_batch_size,
                         int minTreesPerGroupFold,
-                        int foldGroupSize,
-                        int group_col_idx);
+                        int foldGroupSize);
 
 // ----------------------------- Regression ----------------------------------- //
 
@@ -286,7 +287,9 @@ void fit(const raft::handle_t& user_handle,
          int n_cols,
          float* labels,
          RF_params rf_params,
+         int* groups   = nullptr,
          int verbosity = CUML_LEVEL_INFO);
+
 void fit(const raft::handle_t& user_handle,
          RandomForestRegressorD*& forest,
          double* input,
@@ -294,6 +297,7 @@ void fit(const raft::handle_t& user_handle,
          int n_cols,
          double* labels,
          RF_params rf_params,
+         int* groups   = nullptr,
          int verbosity = CUML_LEVEL_INFO);
 
 void predict(const raft::handle_t& user_handle,
diff --git a/cpp/include/cuml/tree/decisiontree.hpp b/cpp/include/cuml/tree/decisiontree.hpp
index cfedb1e057..f710a54bde 100644
--- a/cpp/include/cuml/tree/decisiontree.hpp
+++ b/cpp/include/cuml/tree/decisiontree.hpp
@@ -117,6 +117,8 @@ void set_tree_params(DecisionTreeParams& params,
                      int cfg_max_batch_size = 4096,
                      bool cfg_oob_honesty   = false);
 
+enum SplitAvgUnusedEnum { unused = 0, split, avg, group_split_unselected, group_avg_unselected, invalid };
+
 template <class T, class L>
 struct TreeMetaDataNode {
   int treeid;
@@ -125,6 +127,7 @@ struct TreeMetaDataNode {
   double train_time;
   std::vector<L> vector_leaf;
   std::vector<SparseTreeNode<T, L>> sparsetree;
+  std::vector<SplitAvgUnusedEnum> split_avg_enums;
   int num_outputs;
 };
 
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 0cc1b6f51f..6a7fb385d7 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -85,7 +85,7 @@ class NodeQueue {
     int nTrain = int(instance.count - instance.avg_count);
     if (nTrain < params.min_samples_split_splitting) return false;
-    if (params.oob_honesty and instance.avg_count < params.min_samples_split_averaging) return false;
+    if (params.oob_honesty and static_cast<int>(instance.avg_count) < params.min_samples_split_averaging) return false;
     if (params.max_leaves != -1 && tree->leaf_counter >= params.max_leaves) return false;
     return true;
   }
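The new split_avg_enums vector records, for every training row, which honesty role it played when a given tree was grown. A minimal host-side sketch of how a caller might consume it, assuming a trained TreeMetaDataNode and the enum layout above; the helper name is illustrative and not part of this change:

    // Hypothetical helper (not in this diff): tally how many rows fell into each
    // honesty role for one tree via the new split_avg_enums field.
    #include <array>
    #include <cstddef>

    template <class T, class L>
    std::array<std::size_t, 6> tally_row_roles(const ML::DT::TreeMetaDataNode<T, L>& tree)
    {
      std::array<std::size_t, 6> counts{};  // indexed by the SplitAvgUnusedEnum values 0..5
      for (auto e : tree.split_avg_enums) {
        counts[static_cast<std::size_t>(e)]++;
      }
      return counts;  // e.g. counts[ML::DT::SplitAvgUnusedEnum::split] == splitting rows
    }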
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 3817bcef9a..bc7358724f 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -449,6 +449,7 @@ void fit(const raft::handle_t& user_handle,
          int* labels,
          int n_unique_labels,
          RF_params rf_params,
+         int* groups,
          int verbosity)
 {
   raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
@@ -459,7 +460,18 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<float, int>> rf_classifier =
     std::make_shared<RandomForest<float, int>>(rf_params, RF_type::CLASSIFICATION);
-  rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest);
+  rf_classifier->fit(user_handle, input, groups, n_rows, n_cols, labels, n_unique_labels, forest);
+}
+
+template <class T, class L>
+int get_tree_row_meta_info(
+  int ix_tree,
+  int ix_sample,
+  const RandomForestMetaData<T, L>* forest)
+{
+  // Out-of-range queries (including trees trained without honesty) report invalid.
+  if (ix_tree < 0 || ix_tree >= static_cast<int>(forest->trees.size()))
+    return SplitAvgUnusedEnum::invalid;
+  if (ix_sample < 0 || ix_sample >= static_cast<int>(forest->trees[ix_tree]->split_avg_enums.size()))
+    return SplitAvgUnusedEnum::invalid;
+  return forest->trees[ix_tree]->split_avg_enums[ix_sample];
 }
 
 void fit(const raft::handle_t& user_handle,
@@ -470,6 +482,7 @@ void fit(const raft::handle_t& user_handle,
          int* labels,
          int n_unique_labels,
          RF_params rf_params,
+         int* groups,
          int verbosity)
 {
   raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
@@ -480,7 +493,7 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<double, int>> rf_classifier =
     std::make_shared<RandomForest<double, int>>(rf_params, RF_type::CLASSIFICATION);
-  rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest);
+  rf_classifier->fit(user_handle, input, groups, n_rows, n_cols, labels, n_unique_labels, forest);
 }
 /** @} */
@@ -596,8 +609,7 @@ RF_params set_rf_params(int max_depth,
                         int cfg_n_streams,
                         int max_batch_size,
                         int minTreesPerGroupFold,
-                        int foldGroupSize,
-                        int group_col_idx)
+                        int foldGroupSize)
 {
   DT::DecisionTreeParams tree_params;
   DT::set_tree_params(tree_params,
@@ -625,7 +637,6 @@ RF_params set_rf_params(int max_depth,
   rf_params.tree_params          = tree_params;
   rf_params.minTreesPerGroupFold = minTreesPerGroupFold;
   rf_params.foldGroupSize        = foldGroupSize;
-  rf_params.group_col_idx        = group_col_idx;
   validity_check(rf_params);
   return rf_params;
 }
@@ -654,6 +665,7 @@ void fit(const raft::handle_t& user_handle,
          int n_cols,
          float* labels,
          RF_params rf_params,
+         int* groups,
          int verbosity)
 {
   raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
@@ -664,7 +676,7 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<float, float>> rf_regressor =
     std::make_shared<RandomForest<float, float>>(rf_params, RF_type::REGRESSION);
-  rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, 1, forest);
+  rf_regressor->fit(user_handle, input, groups, n_rows, n_cols, labels, 1, forest);
 }
 
 void fit(const raft::handle_t& user_handle,
@@ -674,6 +686,7 @@ void fit(const raft::handle_t& user_handle,
          int n_cols,
          double* labels,
          RF_params rf_params,
+         int* groups,
          int verbosity)
 {
   raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
@@ -684,7 +697,7 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<double, double>> rf_regressor =
     std::make_shared<RandomForest<double, double>>(rf_params, RF_type::REGRESSION);
-  rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, 1, forest);
+  rf_regressor->fit(user_handle, input, groups, n_rows, n_cols, labels, 1, forest);
 }
 /** @} */
@@ -776,6 +789,11 @@ template std::string get_rf_summary_text(const RandomForestClassifi
 template std::string get_rf_summary_text(const RandomForestRegressorF* forest);
 template std::string get_rf_summary_text(const RandomForestRegressorD* forest);
 
+template int get_tree_row_meta_info(int, int, const RandomForestClassifierF* forest);
+template int get_tree_row_meta_info(int, int, const RandomForestClassifierD* forest);
+template int get_tree_row_meta_info(int, int, const RandomForestRegressorF* forest);
+template int get_tree_row_meta_info(int, int, const RandomForestRegressorD* forest);
+
 template std::string get_rf_detailed_text(const RandomForestClassifierF* forest);
 template std::string get_rf_detailed_text(const RandomForestClassifierD* forest);
 template std::string get_rf_detailed_text(const RandomForestRegressorF* forest);
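For reference, a sketch of how the reworked C++ entry points fit together, assuming a raft::handle_t, an RF_params from set_rf_params, and caller-filled device buffers d_X (column-major, n_rows x n_cols), d_y, and d_groups (one int group id per row); the buffer names are illustrative, not part of this diff:

    // Train with an explicit per-row groups array instead of the removed
    // group_col_idx, then query one row's honesty role for tree 0.
    ML::RandomForestRegressorF* forest = new ML::RandomForestRegressorF();
    ML::fit(handle, forest, d_X, n_rows, n_cols, d_y, rf_params, d_groups);
    // Returns an int that maps onto ML::DT::SplitAvgUnusedEnum
    // (invalid when out of range or when honesty metadata was never filled).
    int role = ML::get_tree_row_meta_info(/*ix_tree=*/0, /*ix_sample=*/0, forest);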
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 2419f7f265..7e1fa77dfc 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #include
@@ -61,18 +62,79 @@ struct set_mask_functor {
 
 namespace {
 
-__global__ void log10(int* array) {
-  for (int ix = 0; ix < 10; ++ix) {
+using ML::DT::SplitAvgUnusedEnum;
+
+__global__ void log(int* array, int n_samples) {
+  for (int ix = 0; ix < n_samples; ++ix) {
     printf("array %d = %d\n", ix, array[ix]);
   }
 }
 
-__global__ void log10groups(const int* row_ids, const int* group_ids) {
-  for (int ix = 0; ix < 10; ++ix) {
+__global__ void loggroups(const int* row_ids, const int* group_ids, const int n_samples) {
+  for (int ix = 0; ix < n_samples; ++ix) {
     printf("group ix %d, row %d = %d\n", ix, row_ids[ix], group_ids[row_ids[ix]]);
   }
 }
 
+// Mark each selected row as splitting or averaging: the first n_splitting_rows
+// entries of selected_rows are the splitting sample, the rest the averaging sample.
+__global__ void assign_standard_honesty_vec(
+  SplitAvgUnusedEnum* split_enums,
+  const int n_total_selected_rows,
+  const int* selected_rows,
+  const int n_splitting_rows)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n_total_selected_rows) return;
+
+  if (idx < n_splitting_rows) {
+    split_enums[selected_rows[idx]] = SplitAvgUnusedEnum::split;
+  } else {
+    split_enums[selected_rows[idx]] = SplitAvgUnusedEnum::avg;
+  }
+}
+
+template <typename T, typename U>
+__device__ int lower_bound(const T search_val, const U* array, int count) {
+  int it, step;
+  int first = 0;
+  while (count > 0) {
+    step = count / 2;
+    it   = first + step;
+    if (array[it] < search_val) {
+      first = ++it;
+      count -= step + 1;
+    } else {
+      count = step;
+    }
+  }
+  return first;
+}
+
+template <typename T, typename U>
+__device__ int lower_bound_search(const T search_val, const U* array, int count) {
+  int res = lower_bound(search_val, array, count);
+  return res < count and array[res] == search_val;
+}
+
+// Run once after each group-sampling stage: any row still marked "unused" whose
+// group is in the considered set gets enum_val, recording that its group took part
+// in that stage even though the row itself was not selected.
+__global__ void assign_group_based_standard_honesty_vec_stage1(
+  SplitAvgUnusedEnum* split_enums,
+  const int n_rows,
+  const int* sample_group_ids,
+  const int* considered_group_ids,
+  const int considered_group_id_count,
+  SplitAvgUnusedEnum enum_val)
+{
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= n_rows) return;
+
+  const int group_id = sample_group_ids[idx];
+
+  if (lower_bound_search(group_id, considered_group_ids, considered_group_id_count)) {
+    split_enums[idx] = enum_val;
+  }
+}
+
 void assign_groups_to_folds(
   int n_groups,
   int n_folds,
@@ -82,39 +144,28 @@
 {
   std::vector<int> group_indices(n_groups);
   std::iota(group_indices.begin(), group_indices.end(), 0);
 
   std::shuffle(group_indices.begin(), group_indices.end(), rng);
-
-  for (int ix_fold = 0; ix_fold < n_folds - 1; ++ix_fold) {
+
+  int ix_fold = 0;
+  for (; ix_fold < n_folds - 1; ++ix_fold) {
+    fold_memberships[ix_fold].resize(fold_size);
     std::copy(group_indices.begin() + ix_fold*fold_size,
               group_indices.begin() + (ix_fold+1)*fold_size,
               fold_memberships[ix_fold].begin());
+
+    std::sort(fold_memberships[ix_fold].begin(), fold_memberships[ix_fold].end());
   }
 
   // Last fold could be smaller
-  const int last_fold_start = (n_folds - 1) * fold_size;
+  const int last_fold_start = (ix_fold) * fold_size;
   const int last_fold_size = n_groups - last_fold_start;
-  fold_memberships[n_folds - 1].resize(last_fold_size);
+  fold_memberships[ix_fold].resize(last_fold_size);
+
   for (int ix = 0; ix < last_fold_size; ++ix) {
-    fold_memberships[n_folds - 1][ix] = group_indices[last_fold_start + ix];
+    fold_memberships[ix_fold][ix] = group_indices[last_fold_start + ix];
   }
-}
 
-template <typename T, typename U>
-__device__ int lower_bound(const T search_val, const U* array, int count) {
-  int it, step;
-  int first = 0;
-  while (count > 0) {
-    step = count / 2;
-    it   = first + step;
-    if (array[it] < search_val) {
-      first = ++it;
-      count -= step + 1;
-    } else {
-      count = step;
-    }
-  }
-  return first;
+
+  std::sort(fold_memberships[ix_fold].begin(), fold_memberships[ix_fold].end());
 }
 
 template
@@ -148,8 +199,7 @@ struct LeaveOutSamplesCopyIfFunctor {
   __device__ bool operator()(const int ix_sample)
   {
     // Do a quick lower_bound search
     const int group_id = sample_group_ids[ix_sample];
-    int it = lower_bound(group_id, remaining_groups, num_rem_groups);
-    return remaining_groups[it] == group_id;
+    return lower_bound_search(group_id, remaining_groups, num_rem_groups);
   }
 };
 
@@ -211,7 +261,6 @@ void leave_groups_out_sample(
   raft::update_device(remaining_groups->data(), remaining_groups_host.data(),
                       remaining_groups_host.size(), stream);
   remaining_groups->resize(remaining_groups_host.size(), stream);
-
   generate_row_indices_from_remaining_groups(
     remaining_groups, remaining_samples, sample_group_ids, num_samples, stream, rng);
 
@@ -278,10 +327,16 @@ class RandomForest {
                      const int n_groups,
                      const int group_tree_count,
                      const std::vector<std::vector<int>>& fold_memberships,  // each group belongs to one fold
+                     rmm::device_uvector<SplitAvgUnusedEnum>* split_avg_enums,
                      const cudaStream_t stream)
   {
     // Todo: split group_fold_rng across threads
     raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
+    if (split_avg_enums) {
+      // Initialize to unused; the memset below relies on unused == 0
+      static_assert(SplitAvgUnusedEnum::unused == 0, "SplitAvgUnusedEnum::unused must be 0");
+      cudaMemsetAsync(split_avg_enums->data(), 0, sizeof(SplitAvgUnusedEnum) * n_rows, stream);
+    }
 
     // Hash these together so they are uncorrelated
     auto random_seed = DT::fnv1a32_basis;
@@ -298,6 +353,8 @@ class RandomForest {
     std::vector<std::vector<int>> honest_group_assignments(2);
     auto& splitting_groups = honest_group_assignments[0];
     auto& averaging_groups = honest_group_assignments[1];
+
+
     if (n_groups > 0) {
       // Special handling for groups. We don't support split ratio honesty
 
       const std::vector<int>* current_fold_groups;
@@ -328,7 +385,6 @@ class RandomForest {
       splitting_groups.resize(honest_split_size);
       averaging_groups.resize(restricted_ix_size - honest_split_size);
-
       assign_groups_to_folds(
         restricted_ix_size,
         2,
@@ -338,10 +394,12 @@ class RandomForest {
 
       // Replace indices with the actual groups
       for (int ix_group = 0; ix_group < honest_split_size; ix_group++) {
         honest_group_assignments[0][ix_group] =
           restricted_group_ixs_diff[honest_group_assignments[0][ix_group]];
       }
       for (int ix_group = 0; ix_group < restricted_ix_size - honest_split_size; ix_group++) {
         honest_group_assignments[1][ix_group] =
           restricted_group_ixs_diff[honest_group_assignments[1][ix_group]];
       }
     } else {
@@ -366,11 +424,33 @@ class RandomForest {
           group_fold_rng);
       }
 
+      const int block_dim = 256;
       leave_groups_out_sample(remaining_groups, remaining_samples, selected_rows, workspace,
                               groups, splitting_groups, n_sampled_rows, 0, stream, rng);
-
+      const int stage1_grid_dim = (n_rows + block_dim - 1) / block_dim;
+      assign_group_based_standard_honesty_vec_stage1<<<stage1_grid_dim, block_dim, 0, stream>>>(
+        split_avg_enums->data(),
+        n_rows,
+        groups,
+        remaining_groups->data(),
+        remaining_groups->size(),
+        SplitAvgUnusedEnum::group_split_unselected);
+
+      cudaStreamSynchronize(stream);
       leave_groups_out_sample(remaining_groups, remaining_samples, selected_rows, workspace,
                               groups, averaging_groups, n_sampled_rows, n_sampled_rows, stream, rng);
+      assign_group_based_standard_honesty_vec_stage1<<<stage1_grid_dim, block_dim, 0, stream>>>(
+        split_avg_enums->data(),
+        n_rows,
+        groups,
+        remaining_groups->data(),
+        remaining_groups->size(),
+        SplitAvgUnusedEnum::group_avg_unselected);
+
+      const int total_sampled_rows = 2 * n_sampled_rows;
+      const int standard_grid_dim  = (total_sampled_rows + block_dim - 1) / block_dim;
+      assign_standard_honesty_vec<<<standard_grid_dim, block_dim, 0, stream>>>(
+        split_avg_enums->data(), total_sampled_rows, selected_rows->data(), n_sampled_rows);
 
       update_averaging_mask(split_row_mask, n_sampled_rows, stream);
 
@@ -380,6 +460,7 @@ class RandomForest {
       // Just don't use samples from the current fold for splitting. No averaging.
       leave_groups_out_sample(remaining_groups, remaining_samples, selected_rows, workspace,
                               groups, restricted_group_ixs_diff, n_sampled_rows, 0, stream, rng);
+      return 0;  // no averaging samples
     }
   }
@@ -418,11 +499,20 @@ class RandomForest {
       // Get the avg selected rows either as the remaining data, or bootstrapped again
       selected_rows->resize(n_sampled_rows * 2, stream);
+      int total_sampled_rows = n_sampled_rows;
       if (rf_params.double_bootstrap) {
         sample_rows_from_remaining_rows(remaining_samples, selected_rows, workspace,
                                         n_sampled_rows, n_sampled_rows, stream, rng);
       } else {
+        num_avg_samples = remaining_samples->size();
         thrust::copy(thrust::cuda::par.on(stream), remaining_samples->begin(),
                      remaining_samples->end(), selected_rows->begin() + n_sampled_rows);
       }
+
+      total_sampled_rows += num_avg_samples;
+
+      const int block_dim = 256;
+      const int grid_dim  = (total_sampled_rows + block_dim - 1) / block_dim;
+      assign_standard_honesty_vec<<<grid_dim, block_dim, 0, stream>>>(
+        split_avg_enums->data(), total_sampled_rows, selected_rows->data(), n_sampled_rows);
 
       update_averaging_mask(split_row_mask, n_sampled_rows, stream);
     }
@@ -475,6 +565,7 @@ class RandomForest {
    */
   void fit(const raft::handle_t& user_handle,
            const T* input,
+           int* groups,
            int n_rows,
            int n_cols,
            L* labels,
@@ -507,38 +598,33 @@ class RandomForest {
     std::vector<std::vector<int>> foldMemberships;
     const int foldGroupSize        = this->rf_params.foldGroupSize;
     const int minTreesPerGroupFold = this->rf_params.minTreesPerGroupFold;
-    std::unique_ptr<rmm::device_uvector<int>> groups;
     int n_groups = 0;
-    if (this->rf_params.group_col_idx >= 0) {
+    if (groups != nullptr) {
       // Here we'll do a unique on the group array, then build a vector of indices into the unique vector
       cudaStream_t stream = handle.get_stream_from_stream_pool(0);
-      rmm::device_uvector<T> input_groups(n_rows, stream);
-      rmm::device_uvector<T> input_groups_unique(n_rows, stream);
-      groups = std::make_unique<rmm::device_uvector<int>>(n_rows, stream);
-      cudaMemcpyAsync(input_groups.data(),
-                      input + n_rows * this->rf_params.group_col_idx,
-                      n_rows * sizeof(T), cudaMemcpyDefault, stream);
+      rmm::device_uvector<int> input_groups_unique(n_rows, stream);
       cudaMemcpyAsync(input_groups_unique.data(),
-                      input + n_rows * this->rf_params.group_col_idx,
-                      n_rows * sizeof(T), cudaMemcpyDefault, stream);
+                      groups,
+                      n_rows * sizeof(int), cudaMemcpyDefault, stream);
 
       // Sadly we have to sort the entire array for unique to work. Is there
       // a way to just unique the unsorted array?
       thrust::sort(thrust::cuda::par.on(stream), input_groups_unique.data(),
                    input_groups_unique.data() + n_rows);
-      T* new_end = thrust::unique(thrust::cuda::par.on(stream),
+      int* new_end = thrust::unique(thrust::cuda::par.on(stream),
                                     input_groups_unique.data(),
                                     input_groups_unique.data() + n_rows);
+
       // Now we'll have n_groups and can use some iterator to find the values for each group
       n_groups = new_end - input_groups_unique.data();
       UniqueTransformFunctor transform_fn{input_groups_unique.data(), n_groups};
+      // NOTE: remaps the caller-provided group ids in place to dense [0, n_groups) indices.
       thrust::transform(
         thrust::cuda::par.on(stream),
-        input_groups.data(),
-        input_groups.data() + n_rows,
-        groups->data(),
+        groups,
+        groups + n_rows,
+        groups,
         transform_fn);
     }
@@ -586,6 +672,7 @@ class RandomForest {
     std::deque<rmm::device_uvector<int>> workspaces;
     std::deque<rmm::device_uvector<int>> remaining_groups_vec;
     std::deque<rmm::device_uvector<int>> remaining_samples_vec;
+    std::deque<rmm::device_uvector<SplitAvgUnusedEnum>> split_avg_enum_vec;
     const bool use_extra_vecs =
       this->rf_params.oob_honesty or this->rf_params.minTreesPerGroupFold > 0;
     size_t max_sample_row_size = this->rf_params.oob_honesty ? n_sampled_rows * 2 : n_sampled_rows;
@@ -596,7 +683,9 @@ class RandomForest {
       split_row_masks.emplace_back(max_sample_row_size, s);
       workspaces.emplace_back(n_rows, s);
       remaining_samples_vec.emplace_back(n_rows, s);
+      split_avg_enum_vec.emplace_back(n_rows, s);
     }
+
     if (n_groups > 0) {
       remaining_groups_vec.emplace_back(n_groups, s);
     }
@@ -611,7 +700,7 @@ class RandomForest {
       rmm::device_uvector<int>* workspace        = use_extra_vecs ? &workspaces[stream_id] : nullptr;
       rmm::device_uvector<int>* remaining_groups = n_groups > 0 ? &remaining_groups_vec[stream_id] : nullptr;
       rmm::device_uvector<int>* split_row_mask   = use_extra_vecs ? &split_row_masks[stream_id] : nullptr;
-      int* this_groups = n_groups > 0 ? groups->data() : nullptr;
+      rmm::device_uvector<SplitAvgUnusedEnum>* split_avg_enums =
+        use_extra_vecs ? &split_avg_enum_vec[stream_id] : nullptr;
 
       auto n_avg_samples = this->get_row_sample(i, n_rows, n_sampled_rows, &selected_rows[stream_id],
@@ -619,10 +708,11 @@ class RandomForest {
                                                 remaining_groups,
                                                 remaining_samples,
                                                 workspace,
-                                                this_groups,
+                                                groups,
                                                 n_groups,
                                                 group_tree_count,
                                                 foldMemberships,
+                                                split_avg_enums,
                                                 s);
 
       /* Build individual tree in the forest.
@@ -634,6 +724,8 @@ class RandomForest {
          (b) a pointer to a list of row numbers w.r.t original data.
       */
       if (this->rf_params.oob_honesty) {
+        // The per-row honesty roles land in split_avg_enums on device; they are
+        // copied into this tree's metadata right after fit below.
         forest->trees[i] = DT::DecisionTree::fit(handle,
                                                  s,
                                                  input,
@@ -648,6 +740,9 @@ class RandomForest {
                                                  this->rf_params.seed,
                                                  quantiles,
                                                  i);
+        forest->trees[i]->split_avg_enums = std::vector<SplitAvgUnusedEnum>(n_rows);
+        raft::update_host(forest->trees[i]->split_avg_enums.data(),
+                          split_avg_enums->data(), n_rows, s);
       } else {
         forest->trees[i] = DT::DecisionTree::fit(handle,
                                                  s,
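The device-side lower_bound/lower_bound_search pair above is meant to match std::lower_bound and std::binary_search over a sorted id array. A small host-side check of that equivalence (test scaffolding only, not part of this diff):

    #include <algorithm>
    #include <cassert>

    int main()
    {
      const int sorted_ids[] = {2, 5, 5, 9};
      // lower_bound(): index of the first element not less than the key.
      assert(std::lower_bound(sorted_ids, sorted_ids + 4, 5) - sorted_ids == 1);
      // lower_bound_search(): membership test on the sorted ids.
      assert(std::binary_search(sorted_ids, sorted_ids + 4, 5));
      assert(!std::binary_search(sorted_ids, sorted_ids + 4, 3));
      return 0;
    }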
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index e2f185d30f..5479f7f2d8 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -241,8 +241,7 @@ auto TrainScore(
                 params.n_streams,
                 128,
                 0,
-                0,
-                -1);
+                0);
 
   auto forest     = std::make_shared<RandomForestMetaData<DataT, LabelT>>();
   auto forest_ptr = forest.get();
@@ -576,7 +575,7 @@ TEST(RfTests, IntegerOverflow)
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 0, 2, 0, 0.0, false, false, false, 1, 1.0, 0, CRITERION::MSE, 4, 128, 0, 0, -1);
+    set_rf_params(3, 100, 1.0, 256, 1, 0, 2, 0, 0.0, false, false, false, 1, 1.0, 0, CRITERION::MSE, 4, 128, 0, 0);
   fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -602,35 +601,26 @@ TEST(RfTests, IntegerOverflow)
   fil::predict(handle, fil_forest, pred.data().get(), X.data().get(), m, false);
 }
 
-namespace {
-  struct TransformFunctor {
-    __device__ float operator()(float input) {
-      return roundf(input);
-    }
-  };
-}
-
-
 TEST(RfTests, Honesty)
 {
   std::size_t m = 10000;
   std::size_t n = 2150;
   thrust::device_vector<float> X(m * n);
   thrust::device_vector<float> y(m);
+
   raft::random::Rng r(4);
   r.normal(X.data().get(), X.size(), 0.0f, 2.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
+
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 1, 128, 0, 0, -1);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 1, 128, 0, 0);
   fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -658,7 +648,7 @@ TEST(RfTests, Honesty)
 
 TEST(RfTests, SmallHonestFolds)
 {
-  std::size_t m = 1000;
+  std::size_t m = 100;
   std::size_t n = 10;
   thrust::device_vector<float> X(m * n);
   thrust::device_vector<float> y(m);
@@ -666,20 +656,29 @@ TEST(RfTests, SmallHonestFolds)
   r.normal(X.data().get(), X.size(), 0.0f, 1.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
+  int n_trees = 5;
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 1, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, n_trees, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
 
   // See if fil overflows
   thrust::device_vector<float> pred(m);
   ModelHandle model;
@@ -704,7 +703,7 @@ TEST(RfTests, SmallHonestFolds)
 
 TEST(RfTests, SmallHonestFoldsWithFallback)
 {
-  std::size_t m = 1000;
+  std::size_t m = 100;
   std::size_t n = 10;
   thrust::device_vector<float> X(m * n);
   thrust::device_vector<float> y(m);
@@ -712,17 +711,19 @@ TEST(RfTests, SmallHonestFoldsWithFallback)
   r.normal(X.data().get(), X.size(), 0.0f, 1.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 100, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 100, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -757,17 +758,18 @@ TEST(RfTests, SmallDishonestFoldsWithFallback)
   r.normal(X.data().get(), X.size(), 0.0f, 1.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, false, false, 100, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, false, false, 100, 1.0, 0, CRITERION::MSE, 1, 128, 1, 5);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -802,17 +804,18 @@ TEST(RfTests, HonestFolds)
   r.normal(X.data().get(), X.size(), 0.0f, 2.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 4, 128, 2, 2, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 1, 128, 2, 2);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -847,17 +850,18 @@ TEST(RfTests, HonestGroups)
   r.normal(X.data().get(), X.size(), 0.0f, 2.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
+
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 4, 128, 0, 0, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, true, true, 5, 1.0, 0, CRITERION::MSE, 4, 128, 0, 0);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -892,17 +896,17 @@ TEST(RfTests, DishonestFolds)
   r.normal(X.data().get(), X.size(), 0.0f, 2.0f, nullptr);
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  thrust::transform(thrust::cuda::par.on(stream), X.data(),
-                    X.data() + m, X.data(), TransformFunctor{});
-  // quantize the first column so that we can use it for meaningful groups
+
+  thrust::device_vector<int> groups(m);
+  r.uniformInt(groups.data().get(), m, 0, 10, nullptr);
   r.normal(y.data().get(), y.size(), 0.0f, 2.0f, nullptr);
 
   auto forest      = std::make_shared<RandomForestMetaData<float, float>>();
   auto forest_ptr  = forest.get();
   auto stream_pool = std::make_shared<rmm::cuda_stream_pool>(4);
   raft::handle_t handle(rmm::cuda_stream_per_thread, stream_pool);
   RF_params rf_params =
-    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, false, false, 5, 1.0, 0, CRITERION::MSE, 4, 128, 2, 2, 0);
-  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params);
+    set_rf_params(3, 100, 1.0, 256, 1, 1, 2, 2, 0.0, true, false, false, 5, 1.0, 0, CRITERION::MSE, 4, 128, 2, 2);
+  fit(handle, forest_ptr, X.data().get(), m, n, y.data().get(), rf_params, groups.data().get());
   // Check we have actually learned something
   EXPECT_GT(forest->trees[0]->leaf_counter, 1);
@@ -1143,7 +1147,7 @@ INSTANTIATE_TEST_CASE_P(RfTests, RFQuantileVariableBinsTestD, ::testing::ValuesI
 
 TEST(RfTest, TextDump)
 {
-  RF_params rf_params = set_rf_params(2, 2, 1.0, 2, 1, 0, 2, 0, 0.0, false, false, false, 1, 1.0, 0, GINI, 1, 128, 0, 0, -1);
+  RF_params rf_params = set_rf_params(2, 2, 1.0, 2, 1, 0, 2, 0, 0.0, false, false, false, 1, 1.0, 0, GINI, 1, 128, 0, 0);
 
   auto forest = std::make_shared<RandomForestMetaData<float, int>>();
   std::vector<float> X_host = {1, 2, 3, 6, 7, 8};
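When checking the assign_standard_honesty_vec kernel in tests like the ones above, a serial reference is handy. This sketch assumes, as the kernel does, that selected_rows holds the splitting sample followed by the averaging sample (reference implementation only, not part of this diff):

    #include <cstddef>
    #include <vector>

    using ML::DT::SplitAvgUnusedEnum;

    void assign_standard_honesty_vec_ref(std::vector<SplitAvgUnusedEnum>& split_enums,
                                         const std::vector<int>& selected_rows,
                                         int n_splitting_rows)
    {
      // First n_splitting_rows entries are the splitting sample, the rest averaging.
      for (std::size_t ix = 0; ix < selected_rows.size(); ++ix) {
        split_enums[selected_rows[ix]] = static_cast<int>(ix) < n_splitting_rows
                                           ? SplitAvgUnusedEnum::split
                                           : SplitAvgUnusedEnum::avg;
      }
    }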
diff --git a/honesty_test.py b/honesty_test.py
index 1d6c044cf9..ed508cbbfe 100755
--- a/honesty_test.py
+++ b/honesty_test.py
@@ -30,6 +31,7 @@
 input = input.astype('float32')
 input = input.dropna()
 
+groups = input["state"].astype('int')
 
 # Choose how many index include for random selection
 num_rows = input.shape[0]
@@ -41,20 +43,27 @@
 x_train = x.iloc[ix_train, :]
 y_train = y.iloc[ix_train]
+groups_train = groups.iloc[ix_train]
 
 x_test = x.iloc[ix_test, :]
 y_test = y.iloc[ix_test]
 
 n_trees = 100
 
+
 # Start group call -- note we're not using groups to specify OOB predictions
-# Note, here we specify a column index to use for groups. Then the fit() function
+# Note, here we pass an explicit per-row groups array to fit(). The fit() function
 # will use the GPU to compute unique group ids for every sample.
-group_col_idx = x.columns.get_loc("state")
 random_forest_regress = RFR(n_estimators=n_trees, oob_honesty=True, split_criterion=2,
-                            random_state=42, minTreesPerGroupFold=5, foldGroupSize=1, group_col_idx=group_col_idx)
+                            random_state=42, minTreesPerGroupFold=20, foldGroupSize=3)
 start = time.time()
-trainedRFR = random_forest_regress.fit(x_train, y_train)
+trainedRFR = random_forest_regress.fit(x_train, y_train, groups_train)
+
+# Show the full honesty metadata. The accessor lets you pull per-tree, per-row
+# values and fill your own arrays as needed
+for ix_tree in range(10):
+    for ix_row in range(3):
+        print(f"tree {ix_tree} row {ix_row} enum {random_forest_regress.get_tree_sample_honesty_group_meta(ix_tree, ix_row)}")
 
 end = time.time()
 pred_test_regress = trainedRFR.predict(x_test)
 mse = cuml.metrics.mean_squared_error(y_test, pred_test_regress)
@@ -66,6 +75,12 @@
 end = time.time()
 pred_test_regress = trainedRFR.predict(x_test)
 mse = cuml.metrics.mean_squared_error(y_test, pred_test_regress)
+
+# Note this prints "invalid" -- the metadata isn't filled in when honesty is disabled
+for ix_tree in range(10):
+    for ix_row in range(3):
+        print(f"tree {ix_tree} row {ix_row} enum {random_forest_regress.get_tree_sample_honesty_group_meta(ix_tree, ix_row)}")
+
 print(f"No honesty {mse} time {end-start}")
 
 random_forest_regress = RFR(n_estimators=n_trees, oob_honesty=True, split_criterion=2, random_state=42)
@@ -74,5 +89,12 @@
 end = time.time()
 pred_test_regress = trainedRFR.predict(x_test)
 mse = cuml.metrics.mean_squared_error(y_test, pred_test_regress)
+
+# Show the metadata without groups
+for ix_tree in range(10):
+    for ix_row in range(3):
+        print(f"tree {ix_tree} row {ix_row} enum {random_forest_regress.get_tree_sample_honesty_group_meta(ix_tree, ix_row)}")
+
+
 print(f"Honesty {mse} time {end-start}")
diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 6bb0c34a89..c3845f13cb 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -197,10 +197,6 @@ class RandomForestClassifier(
         as it will contain the remaining groups). Then minTreesPerGroupFold
         are grown with each entire fold of groups left out.
 
-    group_col_idx : int (default = -1)
-        The numeric index of the column to be used for group processing
-
-
     Examples
     --------
     For usage examples, please see the RAPIDS notebooks repository:
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index 1ea049b678..e2cea52b80 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -187,10 +187,6 @@ class RandomForestRegressor(
         (if foldGroupSize doesn't evenly divide the number of groups, a single fold
         will be smaller, as it will contain the remaining groups). Then
         minTreesPerGroupFold are grown with each entire fold of groups left out.
-
-    group_col_idx : int (default = -1)
-        The numeric index of the column to be used for group processing
-
     """
 
     def __init__(
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 52ca2f3ceb..6739e7346e 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -20,6 +20,7 @@ import math
 import warnings
 import typing
 from inspect import signature
+from enum import Enum
 
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
@@ -40,6 +41,13 @@ from cuml.common import input_to_cuml_array
 from cuml.common.array_descriptor import CumlArrayDescriptor
 from cuml.prims.label.classlabels import make_monotonic, check_labels
 
+class SplitAvgUnusedEnum(Enum):
+    unused = 0
+    split = 1
+    avg = 2
+    group_split_unselected = 3
+    group_avg_unselected = 4
+    invalid = 5
 
 class BaseRandomForestModel(Base):
     _param_names = ['n_estimators', 'max_depth', 'handle',
@@ -58,8 +66,7 @@ class BaseRandomForestModel(Base):
                     'output_type', 'min_weight_fraction_leaf', 'n_jobs',
                     'max_leaf_nodes', 'min_impurity_split', 'oob_score',
                     'random_state', 'warm_start', 'class_weight',
-                    'criterion', 'minTreesPerGroupFold', 'foldGroupSize',
-                    'group_col_idx']
+                    'criterion', 'minTreesPerGroupFold', 'foldGroupSize']
 
     criterion_dict = {'0': GINI, 'gini': GINI,
                       '1': ENTROPY, 'entropy': ENTROPY,
@@ -87,7 +94,6 @@ class BaseRandomForestModel(Base):
                  max_batch_size=4096,
                  minTreesPerGroupFold=0,
                  foldGroupSize=1,
-                 group_col_idx=-1,
                  **kwargs):
 
         sklearn_params = {"criterion": criterion,
@@ -159,6 +165,7 @@ class BaseRandomForestModel(Base):
         self.double_bootstrap = double_bootstrap
         self.n_bins = n_bins
         self.n_cols = None
+        self.tree_group_metas = None
         self.dtype = dtype
         self.accuracy_metric = accuracy_metric
         self.max_batch_size = max_batch_size
@@ -171,7 +178,6 @@ class BaseRandomForestModel(Base):
         self.treelite_serialized_model = None
         self.minTreesPerGroupFold = minTreesPerGroupFold
         self.foldGroupSize = foldGroupSize
-        self.group_col_idx = group_col_idx
 
     def _get_max_feat_val(self) -> float:
         if type(self.max_features) == int:
@@ -277,6 +283,7 @@ class BaseRandomForestModel(Base):
                                    get_output_type=False)
     def _dataset_setup_for_fit(
-            self, X, y,
-            convert_dtype) -> typing.Tuple[CumlArray, CumlArray, float]:
+            self, X, y,
+            groups,
+            convert_dtype) -> typing.Tuple[
+                CumlArray, CumlArray, typing.Optional[CumlArray], float]:
         # Reset the old tree data for new fit call
         self._reset_forest_data()
@@ -314,6 +321,14 @@ class BaseRandomForestModel(Base):
                 convert_to_dtype=(self.dtype if convert_dtype else None),
                 check_rows=self.n_rows, check_cols=1)
 
+        group_m = None
+        if groups is not None:
+            # The C++ layer expects int32 group ids
+            group_m, _, _, group_dtype = \
+                input_to_cuml_array(
+                    groups,
+                    convert_to_dtype=(np.int32 if convert_dtype else None),
+                    check_rows=self.n_rows, check_cols=1)
+
         if self.dtype == np.float64:
             warnings.warn("To use pickling first train using float32 data "
@@ -332,7 +347,7 @@ class BaseRandomForestModel(Base):
         if type(self.min_samples_split_averaging) == float:
             self.min_samples_split_averaging = \
                 max(2, math.ceil(self.min_samples_split_averaging * self.n_rows))
-        return X_m, y_m, max_feature_val
+        return X_m, y_m, group_m, max_feature_val
 
     def _tl_handle_from_bytes(self, treelite_serialized_model):
         if not treelite_serialized_model:
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index f4d03d89ca..5ee34b5fd8 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -83,6 +83,14 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                       int
                       ) except +
 
+    #
+    # Extra function for querying the per-tree, per-row honesty metadata
+    #
+    cdef int get_tree_row_meta_info[T, L](int,
+                                          int,
+                                          RandomForestMetaData[T, L]*
+                                          ) except +
+
     cdef void delete_rf_metadata[T, L](RandomForestMetaData[T, L]*) except +
 
     #
@@ -112,7 +120,6 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   int,
                   int,
                   int,
-                  int,
                   int) except +
 
     cdef vector[unsigned char] save_model(ModelHandle)
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 284f811a7c..c04ddccedf 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -39,7 +39,7 @@
 from cuml.common.doc_utils import insert_into_docstring
 from pylibraft.common.handle import Handle
 from cuml.common import input_to_cuml_array
-from cuml.ensemble.randomforest_common import BaseRandomForestModel
+from cuml.ensemble.randomforest_common import BaseRandomForestModel, SplitAvgUnusedEnum
 from cuml.ensemble.randomforest_common import _obtain_fil_model
 from cuml.ensemble.randomforest_shared cimport *
@@ -72,6 +72,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   int*,
                   int,
                   RF_params,
+                  int*,
                   int) except +
 
     cdef void fit(handle_t& handle,
@@ -82,6 +83,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   int*,
                   int,
                   RF_params,
+                  int*,
                   int) except +
 
     cdef void predict(handle_t& handle,
@@ -283,9 +285,6 @@ class RandomForestClassifier(BaseRandomForestModel,
         as it will contain the remaining groups). Then minTreesPerGroupFold
         are grown with each entire fold of groups left out.
 
-    group_col_idx : int (default = -1)
-        The numeric index of the column to be used for group processing
-
     Notes
     -----
     **Known Limitations**\n
@@ -475,7 +474,7 @@ class RandomForestClassifier(BaseRandomForestModel,
     @cuml.internals.api_base_return_any(set_output_type=False,
                                         set_output_dtype=True,
                                         set_n_features_in=False)
-    def fit(self, X, y, convert_dtype=True):
+    def fit(self, X, y, groups=None, convert_dtype=True):
         """
         Perform Random Forest Classification on the input data
 
         y to be of dtype int32. This will increase memory used for
         the method.
""" - - X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, + X_m, y_m, groups_m, max_feature_val = self._dataset_setup_for_fit(X, y, groups, convert_dtype) # Track the labels to see if update is necessary self.update_labels = not check_labels(y_m, self.classes_) - cdef uintptr_t X_ptr, y_ptr + cdef uintptr_t X_ptr, y_ptr, groups_ptr X_ptr = X_m.ptr y_ptr = y_m.ptr + if groups is not None: + groups_ptr = groups_m.ptr + cdef handle_t* handle_ =\ self.handle.getHandle() @@ -530,8 +531,7 @@ class RandomForestClassifier(BaseRandomForestModel, self.n_streams, self.max_batch_size, self.minTreesPerGroupFold, - self.foldGroupSize, - self.group_col_idx) + self.foldGroupSize) if self.dtype == np.float32: fit(handle_[0], @@ -542,6 +542,7 @@ class RandomForestClassifier(BaseRandomForestModel, y_ptr, self.num_classes, rf_params, + groups_ptr, self.verbose) elif self.dtype == np.float64: @@ -554,19 +555,33 @@ class RandomForestClassifier(BaseRandomForestModel, y_ptr, self.num_classes, rf_params64, + groups_ptr, self.verbose) else: raise TypeError("supports only np.float32 and np.float64 input," " but input of type '%s' passed." % (str(self.dtype))) + # make sure that the `fit` is complete before the following delete # call happens self.handle.sync() del X_m del y_m + del groups_m return self + def get_tree_sample_honesty_group_meta(self, tree_id, row_id): + cdef RandomForestMetaData[float, int] *rf_forest = \ + self.rf_forest + cdef RandomForestMetaData[double, int] *rf_forest64 = \ + self.rf_forest64 + + if self.dtype == np.float32: + return SplitAvgUnusedEnum(get_tree_row_meta_info(tree_id, row_id, rf_forest)) + else: + return SplitAvgUnusedEnum(get_tree_row_meta_info(tree_id, row_id, rf_forest64)) + @cuml.internals.api_base_return_array(get_output_dtype=True) def _predict_model_on_cpu(self, X, convert_dtype) -> CumlArray: cdef uintptr_t X_ptr diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 85d25b1fde..42ea83ef08 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -39,9 +39,10 @@ from cuml.common.doc_utils import insert_into_docstring from pylibraft.common.handle import Handle from cuml.common import input_to_cuml_array -from cuml.ensemble.randomforest_common import BaseRandomForestModel +from cuml.ensemble.randomforest_common import BaseRandomForestModel, SplitAvgUnusedEnum from cuml.ensemble.randomforest_common import _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * +from cuml.internals.mem_type import MemoryType from cuml.fil.fil import TreeliteModel @@ -70,6 +71,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": int, float*, RF_params, + int*, int) except + cdef void fit(handle_t& handle, @@ -79,6 +81,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": int, double*, RF_params, + int*, int) except + cdef void predict(handle_t& handle, @@ -286,9 +289,6 @@ class RandomForestRegressor(BaseRandomForestModel, as it will contain the remaining groups). Then minTreesPerGroupFold are grown with each entire fold of groups left out. 
-
-    group_col_idx : int (default = -1)
-        The numeric index of the column to be used for group processing
-
     Notes
     -----
     **Known Limitations**\n
@@ -468,20 +468,24 @@ class RandomForestRegressor(BaseRandomForestModel,
                             domain="cuml_python")
     @generate_docstring()
     @cuml.internals.api_base_return_any_skipall
-    def fit(self, X, y, convert_dtype=True):
+    def fit(self, X, y, groups=None, convert_dtype=True):
         """
         Perform Random Forest Regression on the input data
 
         """
-
-        X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
-                                                                convert_dtype)
+        X_m, y_m, groups_m, max_feature_val = \
+            self._dataset_setup_for_fit(X, y, groups, convert_dtype)
 
         # Reset the old tree data for new fit call
-        cdef uintptr_t X_ptr, y_ptr
+        cdef uintptr_t X_ptr, y_ptr, groups_ptr
         X_ptr = X_m.ptr
         y_ptr = y_m.ptr
 
+        if groups is not None:
+            groups_ptr = groups_m.ptr
+        else:
+            groups_ptr = <uintptr_t> NULL
+
         cdef handle_t* handle_ =\
             self.handle.getHandle()
@@ -515,8 +519,8 @@ class RandomForestRegressor(BaseRandomForestModel,
                                   self.n_streams,
                                   self.max_batch_size,
                                   self.minTreesPerGroupFold,
-                                  self.foldGroupSize,
-                                  self.group_col_idx)
+                                  self.foldGroupSize)
+
         if self.dtype == np.float32:
             fit(handle_[0],
@@ -526,6 +530,7 @@ class RandomForestRegressor(BaseRandomForestModel,
                 self.n_cols,
                 <float*> y_ptr,
                 rf_params,
+                <int*> groups_ptr,
                 self.verbose)
 
         else:
@@ -537,14 +542,30 @@ class RandomForestRegressor(BaseRandomForestModel,
                 self.n_cols,
                 <double*> y_ptr,
                 rf_params64,
+                <int*> groups_ptr,
                 self.verbose)
 
         # make sure that the `fit` is complete before the following delete
         # call happens
         self.handle.sync()
         del X_m
         del y_m
+        del groups_m
         return self
 
+    def get_tree_sample_honesty_group_meta(self, tree_id, row_id):
+        cdef RandomForestMetaData[float, float] *rf_forest = \
+            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+        cdef RandomForestMetaData[double, double] *rf_forest64 = \
+            <RandomForestMetaData[double, double]*><uintptr_t> self.rf_forest64
+
+        if self.dtype == np.float32:
+            return SplitAvgUnusedEnum(get_tree_row_meta_info(tree_id, row_id, rf_forest))
+        else:
+            return SplitAvgUnusedEnum(get_tree_row_meta_info(tree_id, row_id, rf_forest64))
+
     def _predict_model_on_cpu(self, X, convert_dtype) -> CumlArray:
         cdef uintptr_t X_ptr
         X_m, n_rows, n_cols, dtype = \