From c12d63504b30ea501cabf87f2034aac7765a3f1c Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 00:46:35 +0800 Subject: [PATCH 1/8] use BOOST_VERSION to switch the source of progress_display --- src/hashed_model_matrix.cpp | 12 +++++++++--- src/hashed_model_matrix.h | 5 +++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/hashed_model_matrix.cpp b/src/hashed_model_matrix.cpp index a948e87..cf61b7f 100644 --- a/src/hashed_model_matrix.cpp +++ b/src/hashed_model_matrix.cpp @@ -20,6 +20,12 @@ using namespace Rcpp; +#if BOOST_VERSION >= 108400 +typedef boost::timer::progress_display progress_display; +#else +typedef boost::progress_display progress_display; +#endif + template<typename DataFrameLike> NameClassMapping get_class(DataFrameLike data) { Function lapply("lapply"); @@ -206,9 +212,9 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size #ifdef NOISY_DEBUG Rprintf("nrow(data): %d length(converters): %d\n", data.nrows(), converters.size()); #endif - std::shared_ptr<boost::progress_display> pd(NULL); + std::shared_ptr<progress_display> pd(NULL); if (transpose) { - if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout)); + if (progress) pd.reset(new progress_display(data.nrows(), Rcpp::Rcout)); for(auto i = 0;i < data.nrows();i++) { if (progress) ++(*pd); if (is_intercept) { @@ -234,7 +240,7 @@ SEXP hashed_model_matrix(RObject tf, DataFrameLike data, unsigned long hash_size } } else { - if (progress) pd.reset(new boost::progress_display(data.nrows(), Rcpp::Rcout)); + if (progress) pd.reset(new progress_display(data.nrows(), Rcpp::Rcout)); std::map< uint32_t, std::pair< std::vector, std::vector > > cache; if (is_intercept) { std::pair< std::vector, std::vector >& k(cache[0]); diff --git a/src/hashed_model_matrix.h b/src/hashed_model_matrix.h index 59f2f41..ee7f19a 100644 --- a/src/hashed_model_matrix.h +++ b/src/hashed_model_matrix.h @@ -24,7 +24,12 @@ #include #include "hash_function.h" #include "vector_converter.h" +#include <boost/version.hpp> +#if BOOST_VERSION >= 
108400 +#include <boost/timer/progress_display.hpp> +#else #include <boost/progress.hpp> +#endif typedef std::map< std::string, std::string > NameClassMapping; typedef std::vector< std::string > StrVec; From 33e5b30cc792813e54408dafb069e13b43adb285 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 00:48:50 +0800 Subject: [PATCH 2/8] bump version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 53bfcda..7cd789c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FeatureHashing Type: Package Title: Creates a Model Matrix via Feature Hashing with a Formula Interface -Version: 0.9.1.5 +Version: 0.9.1.6 Date: 2019-11-24 Authors@R: c( person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")), From bb5056fd6f530ba8361c7e67a5eccdab0201ae73 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 01:26:11 +0800 Subject: [PATCH 3/8] drop system requirements of C++11 and add the minimum required version of R as 4.0 --- DESCRIPTION | 5 ++--- src/Makevars | 1 - src/Makevars.win | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7cd789c..a5c4ecd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: FeatureHashing Type: Package Title: Creates a Model Matrix via Feature Hashing with a Formula Interface -Version: 0.9.1.6 +Version: 0.9.2 Date: 2019-11-24 Authors@R: c( person("Wush", "Wu", email = "wush978@gmail.com", role = c("aut", "cre")), @@ -16,7 +16,7 @@ Description: Feature hashing, also called as the hashing trick, is a method to t Please see the README in <https://github.com/wush978/FeatureHashing> for more information. 
License: GPL(>= 3) | file LICENSE Depends: - R (>= 3.1), + R (>= 4.0), methods Imports: Rcpp (>= 0.11), @@ -25,7 +25,6 @@ Imports: magrittr (>= 1.5) LinkingTo: Rcpp, digest(>= 0.6.8), BH(>= 1.54.0-1) Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, pROC -SystemRequirements: C++11 BugReports: https://github.com/wush978/FeatureHashing/issues URL: https://github.com/wush978/FeatureHashing VignetteBuilder: knitr diff --git a/src/Makevars b/src/Makevars index a7f3510..e69de29 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +0,0 @@ -CXX_STD = CXX11 diff --git a/src/Makevars.win b/src/Makevars.win index a7f3510..e69de29 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1 +0,0 @@ -CXX_STD = CXX11 From 072fdf56d12e3798dc5cde3551dca77120251bb8 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 01:29:56 +0800 Subject: [PATCH 4/8] roxygenise --- DESCRIPTION | 2 +- NAMESPACE | 2 +- R/hashed.model.matrix.R | 6 +-- man/CSCMatrix-class.Rd | 12 ++--- man/hash.mapping.Rd | 6 +-- man/hash.size.Rd | 12 ++--- man/hashed.model.matrix.Rd | 93 ++++++++++++++++++++------------------ man/intToRaw.Rd | 3 +- man/ipinyou.Rd | 21 +++++---- man/simulate.split.Rd | 11 ++--- man/test.tag.Rd | 13 +++--- 11 files changed, 94 insertions(+), 87 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a5c4ecd..032ac09 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -28,4 +28,4 @@ Suggests: RUnit, glmnet, knitr, xgboost, rmarkdown, pROC BugReports: https://github.com/wush978/FeatureHashing/issues URL: https://github.com/wush978/FeatureHashing VignetteBuilder: knitr -RoxygenNote: 7.2.1 +RoxygenNote: 7.2.3 diff --git a/NAMESPACE b/NAMESPACE index 208e638..2339e75 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,4 +1,4 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(hash.mapping) export(hash.sign) diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 4e75f97..436580e 100644 --- a/R/hashed.model.matrix.R +++ 
b/R/hashed.model.matrix.R @@ -25,7 +25,7 @@ #'@details #'The \code{hashed.model.matrix} hashes the feature during #'the construction of the model matrix. It uses the 32-bit variant of MurmurHash3 -#'\url{https://code.google.com/p/smhasher/wiki/MurmurHash3}. Weinberger +#'\url{https://github.com/aappleby/smhasher}. Weinberger #'et. al. (2009) used two separate hashing function \eqn{h}(\code{hashed.value}) and #'\eqn{\xi}(\code{hash.sign}) to determine the indices and the sign of the values #'respectively. Different seeds are used to implement the hashing function @@ -201,9 +201,9 @@ hashed.model.matrix <- function(formula, data, hash.size = 2^18, transpose = FAL progress = FALSE) { stopifnot(hash.size >= 0) stopifnot(is.data.frame(data)) - stopifnot(class(formula) %in% c("formula", "character")) + stopifnot(inherits(formula, "formula") | inherits(formula, "character")) - if(class(formula) == "character") formula %<>% paste(collapse = " + ") %>% paste("~", .) %>% as.formula + if(inherits(formula, "character")) formula %<>% paste(collapse = " + ") %>% paste("~", .) 
%>% as.formula tf.idf.string <- "type = \"tf-idf\"" diff --git a/man/CSCMatrix-class.Rd b/man/CSCMatrix-class.Rd index 649cc28..1e7b05a 100644 --- a/man/CSCMatrix-class.Rd +++ b/man/CSCMatrix-class.Rd @@ -1,15 +1,15 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/matrix.R \docType{class} \name{CSCMatrix-class} \alias{CSCMatrix-class} +\alias{dim<-,CSCMatrix-method} +\alias{dim,CSCMatrix-method} +\alias{\%*\%,CSCMatrix,numeric-method} +\alias{\%*\%,numeric,CSCMatrix-method} \alias{[,CSCMatrix,missing,numeric,ANY-method} \alias{[,CSCMatrix,numeric,missing,ANY-method} \alias{[,CSCMatrix,numeric,numeric,ANY-method} -\alias{\%*\%,CSCMatrix,numeric-method} -\alias{\%*\%,numeric,CSCMatrix-method} -\alias{dim,CSCMatrix-method} -\alias{dim<-,CSCMatrix-method} \title{CSCMatrix} \description{ The structure of \code{CSCMatrix} is the same @@ -37,6 +37,7 @@ The result of matrix-vector multiplication should be the same. The returned object is a numeric vector. } } + \examples{ # construct a CSCMatrix m <- hashed.model.matrix(~ ., CO2, 8) @@ -46,4 +47,3 @@ m2 <- as(m, "dgCMatrix") \seealso{ \code{\link{dgCMatrix-class}} } - diff --git a/man/hash.mapping.Rd b/man/hash.mapping.Rd index e0196a4..7566cfe 100644 --- a/man/hash.mapping.Rd +++ b/man/hash.mapping.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/hash.mapping.R \name{hash.mapping} \alias{hash.mapping} @@ -16,7 +16,7 @@ a named \code{numeric} vector Extract mapping between hash and original values } \details{ -Generate a mapping between original values and hashes. +Generate a mapping between original values and hashes. Option \code{create.mapping = T} needs to be used in function \code{hashed.model.matrix}. 
@@ -27,8 +27,8 @@ data(ipinyou) m <- hashed.model.matrix(~., ipinyou.train, 2^10, create.mapping = TRUE) mapping <- hash.mapping(m) + } \author{ Michael Benesty } - diff --git a/man/hash.size.Rd b/man/hash.size.Rd index 749477a..19db171 100644 --- a/man/hash.size.Rd +++ b/man/hash.size.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/hash.size.R \name{hash.size} \alias{hash.size} @@ -16,19 +16,19 @@ The hash size of feature hashing as a positive integer. Compute minimum hash size to reduce collision rate } \details{ -To reduce collision rate, the hash size should be +To reduce collision rate, the hash size should be equal or superior to the nearest power of two to the number of unique values in the input \code{data.frame}. The value computed is a theorical minimum hash size. -It just means that in the best situation it may be +It just means that in the best situation it may be possible that all computed hash can be stored with this hash size. Intuitively, if the distribution of hash generated by the algorithm was perfect, when the computed size is used, each permutation of bits of the hash vector would correspond to one unique original value of -your \code{data.frame}. +your \code{data.frame}. Because a bit value is \code{\{0,1\}}, the computed size is \code{2^x} with a \code{x} big enough to have a hash vector containing each @@ -43,7 +43,7 @@ The only known solution to have zero collision is to build a dictionnary of values, and for each new value to hash, check in the dictionnary if the hash value already exists. It is not performant at all. -If you increase the computed size (by multiplying it by \code{2^x}, +If you increase the computed size (by multiplying it by \code{2^x}, it is up to you to choose a \code{x}), you will reduce the collision rate. If you use a value under the computed size, there is a 100% chance of collisions. 
@@ -71,8 +71,8 @@ mat2 <- hashed.model.matrix(~., ipinyou.train, size, create.mapping = TRUE) mapping2 <- hash.mapping(mat2) #Rate of collision mean(duplicated(mapping2)) + } \author{ Michael Benesty } - diff --git a/man/hashed.model.matrix.Rd b/man/hashed.model.matrix.Rd index 93bb621..93f1fd5 100644 --- a/man/hashed.model.matrix.Rd +++ b/man/hashed.model.matrix.Rd @@ -1,15 +1,22 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/hashed.model.matrix.R \name{hashed.model.matrix} -\alias{hash.sign} -\alias{hashed.interaction.value} \alias{hashed.model.matrix} \alias{hashed.value} +\alias{hash.sign} +\alias{hashed.interaction.value} \title{Create a model matrix with feature hashing} \usage{ -hashed.model.matrix(formula, data, hash.size = 2^18, transpose = FALSE, - create.mapping = FALSE, is.dgCMatrix = TRUE, signed.hash = FALSE, - progress = FALSE) +hashed.model.matrix( + formula, + data, + hash.size = 2^18, + transpose = FALSE, + create.mapping = FALSE, + is.dgCMatrix = TRUE, + signed.hash = FALSE, + progress = FALSE +) } \arguments{ \item{formula}{\code{formula} or a \code{character} vector of column names (will be expanded to a \code{formula})} @@ -21,8 +28,8 @@ hashed.model.matrix(formula, data, hash.size = 2^18, transpose = FALSE, \item{transpose}{logical value. Indicating if the transpose should be returned. It affects the space of the returned object when the dimension is imbalanced. Please see the details.} -\item{create.mapping}{logical value. The indicator of whether storing the hash mapping or not. -The mapping might miss some interaction terms which involves \code{split}ed features. +\item{create.mapping}{logical value. The indicator of whether storing the hash mapping or not. +The mapping might miss some interaction terms which involves \code{split}ed features. Please see the details.} \item{is.dgCMatrix}{logical value. 
Indicating if the result is \code{dgCMatrix} or \code{CSCMatrix}} @@ -37,18 +44,18 @@ Create a model matrix with feature hashing } \details{ The \code{hashed.model.matrix} hashes the feature during -the construction of the model matrix. It uses the 32-bit variant of MurmurHash3 -\url{https://code.google.com/p/smhasher/wiki/MurmurHash3}. Weinberger -et. al. (2009) used two separate hashing function \eqn{h}(\code{hashed.value}) and +the construction of the model matrix. It uses the 32-bit variant of MurmurHash3 +\url{https://github.com/aappleby/smhasher}. Weinberger +et. al. (2009) used two separate hashing function \eqn{h}(\code{hashed.value}) and \eqn{\xi}(\code{hash.sign}) to determine the indices and the sign of the values -respectively. Different seeds are used to implement the hashing function +respectively. Different seeds are used to implement the hashing function \eqn{h} and \eqn{\xi} with MurmurHash3. The formula is parsed via \code{\link{terms.formula}} with "split" as special -keyword. The interaction term is hashed (the reader can try to expl)in different ways. Please see example for +keyword. The interaction term is hashed (the reader can try to expl)in different ways. Please see example for the detailed implementation. We provide a helper function: \code{\link{hashed.interaction.value}} to show show the index after interaction. The "\code{split}" is used to expand the concatenated feature -such as "10129,10024,13866,10111,10146,10120,10115,10063" which represents the occurrence of +such as "10129,10024,13866,10111,10146,10120,10115,10063" which represents the occurrence of multiple categorical variable: "10129", "10024", "13866", "10111", "10146", "10120", "10115", and "10063". The \code{hashed.model.matrix} will expand the concatenated feature and produce the related model matrix. @@ -65,21 +72,21 @@ The user could explore the behavior via function \code{\link{simulate.split}}. 
The argument \code{transpose} affects the size of the returned object in the following way. For a \eqn{m \times n} matrix with \eqn{k} non-zero elements, the returned \code{dgCMatrix} requires -\eqn{O(n) + O(k)} space. For details, please check the documentation of +\eqn{O(n) + O(k)} space. For details, please check the documentation of the \code{\link{dgCMatrix-class}}. Note that the \code{rownames} of the returned \code{dgCMatrix} is \code{character(0)} so the space complexity does not contain the term \eqn{O(m)}. The \code{mapping} created by enabling \code{create.mapping} might miss the interaction term which involves \code{split}ed features. For example, suppose there are two columns \code{a} and \code{b} -while the value are 1 and 1,2,3 respectively. The user marks the column \code{b} with -\code{split}. If the hashed value of \code{b1} and \code{b2} are collided, then the interaction -\code{a1:b1} will not appear in the returned mapping table. Because this package is originally -designed for predictive analysis and the mapping should not play an -important role of predictive analysis. If you have a test case and want to ask us to fix this, +while the value are 1 and 1,2,3 respectively. The user marks the column \code{b} with +\code{split}. If the hashed value of \code{b1} and \code{b2} are collided, then the interaction +\code{a1:b1} will not appear in the returned mapping table. Because this package is originally +designed for predictive analysis and the mapping should not play an +important role of predictive analysis. If you have a test case and want to ask us to fix this, please provide us a test case in \url{https://github.com/wush978/FeatureHashing/issues/67}. 
} \examples{ -# The following scripts show how to fit a logistic regression +# The following scripts show how to fit a logistic regression # after feature hashing \dontrun{ data(ipinyou) @@ -103,8 +110,8 @@ auc(ipinyou.test$IsClick, p.lr) ## Per-Coordinate FTRL-Proximal with $L_1$ and $L_2$ Regularization for Logistic Regression -# The following scripts use an implementation of the FTRL-Proximal for Logistic Regresion, -# which is published in McMahan, Holt and Sculley et al. (2013), to predict the probability +# The following scripts use an implementation of the FTRL-Proximal for Logistic Regresion, +# which is published in McMahan, Holt and Sculley et al. (2013), to predict the probability # (1-step prediction) and update the model simultaneously. @@ -114,17 +121,17 @@ ftprl <- initialize.ftprl(0.1, 1, 0.1, 0.1, 2^16) ftprl <- update.ftprl(ftprl, m.train, ipinyou.train$IsClick, predict = TRUE) auc(ipinyou.train$IsClick, attr(ftprl, "predict")) -# If we use the same algorithm to predict the click through rate of the 3rd season of iPinYou, -# the overall AUC will be 0.77 which is comparable to the overall AUC of the +# If we use the same algorithm to predict the click through rate of the 3rd season of iPinYou, +# the overall AUC will be 0.77 which is comparable to the overall AUC of the # 3rd season 0.76 reported in Zhang, Yuan, Wang, et al. (2014). } # The following scripts show the implementation of the FeatureHashing. 
# Below the original values will be project in a space of 2^6 dimensions -m <- hashed.model.matrix(~ ., CO2, 2^6, create.mapping = TRUE, +m <- hashed.model.matrix(~ ., CO2, 2^6, create.mapping = TRUE, transpose = TRUE, is.dgCMatrix = FALSE) - + # Print the matrix via dgCMatrix as(m, "dgCMatrix") @@ -132,12 +139,12 @@ as(m, "dgCMatrix") mapping <- hash.mapping(m) # To check the rate of collisions, we will extract the indices of the hash -# values through the modulo-division method, count how many duplicates +# values through the modulo-division method, count how many duplicates # we have (in best case it should be zero) and perform a mean. mean(duplicated(mapping)) -# The type of the result produced by the function `hashed.model.matrix` -# is a CSCMatrix. It supports simple subsetting +# The type of the result produced by the function `hashed.model.matrix` +# is a CSCMatrix. It supports simple subsetting # and matrix-vector multiplication rnorm(2^6) \%*\% m @@ -146,9 +153,9 @@ rnorm(2^6) \%*\% m # Below we will apply this function to the feature names vectHash <- hashed.value(names(mapping)) -# Now we will check that the result is the same than the one got with +# Now we will check that the result is the same than the one got with # the more generation `hashed.model.matrix` function. -# We will use the Modulo-division method (that's the [\%\% 2^6] below) +# We will use the Modulo-division method (that's the [\%\% 2^6] below) # to find the address in hash table easily. stopifnot(all(vectHash \%\% 2^6 + 1 == mapping)) @@ -156,25 +163,25 @@ stopifnot(all(vectHash \%\% 2^6 + 1 == mapping)) hash.sign(names(mapping)) ## The interaction term is implemented as follow: -m2 <- hashed.model.matrix(~ .^2, CO2, 2^6, create.mapping = TRUE, +m2 <- hashed.model.matrix(~ .^2, CO2, 2^6, create.mapping = TRUE, transpose = TRUE, is.dgCMatrix = FALSE) -# The ^ operator indicates crossing to the specified degree. 
-# For example (a+b+c)^2 is identical to (a+b+c)*(a+b+c) +# The ^ operator indicates crossing to the specified degree. +# For example (a+b+c)^2 is identical to (a+b+c)*(a+b+c) # which in turn expands to a formula containing the main effects -# for a, b and c together with their second-order interactions. - +# for a, b and c together with their second-order interactions. + # Extract the mapping mapping2 <- hash.mapping(m2) -# Get the hash of combination of two items, PlantQn2 and uptake -mapping2["PlantQn2:uptake"] +# Get the hash of combination of two items, PlantQn2 and uptake +mapping2["PlantQn2:uptake"] # Extract hash of each item h1 <- hashed.value("PlantQn2") h2 <- hashed.value("uptake") # Computation of hash of both items combined -h3 <- hashed.value(rawToChar(c(intToRaw(h1), intToRaw(h2)))) +h3 <- hashed.value(rawToChar(c(intToRaw(h1), intToRaw(h2)))) stopifnot(h3 \%\% 2^6 + 1 == mapping2["PlantQn2:uptake"]) # The concatenated feature, i.e. the array type in hive @@ -185,6 +192,7 @@ m <- hashed.model.matrix(~ split(a, delim = ",", type = "existence"):b, df, 2^6, # The column `a` is splitted by "," and have an interaction with "b": mapping <- hash.mapping(m) names(mapping) + } \references{ H. B. McMahan, G. Holt, D. Sculley, et al. "Ad click @@ -196,12 +204,11 @@ J. He, R. L. Grossman and R. Uthurusamy. ACM, 2013, pp. 1222-1230. DOI: 10.1145/2487575.2488200. . -Kilian Q. Weinberger, Anirban Dasgupta, John Langford, -Alexander J. Smola, and Josh Attenberg. ICML, volume 382 of ACM +Kilian Q. Weinberger, Anirban Dasgupta, John Langford, +Alexander J. Smola, and Josh Attenberg. ICML, volume 382 of ACM International Conference Proceeding Series, page 140. ACM, (2009) W. Zhang, S. Yuan, J. Wang, et al. "Real-Time Bidding Benchmarking with iPinYou Dataset". In: _arXiv preprint arXiv:1407.7073_ (2014). 
} - diff --git a/man/intToRaw.Rd b/man/intToRaw.Rd index c1c275f..c821bcc 100644 --- a/man/intToRaw.Rd +++ b/man/intToRaw.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R \name{intToRaw} \alias{intToRaw} @@ -15,4 +15,3 @@ raw vector with length 4 \description{ Convert the integer to raw vector with endian correction } - diff --git a/man/ipinyou.Rd b/man/ipinyou.Rd index 47eaed3..1895add 100644 --- a/man/ipinyou.Rd +++ b/man/ipinyou.Rd @@ -1,20 +1,22 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/zzz.R \name{ipinyou} \alias{ipinyou} -\alias{ipinyou.test} \alias{ipinyou.train} +\alias{ipinyou.test} \title{iPinYou Real-Time Bidding Dataset for Computational Advertising Research} -\format{The column name of the data is the description of the data in Zhang, Yuan, Wang, et al. (2014). -Most of the columns should be clearly described by their column names. +\format{ +The column name of the data is the description of the data in Zhang, Yuan, Wang, et al. (2014). +Most of the columns should be clearly described by their column names. For the details of the dataset, please read the Zhang, Yuan, Wang, et al. (2014). \code{BidID}, the id of the RTB which is the unique identifier of the events. \code{Adid}, the advertiser id. -\code{UserTag}, the user tags (segments) in iPinYou's proprietary audience database. -This is also a real example of the concatenated feature.} +\code{UserTag}, the user tags (segments) in iPinYou's proprietary audience database. +This is also a real example of the concatenated feature. +} \source{ \url{http://data.computational-advertising.org/} } @@ -23,12 +25,11 @@ data(ipinyou) } \description{ This is a sample from the iPinYou Real-Time Bidding dataset. 
-The data.frame named \code{ipinyou.train} is a sample from the data of 2013-10-19 and +The data.frame named \code{ipinyou.train} is a sample from the data of 2013-10-19 and the data.frame named \code{ipinyou.test} is a sample from the data of 2013-10-20. } \references{ -W. Zhang, S. Yuan, J. Wang, et al. -"Real-Time Bidding Benchmarking with iPinYou Dataset". +W. Zhang, S. Yuan, J. Wang, et al. +"Real-Time Bidding Benchmarking with iPinYou Dataset". In: arXiv preprint arXiv:1407.7073 (2014). } - diff --git a/man/simulate.split.Rd b/man/simulate.split.Rd index d2d3e91..7906c73 100644 --- a/man/simulate.split.Rd +++ b/man/simulate.split.Rd @@ -1,8 +1,8 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/simulate.split.R \name{simulate.split} \alias{simulate.split} -\title{Simulate how \code{split} work in \code{hashed.model.matrix} to split the string into +\title{Simulate how \code{split} work in \code{hashed.model.matrix} to split the string into tokens} \usage{ simulate.split(x, delim = ",", type = c("existence", "count")) @@ -12,15 +12,14 @@ simulate.split(x, delim = ",", type = c("existence", "count")) \item{delim}{character value. The string to use for splitting.} -\item{type}{character value. Either "\code{count}" or "\code{existence}". -"\code{count}" indicates the number of occurrence of the token. +\item{type}{character value. Either "\code{count}" or "\code{existence}". +"\code{count}" indicates the number of occurrence of the token. "\code{existence}" indicates the boolean that whether the token exist or not.} } \value{ integer vector for \code{type = "count"} and logical vector for \code{type = "existence"}. 
} \description{ -Simulate how \code{split} work in \code{hashed.model.matrix} to split the string into +Simulate how \code{split} work in \code{hashed.model.matrix} to split the string into tokens } - diff --git a/man/test.tag.Rd b/man/test.tag.Rd index be2d0d4..f329f8f 100644 --- a/man/test.tag.Rd +++ b/man/test.tag.Rd @@ -1,14 +1,15 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/zzz.R \name{test.tag} \alias{test.tag} \title{test.tag} -\format{For each element, the string represents the occurrence +\format{ +For each element, the string represents the occurrence of different tags. For example, the string "1,27,19,25,tp,tw" -of the first instance represents that the feature `1` is TRUE, the feature `27` is -TRUE, et. al. On the contrary, the missing feature such as `2` -is FALSE.} +of the first instance represents that the feature `1` is TRUE, the feature `27` is +TRUE, et. al. On the contrary, the missing feature such as `2` +is FALSE. +} \description{ This is a vector to demo the concatenated feature. 
} - From b9d88b56393940e8bf59d15484cda903557a3c40 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 01:35:34 +0800 Subject: [PATCH 5/8] update urls --- README.Rmd | 8 ++++---- README.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.Rmd b/README.Rmd index 734afbe..efcb9d7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -6,14 +6,14 @@ output: FeatureHashing ============== -Linux: [![Travis-ci Status](https://travis-ci.org/wush978/FeatureHashing.svg?branch=master)](https://travis-ci.org/wush978/FeatureHashing) +Linux: [![Travis-ci Status](https://app.travis-ci.com/wush978/FeatureHashing.svg?branch=master)](https://app.travis-ci.com/wush978/FeatureHashing) Win : [![Build status](https://ci.appveyor.com/api/projects/status/bm4lpxn5f07d8klj/branch/master?svg=true)](https://ci.appveyor.com/project/wush978/featurehashing/branch/master) -OS X: [![Travis-ci Status](https://travis-ci.org/wush978/FeatureHashing.svg?branch=osx)](https://travis-ci.org/wush978/FeatureHashing) +OS X: [![Travis-ci Status](https://app.travis-ci.com/wush978/FeatureHashing.svg?branch=osx)](https://app.travis-ci.com/wush978/FeatureHashing) -[![Coverage Status](https://img.shields.io/coveralls/wush978/FeatureHashing.svg)](https://coveralls.io/r/wush978/FeatureHashing?branch=master) +[![Coverage Status](https://img.shields.io/coveralls/wush978/FeatureHashing.svg)](https://coveralls.io/github/wush978/FeatureHashing) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/FeatureHashing)](https://cran.r-project.org/package=FeatureHashing/) -[![rstudio mirror downloads](https://cranlogs.r-pkg.org/badges/FeatureHashing)](https://github.com/metacran/cranlogs.app) +[![rstudio mirror downloads](https://cranlogs.r-pkg.org/badges/FeatureHashing)](https://github.com/r-hub/cranlogs.app) Implement feature hashing with R diff --git a/README.md b/README.md index 5edc3d5..a94407d 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,14 @@ output: FeatureHashing ============== 
-Linux: [![Travis-ci Status](https://travis-ci.org/wush978/FeatureHashing.svg?branch=master)](https://travis-ci.org/wush978/FeatureHashing) +Linux: [![Travis-ci Status](https://app.travis-ci.com/wush978/FeatureHashing.svg?branch=master)](https://app.travis-ci.com/wush978/FeatureHashing) Win : [![Build status](https://ci.appveyor.com/api/projects/status/bm4lpxn5f07d8klj/branch/master?svg=true)](https://ci.appveyor.com/project/wush978/featurehashing/branch/master) -OS X: [![Travis-ci Status](https://travis-ci.org/wush978/FeatureHashing.svg?branch=osx)](https://travis-ci.org/wush978/FeatureHashing) +OS X: [![Travis-ci Status](https://app.travis-ci.com/wush978/FeatureHashing.svg?branch=osx)](https://app.travis-ci.com/wush978/FeatureHashing) -[![Coverage Status](https://img.shields.io/coveralls/wush978/FeatureHashing.svg)](https://coveralls.io/r/wush978/FeatureHashing?branch=master) +[![Coverage Status](https://img.shields.io/coveralls/wush978/FeatureHashing.svg)](https://coveralls.io/github/wush978/FeatureHashing) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/FeatureHashing)](https://cran.r-project.org/package=FeatureHashing/) -[![rstudio mirror downloads](https://cranlogs.r-pkg.org/badges/FeatureHashing)](https://github.com/metacran/cranlogs.app) +[![rstudio mirror downloads](https://cranlogs.r-pkg.org/badges/FeatureHashing)](https://github.com/r-hub/cranlogs.app) Implement feature hashing with R From d645121ae816a18ecea7baf1bbfdd8c43b31dbec Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 01:38:39 +0800 Subject: [PATCH 6/8] bump date --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 032ac09..bcf25b7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: FeatureHashing Type: Package Title: Creates a Model Matrix via Feature Hashing with a Formula Interface Version: 0.9.2 -Date: 2019-11-24 +Date: 2024-01-10 Authors@R: c( person("Wush", "Wu", email = 
"wush978@gmail.com", role = c("aut", "cre")), person("Michael", "Benesty", email = "michael@benesty.fr", role = c("aut", "ctb"))) From c1ae4fafee928dc22ed4fe81eab2458ed73515f9 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 01:43:46 +0800 Subject: [PATCH 7/8] remove empty files --- src/Makevars | 0 src/Makevars.win | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/Makevars delete mode 100644 src/Makevars.win diff --git a/src/Makevars b/src/Makevars deleted file mode 100644 index e69de29..0000000 diff --git a/src/Makevars.win b/src/Makevars.win deleted file mode 100644 index e69de29..0000000 From 4d6ee1bf4128b4b8390993fc7bf2decee4aca770 Mon Sep 17 00:00:00 2001 From: Wush Wu Date: Wed, 10 Jan 2024 21:50:19 +0800 Subject: [PATCH 8/8] move url --- R/hashed.model.matrix.R | 2 +- man/hashed.model.matrix.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/hashed.model.matrix.R b/R/hashed.model.matrix.R index 436580e..4b91a9d 100644 --- a/R/hashed.model.matrix.R +++ b/R/hashed.model.matrix.R @@ -73,7 +73,7 @@ #'Dhillon, Y. Koren, R. Ghani, T. E. Senator, P. Bradley, R. Parekh, #'J. He, R. L. Grossman and R. Uthurusamy. ACM, 2013, pp. 1222-1230. #'DOI: 10.1145/2487575.2488200. . +#'\url{https://doi.acm.org/10.1145/2487575.2488200}>. #' #'Kilian Q. Weinberger, Anirban Dasgupta, John Langford, #'Alexander J. Smola, and Josh Attenberg. ICML, volume 382 of ACM diff --git a/man/hashed.model.matrix.Rd b/man/hashed.model.matrix.Rd index 93f1fd5..9b07b97 100644 --- a/man/hashed.model.matrix.Rd +++ b/man/hashed.model.matrix.Rd @@ -202,7 +202,7 @@ KDD 2013, Chicago, IL, USA, August 11-14, 2013_. Ed. by I. S. Dhillon, Y. Koren, R. Ghani, T. E. Senator, P. Bradley, R. Parekh, J. He, R. L. Grossman and R. Uthurusamy. ACM, 2013, pp. 1222-1230. DOI: 10.1145/2487575.2488200. . +\url{https://doi.acm.org/10.1145/2487575.2488200}>. Kilian Q. Weinberger, Anirban Dasgupta, John Langford, Alexander J. Smola, and Josh Attenberg. 
ICML, volume 382 of ACM