From 3243d24e125a8032f2ec32a2df9708b7bc148da1 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Thu, 16 Feb 2017 10:15:47 -0800 Subject: [PATCH 01/18] Tentative working SMF --- scripts/testsfa_v2.ssc | 53 ++++++++++ src/main/scala/BIDMach/models/SFA.scala | 69 ++----------- src/main/scala/BIDMach/models/SMF.scala | 125 +++++++++++++++++------- 3 files changed, 150 insertions(+), 97 deletions(-) create mode 100755 scripts/testsfa_v2.ssc diff --git a/scripts/testsfa_v2.ssc b/scripts/testsfa_v2.ssc new file mode 100755 index 00000000..73a3e6b7 --- /dev/null +++ b/scripts/testsfa_v2.ssc @@ -0,0 +1,53 @@ +:silent +import BIDMach.models.SMF + +/** + * Test SMF code on netflix data. + */ + +val dir = "/data/netflix/" +val a = loadSMat(dir+"newtrain.smat.lz4") +val ta = loadSMat(dir+"newtest.smat.lz4") +val d = 256 + +val (nn,opts) = SMF.learner1(a, d) + +// Daniel Seita: I'm not sure if these values are good for netflix. +opts.batchSize = 1000 +opts.uiter = 2 +// for acceptance = 1 +opts.urate = 0.01f +opts.lrate = 0.01f +// for computed acceptance +opts.urate = 0.1f +opts.lrate = 0.1 +opts.npasses = 1 + +val lambda = 4f +opts.lambdau = lambda; +opts.regumean = lambda; +opts.lambdam = lambda / 500000 * 20; +opts.regmmean = opts.lambdam +opts.evalStep = 31 +opts.doUsers = false +opts.lsgd = 0.010f +opts.what +nn.train + +val model = nn.model.asInstanceOf[SMF] +val xa = (ta != 0) +val (mm, mopts) = SMF.predictor1(model, a, xa); +mopts.batchSize = 10000 +mopts.uiter = 6 +mopts.urate = opts.urate +mopts.lsgd = 0.0f +mm.predict + +val pa = SMat(mm.preds(1)); +min(pa.contents,5,pa.contents) +max(pa.contents,1,pa.contents) +val diff = ta.contents - pa.contents +val rmse = sqrt((diff ^* diff) / diff.length) +println("rmse = %f" format rmse.v); + +sys.exit diff --git a/src/main/scala/BIDMach/models/SFA.scala b/src/main/scala/BIDMach/models/SFA.scala index ff7b65a0..055fbec4 100755 --- a/src/main/scala/BIDMach/models/SFA.scala +++ b/src/main/scala/BIDMach/models/SFA.scala @@ -216,6 +216,10 @@ class SFA(override val opts:SFA.Opts = new SFA.Options) extends FactorModel(opts Minv <-- inv(50f/nfeats*FMat(mm *^ mm) + opts.lambdau * diagM); } + /** + * The evalfun normally called during training. Returns -RMSE on training + * data minibatch (sdata). + */ def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val preds = DDS(mm, user, sdata) + (iavg + avg); if (ogmats != null) { @@ -230,6 +234,10 @@ class SFA(override val opts:SFA.Opts = new SFA.Options) extends FactorModel(opts -sqrt(row(vv/sdata.nnz)) } + /** + * The evalfun normally called during testing and predicting. Returns -RMSE + * on training data minibatch (sdata). 
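+   * (Concretely, the return value below is -sqrt(((dc - pc) ddot (dc - pc)) / sdata.nnz),
+   * where dc and pc are the nonzero contents of sdata and of the model's predictions
+   * at those same positions.)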
+ */ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { val spreds = DDS(mm, user, sdata) + (iavg + avg); val dc = sdata.contents; @@ -267,25 +275,6 @@ object SFA { } class Options extends Opts {} - def learner(mat0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with Grad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = -1 - opts.npasses = 4 - opts.lrate = 0.1 - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0:Mat), opts), - new SFA(opts), - null, - new Grad(opts), - null, - opts) - (nn, opts) - } - def learnerX(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts @@ -306,25 +295,6 @@ object SFA { (nn, opts) } - def learner(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with Grad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SFA(opts), - null, - new Grad(opts), - null, - opts) - (nn, opts) - } - def learnerX(mat0:Mat, user0:Mat, d:Int) = { class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts @@ -345,28 +315,9 @@ object SFA { (nn, opts) } - def learnerY(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SFA.Opts with MatSource.Opts with ADAGrad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SFA(opts), - null, - new ADAGrad(opts), - null, - opts) - (nn, opts) - } - - class PredOpts extends Learner.Options with SFA.Opts with MatSource.Opts with MatSink.Opts + class PredOpts extends Learner.Options with SFA.Opts with MatSource.Opts with MatSink.Opts - def predictor(model0:Model, mat1:Mat, preds:Mat) = { + def predictor(model0:Model, mat1:Mat, preds:Mat) = { val model = model0.asInstanceOf[SFA] val nopts = new PredOpts; nopts.batchSize = math.min(10000, mat1.ncols/30 + 1) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 4d8657e5..7294ca58 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -5,6 +5,7 @@ import BIDMat.MatFunctions._ import BIDMat.SciFunctions._ import BIDMat.Solvers._ import BIDMach.datasources._ +import BIDMach.datasinks._ import BIDMach.updaters._ import BIDMach.Learner @@ -232,15 +233,46 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts } } + /** + * The evalfun normally called during training. Returns -RMSE on training + * data minibatch (sdata). + */ def evalfun(sdata0:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val sdata = sdata0 - (iavg + avg); val preds = DDS(mm, user, sdata); + if (ogmats != null) { + ogmats(0) = user; + if (ogmats.length > 1) { + ogmats(1) = preds; + } + } val dc = sdata.contents val pc = preds.contents val diff = dc - pc; val vv = diff ddot diff; -sqrt(row(vv/sdata.nnz)) } + + /** + * The evalfun normally called during testing and predicting. Returns -RMSE + * on training data minibatch (sdata). 
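+   * Predictions at the nonzeros of `preds` are also computed and exported via
+   * ogmats(1), which is what becomes accessible externally as the sink's preds(1).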
+ */ + override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { + val spreds = DDS(mm, user, sdata) + (iavg + avg); + val dc = sdata.contents; + val pc = spreds.contents; + val vv = (dc - pc) ddot (dc - pc); + val xpreds = DDS(mm, user, preds) + (iavg + avg); + if (ogmats != null) { + ogmats(0) = user; + if (ogmats.length > 1) { + ogmats(1) = xpreds; + } + } + preds.contents <-- xpreds.contents; + -sqrt(row(vv/sdata.nnz)) + } + } object SMF { @@ -282,24 +314,25 @@ object SMF { opts) (nn, opts) } - - def learnerX(mat0:Mat, d:Int) = { + + /** Learner with single (training data) matrix as datasource, and ADAGrad Opts. */ + def learner1(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts - opts.dim = d + opts.dim = d opts.putBack = -1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1); - opts.aopts = opts; - val nn = new Learner( - new MatSource(Array(mat0:Mat), opts), - new SMF(opts), - null, - null, - null, - opts); + opts.npasses = 4 + opts.lrate = 0.1 + opts.initUval = 0f; + opts.batchSize = math.min(100000, mat0.ncols/30 + 1) + opts.aopts = opts + val nn = new Learner( + new MatSource(Array(mat0:Mat), opts), + new SMF(opts), + null, + new ADAGrad(opts), + null, + opts) (nn, opts) } @@ -322,27 +355,7 @@ object SMF { (nn, opts) } - def learnerX(mat0:Mat, user0:Mat, d:Int) = { - class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts - val opts = new xopts - opts.dim = d - opts.putBack = 1 - opts.npasses = 4 - opts.lrate = 0.1; - opts.initUval = 0f; - opts.batchSize = math.min(100000, mat0.ncols/30 + 1); - opts.aopts = opts; - val nn = new Learner( - new MatSource(Array(mat0, user0), opts), - new SMF(opts), - null, - null, - null, - opts) - (nn, opts) - } - - def predictor(model0:Model, mat1:Mat, preds:Mat) = { + def predictor(model0:Model, mat1:Mat, preds:Mat) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with Grad.Opts val model = model0.asInstanceOf[SMF] val nopts = new xopts; @@ -369,6 +382,42 @@ object SMF { nopts) (nn, nopts) } -} - + + /** A class for one of the SMF predictors. */ + class PredOpts extends Learner.Options with SMF.Opts with MatSource.Opts with MatSink.Opts with Grad.Opts with ADAGrad.Opts + /** + * A predictor which will store the predictions in the predictor model + * matrices. It forms an empty matrix to be populated by the `user` matrices, + * which turns into the second factor matrix. It mirrors an SFA predictor code + * which also forms this empty matrix into the matrix datasource, with the + * only difference being the lack of an Minv option for `newmod`. 
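+   *
+   * Typical use, as in scripts/testsfa_v2.ssc from this same patch: build it with
+   * SMF.predictor1(model, train, test != 0), call mm.predict, then read the
+   * predictions back with SMat(mm.preds(1)).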
+ */ + def predictor1(model0:Model, mat1:Mat, preds:Mat) = { + val model = model0.asInstanceOf[SMF] + val nopts = new PredOpts; + nopts.batchSize = math.min(10000, mat1.ncols/30 + 1) + nopts.putBack = -1 + val newmod = new SMF(nopts); + newmod.refresh = false + newmod.copyFrom(model); + val mopts = model.opts.asInstanceOf[SMF.Opts]; + nopts.dim = mopts.dim; + nopts.uconvg = mopts.uconvg; + nopts.miter = mopts.miter; + nopts.lambdau = mopts.lambdau; + nopts.lambdam = mopts.lambdam; + nopts.regumean = mopts.regumean; + nopts.doUsers = mopts.doUsers; + nopts.weightByUser = mopts.weightByUser; + nopts.nmats = 2; + val nn = new Learner( + new MatSource(Array(mat1, zeros(mopts.dim, mat1.ncols), preds), nopts), + newmod, + null, + null, + new MatSink(nopts), + nopts) + (nn, nopts) + } +} \ No newline at end of file From 53051bc2910cdd769957446624bcaa2d1f223b3e Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Thu, 16 Feb 2017 10:17:52 -0800 Subject: [PATCH 02/18] Renamed file --- scripts/{testsfa_v2.ssc => testsmf_v2.ssc} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{testsfa_v2.ssc => testsmf_v2.ssc} (100%) diff --git a/scripts/testsfa_v2.ssc b/scripts/testsmf_v2.ssc similarity index 100% rename from scripts/testsfa_v2.ssc rename to scripts/testsmf_v2.ssc From bb5ebe3d33cc55204165d81433c051c445043d52 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 8 Mar 2017 09:15:00 -0800 Subject: [PATCH 03/18] I think the MH test is working with SMF w/random walk proposer. Now the next step is to figure out how to use ADAGrad. A couple of points: (1) this will assume that sigma^2 can be computed in one minibatch, (2) this will assume IID components in the matrix, which is clearly violated with, say, one user's column, (3) results are odd, I don't know why RMSE is rougly 0.7 on the training when there's barely any learning signal, (4) RMSE on test set oddly increases, 0.91 to 1 as you increase the MB size. I'm still lost on this. =( --- ..._v2.ssc => daniel_smf_netflix_adagrad.ssc} | 39 +++++----- scripts/daniel_smf_netflix_mhtest.ssc | 72 +++++++++++++++++++ src/main/scala/BIDMach/Learner.scala | 8 ++- src/main/scala/BIDMach/models/SMF.scala | 55 +++++++++++--- src/main/scala/BIDMach/updaters/MHTest.scala | 31 +++++--- 5 files changed, 164 insertions(+), 41 deletions(-) rename scripts/{testsmf_v2.ssc => daniel_smf_netflix_adagrad.ssc} (51%) create mode 100755 scripts/daniel_smf_netflix_mhtest.ssc diff --git a/scripts/testsmf_v2.ssc b/scripts/daniel_smf_netflix_adagrad.ssc similarity index 51% rename from scripts/testsmf_v2.ssc rename to scripts/daniel_smf_netflix_adagrad.ssc index 73a3e6b7..487acf2b 100755 --- a/scripts/testsmf_v2.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -2,31 +2,31 @@ import BIDMach.models.SMF /** - * Test SMF code on netflix data. + * Test SMF code on netflix data. This will use default ADAGrad, which already + * gets roughly 0.845 RMSE so I assume we have to beat that. We'll also need to + * run this with different random seeds. */ +// Get random seed set up. +// TODO random seed code + +// Now get back to the real netflix data. First, load data and set things up: val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 - val (nn,opts) = SMF.learner1(a, d) // Daniel Seita: I'm not sure if these values are good for netflix. 
-opts.batchSize = 1000 -opts.uiter = 2 -// for acceptance = 1 -opts.urate = 0.01f -opts.lrate = 0.01f -// for computed acceptance -opts.urate = 0.1f -opts.lrate = 0.1 -opts.npasses = 1 - +opts.batchSize = 2000 +opts.uiter = 5 +opts.urate = 0.05f +opts.lrate = 0.05f +opts.npasses = 2 val lambda = 4f -opts.lambdau = lambda; -opts.regumean = lambda; -opts.lambdam = lambda / 500000 * 20; +opts.lambdau = lambda +opts.regumean = lambda +opts.lambdam = lambda / 500000 * 20 opts.regmmean = opts.lambdam opts.evalStep = 31 opts.doUsers = false @@ -36,9 +36,9 @@ nn.train val model = nn.model.asInstanceOf[SMF] val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa); +val (mm, mopts) = SMF.predictor1(model, a, xa) mopts.batchSize = 10000 -mopts.uiter = 6 +mopts.uiter = 5 mopts.urate = opts.urate mopts.lsgd = 0.0f mm.predict @@ -48,6 +48,5 @@ min(pa.contents,5,pa.contents) max(pa.contents,1,pa.contents) val diff = ta.contents - pa.contents val rmse = sqrt((diff ^* diff) / diff.length) -println("rmse = %f" format rmse.v); - -sys.exit +println("rmse = %f" format rmse.v) +//sys.exit diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc new file mode 100755 index 00000000..5656ef0e --- /dev/null +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -0,0 +1,72 @@ +:silent +import BIDMach.models.SMF + +/** + * Test SMF code on netflix data. This will use OUR MHTest updater, which I put + * in as a new updater (SMF.learner2) to make this script more concise. + */ + +// Get random seed set up. +// TODO random seed code + +// Now get back to the real netflix data. First, load data and set things up: +val dir = "/data/netflix/" +val a = loadSMat(dir+"newtrain.smat.lz4") +val ta = loadSMat(dir+"newtest.smat.lz4") +val d = 256 +val (nn,opts) = SMF.learner2(a, d) +println("size(a)="+size(a)+", with a.nnz="+a.nnz) + +// Daniel Seita: stuff for the MH Test updater. OH ... and our N is going to be +// super-large. Ahhhh... we may need temperature then. +opts.smf = true // IMPORTANT, this affects some of the code. +opts.N = a.nnz +opts.temp = a.nnz / 1000 +opts.Nknown = true +opts.n2lsigma = 1.0f +opts.nn2l = 4000 +opts.sigmaProposer = 0.01f +opts.continueDespiteFull = false +opts.verboseMH = false +opts.collectData = false +opts.collectDataDir = "tmp/" +opts.exitTheta = false +opts.initThetaHere = false +opts.burnIn = -1 + +// For the SMF +opts.matrixOfScores = true + +// Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" +// due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. 
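+// (Rough arithmetic for the netflix matrix loaded above: a.nnz / ncols is about
+// 90430138 / 480189, i.e. ~188 nonzero ratings per column, so 1000 columns is
+// roughly 190k ratings per minibatch.)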
+opts.batchSize = 1000 +opts.uiter = 5 +opts.urate = 0.05f +opts.lrate = 0.05f +opts.npasses = 3 +val lambda = 4f +opts.lambdau = lambda +opts.regumean = lambda +opts.lambdam = lambda / 500000 * 20 +opts.regmmean = opts.lambdam +opts.evalStep = 31 +opts.doUsers = false +opts.lsgd = 0.010f +opts.what +nn.train + +val model = nn.model.asInstanceOf[SMF] +val xa = (ta != 0) +val (mm, mopts) = SMF.predictor1(model, a, xa) +mopts.batchSize = 10000 +mopts.uiter = 5 +mopts.urate = opts.urate +mopts.lsgd = 0.0f +mm.predict + +val pa = SMat(mm.preds(1)); +min(pa.contents,5,pa.contents) +max(pa.contents,1,pa.contents) +val diff = ta.contents - pa.contents +val rmse = sqrt((diff ^* diff) / diff.length) +println("rmse = %f" format rmse.v) diff --git a/src/main/scala/BIDMach/Learner.scala b/src/main/scala/BIDMach/Learner.scala index 5648164d..e6b07e6a 100755 --- a/src/main/scala/BIDMach/Learner.scala +++ b/src/main/scala/BIDMach/Learner.scala @@ -137,7 +137,12 @@ case class Learner( if (mixins != null) mixins map (_ compute(mats, here)); if (updater != null) updater.update(ipass, here, gprogress); } - val scores = model.evalbatchg(mats, ipass, here); + + // Daniel: I needed to change the following line to the one after it: + // val scores = model.evalbatchg(mats, ipass, here); + val scores = mean(model.evalbatchg(mats, ipass, here)).v; + // in orer for the MH test to work with different-sized minibatches. + if (datasink != null) datasink.put; reslist.append(scores.newcopy) samplist.append(here) @@ -879,7 +884,6 @@ object Learner { def scores2FMat(reslist:ListBuffer[FMat]):FMat = { if (reslist.length == 0) return zeros(0, 0) - val out = FMat(reslist(0).nrows, reslist.length) var i = 0; while (i < reslist.length) { diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 7294ca58..bbd4d4d8 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -234,8 +234,10 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts } /** - * The evalfun normally called during training. Returns -RMSE on training - * data minibatch (sdata). + * The evalfun normally called during training. Returns -RMSE on training data + * minibatch (sdata). It has an extra option to return a matrix of scores, + * which will be useful for minibatch MH test updaters. We need a 1/(2*sigma^2) + * if we're assuming a Gaussian error distribution. */ def evalfun(sdata0:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val sdata = sdata0 - (iavg + avg); @@ -249,13 +251,19 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts val dc = sdata.contents val pc = preds.contents val diff = dc - pc; - val vv = diff ddot diff; - -sqrt(row(vv/sdata.nnz)) + if (opts.matrixOfScores) { + // TODO Temporary but should be OK for now (b/c we almost never increment MB). + val sigma_sq = variance(diff).dv + -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + } else { + val vv = diff ddot diff; + -sqrt(row(vv/sdata.nnz)) + } } /** - * The evalfun normally called during testing and predicting. Returns -RMSE - * on training data minibatch (sdata). + * The evalfun normally called during testing and predicting. Returns -RMSE on + * training data minibatch (sdata). We do not need the matrix of scores here. 
*/ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { val spreds = DDS(mm, user, sdata) + (iavg + avg); @@ -292,8 +300,9 @@ object SMF { var aopts:ADAGrad.Opts = null; var minv = 1f; var maxv = 5f; - + var matrixOfScores = false; } + class Options extends Opts {} def learner(mat0:Mat, d:Int) = { @@ -315,7 +324,10 @@ object SMF { (nn, opts) } - /** Learner with single (training data) matrix as datasource, and ADAGrad Opts. */ + /** + * Learner with single (training data) matrix as datasource, and ADAGrad Opts. + * We will benchmark this with the learner using MHTest. + */ def learner1(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts val opts = new xopts @@ -335,7 +347,32 @@ object SMF { opts) (nn, opts) } - + + /** + * Learner with single (training data) matrix as datasource, and using our + * MHTest updater. Use this for running experiments to benchmark with default + * ADAGrad. + */ + def learner2(mat0:Mat, d:Int) = { + class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts with MHTest.Opts + val opts = new xopts + opts.dim = d + opts.putBack = -1 + opts.npasses = 4 + opts.lrate = 0.1 + opts.initUval = 0f; + opts.batchSize = math.min(100000, mat0.ncols/30 + 1) + opts.aopts = opts + val nn = new Learner( + new MatSource(Array(mat0:Mat), opts), + new SMF(opts), + null, + new MHTest(opts), + null, + opts) + (nn, opts) + } + def learner(mat0:Mat, user0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with Grad.Opts val opts = new xopts diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 8bfc10d9..750f0b5b 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -131,19 +131,28 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { if (newMinibatch) beforeEachMinibatch() - b += model.datasource.opts.batchSize n += 1.0f // (Part 1) Compute scores for theta and theta', scaled by N/T. - scores0 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + if (opts.smf) { + scores0 = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + } else { + scores0 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + } + b += scores0.length // With SMF, using scores0.length as the MB size generalizes better. if (scores0.length == 1) { throw new RuntimeException("Need individual scores, but getting a scalar.") } for (i <- 0 until modelmats.length) { modelmats(i) <-- proposedTheta(i) } - scores1 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) - diff ~ scores1 - scores0 + if (opts.smf) { + scores1 = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + diff = scores1 - scores0 + } else { + scores1 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + diff ~ scores1 - scores0 + } // (Part 2) Update our \Delta* and sample variance of \Delta*. 
sumOfSquares += sum((diff)*@(diff)).v @@ -152,7 +161,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -312,11 +321,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ - def debugPrints(sampleVariance:Float, deltaStar:Float) { - println("b="+b+", n="+n+", logu="+logu+ ", b-mbSize="+(b - model.datasource.opts.batchSize).toInt) - println("mean(scores0) = "+mean(scores0,2).dv+", mean(scores1) = "+mean(scores1,2).dv) - println("sampleVar = " +sampleVariance) - println("delta* = " + deltaStar) + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double) { + val s1 = mean(scores1).dv + val s0 = mean(scores0).dv + println("b="+b+", n="+n+", logu="+logu) + println("mean(scores1) = "+s1+" - mean(scores0) = "+s0+" = "+(s1-s0)) + println("sampleVar = " +sampleVariance+ ", delta* = " +deltaStar+ ", numStd = " +numStd) } } @@ -341,6 +351,7 @@ object MHTest { var exitThetaAmount = 3000 var initThetaHere = false var burnIn = -1 + var smf = false } class Options extends Opts {} From af060e06a97a6ad5edb7c6db830994e0b6340338 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 8 Mar 2017 10:22:12 -0800 Subject: [PATCH 04/18] changed *@ to ddot since doubles are more precise than floats --- src/main/scala/BIDMach/models/SMF.scala | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index bbd4d4d8..1da12649 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -76,6 +76,7 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts override def init() = { mats = datasource.next; datasource.reset; + println("size(mats(0))="+size(mats(0))) nfeats = mats(0).nrows; val batchSize = mats(0).ncols; val d = opts.dim; @@ -235,9 +236,13 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts /** * The evalfun normally called during training. Returns -RMSE on training data - * minibatch (sdata). It has an extra option to return a matrix of scores, + * minibatch (sdata0). It has an extra option to return a matrix of scores, * which will be useful for minibatch MH test updaters. We need a 1/(2*sigma^2) - * if we're assuming a Gaussian error distribution. + * if we're assuming a Gaussian error distribution. + * + * Note: it looks scary to subtract iavg+avg from sdata0, but we don't add + * that to preds so we can still directly compare sdata and preds. I'll leave + * it here since John may have had a reason or doing that. */ def evalfun(sdata0:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val sdata = sdata0 - (iavg + avg); @@ -254,7 +259,7 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts if (opts.matrixOfScores) { // TODO Temporary but should be OK for now (b/c we almost never increment MB). 
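       // (Rationale: under a N(0, sigma^2) error model, each rating's log-likelihood
       // is -(rating - prediction)^2 / (2*sigma^2) up to an additive constant, and
       // sigma^2 is estimated here from this minibatch's residuals.)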
val sigma_sq = variance(diff).dv - -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + -(1.0f/(2*sigma_sq)).v * (diff ddot diff) } else { val vv = diff ddot diff; -sqrt(row(vv/sdata.nnz)) @@ -264,6 +269,8 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts /** * The evalfun normally called during testing and predicting. Returns -RMSE on * training data minibatch (sdata). We do not need the matrix of scores here. + * - `sdata` matrix is the predictor input data (we've used train, but could be test) + * - `preds` matrix should indicate the non-zero *testing* data points. */ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { val spreds = DDS(mm, user, sdata) + (iavg + avg); From 9c225433b44d8f57ab910cd9cf79b72d3f9a9a4f Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 8 Mar 2017 12:26:25 -0800 Subject: [PATCH 05/18] Slight updates, mostly debugging. My confusion before was about how the model matrices were still updating even though I wasn't accepting anything in the updater. It turns out that the SMF code will update it in the mupdate method. Ugh ... --- scripts/daniel_smf_netflix_mhtest.ssc | 29 +++++++++++++++++--- src/main/scala/BIDMach/models/SMF.scala | 19 ++++++++----- src/main/scala/BIDMach/updaters/MHTest.scala | 3 +- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 5656ef0e..b8940d13 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -3,7 +3,28 @@ import BIDMach.models.SMF /** * Test SMF code on netflix data. This will use OUR MHTest updater, which I put - * in as a new updater (SMF.learner2) to make this script more concise. + * in as a new updater (SMF.learner2) to make this script more concise. Some + * notes on the netflix dataset: + * + * size(a) = (17770,480189) + * a.nnz = 90430138 + * min=0, max=5 + * + * (a == 1).nnz = 4156151 + * (a == 2).nnz = 9120198 + * (a == 3).nnz = 25928920 + * (a == 4).nnz = 30375037 + * (a == 5).nnz = 20849832 + * mean (of nonzeros) = 3.6042476 + * sqrt((diff ddot diff) / diff.nn) = 1.0852 // Train RMSE using mean predictor + * + * (ta == 1).nnz = 461839 + * (ta == 2).nnz = 1011882 + * (ta == 3).nnz = 2882327 + * (ta == 4).nnz = 3375921 + * (ta == 5).nnz = 2318400 + * mean (of nonzeros) = 3.6046705 + * sqrt((diff ddot diff) / diff.nn) = 1.0851 // Test RMSE using mean predictor */ // Get random seed set up. @@ -40,10 +61,10 @@ opts.matrixOfScores = true // Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. opts.batchSize = 1000 +opts.npasses = 2 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f -opts.npasses = 3 val lambda = 4f opts.lambdau = lambda opts.regumean = lambda @@ -57,7 +78,7 @@ nn.train val model = nn.model.asInstanceOf[SMF] val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa) +val (mm, mopts) = SMF.predictor1(model, a, xa) // Provide `a` or `ta` as input? 
mopts.batchSize = 10000 mopts.uiter = 5 mopts.urate = opts.urate @@ -68,5 +89,5 @@ val pa = SMat(mm.preds(1)); min(pa.contents,5,pa.contents) max(pa.contents,1,pa.contents) val diff = ta.contents - pa.contents -val rmse = sqrt((diff ^* diff) / diff.length) +val rmse = sqrt((diff ddot diff) / diff.length) println("rmse = %f" format rmse.v) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 1da12649..25b3c35a 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -241,12 +241,10 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts * if we're assuming a Gaussian error distribution. * * Note: it looks scary to subtract iavg+avg from sdata0, but we don't add - * that to preds so we can still directly compare sdata and preds. I'll leave - * it here since John may have had a reason or doing that. + * that to preds so we can still directly compare sdata and preds. */ - def evalfun(sdata0:Mat, user:Mat, ipass:Int, pos:Long):FMat = { - val sdata = sdata0 - (iavg + avg); - val preds = DDS(mm, user, sdata); + def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { + val preds = DDS(mm, user, sdata) + (iavg + avg); if (ogmats != null) { ogmats(0) = user; if (ogmats.length > 1) { @@ -255,11 +253,17 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts } val dc = sdata.contents val pc = preds.contents - val diff = dc - pc; + val diff = DMat(dc - pc); if (opts.matrixOfScores) { // TODO Temporary but should be OK for now (b/c we almost never increment MB). val sigma_sq = variance(diff).dv - -(1.0f/(2*sigma_sq)).v * (diff ddot diff) + + //println("evalfun, sdata.contents.length = " +dc.length) + //println("mean of squared diffs = " +(diff ddot diff)/diff.length) + //println("sigma_sq = " +sigma_sq) + //println("result = " +mean(-(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff))) + + -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) } else { val vv = diff ddot diff; -sqrt(row(vv/sdata.nnz)) @@ -284,6 +288,7 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts ogmats(1) = xpreds; } } + println("TESTING evalfun, spreds.nnz="+spreds.nnz+", xpreds.nnz="+xpreds.nnz) preds.contents <-- xpreds.contents; -sqrt(row(vv/sdata.nnz)) } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 750f0b5b..e8661db1 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -79,6 +79,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * Note that the file for the norm2logdata should be in the correct directory. */ override def init(model0:Model) = { + setseed(1) model = model0; modelmats = model.modelmats updatemats = model.updatemats @@ -218,7 +219,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. } } - if (newMinibatch && accept) afterEachMinibatch() + if (newMinibatch) afterEachMinibatch() } From bbbeba8bb12572b3c1d6d988ca5ee47f5c6930e7 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Thu, 9 Mar 2017 15:37:17 -0800 Subject: [PATCH 06/18] Really confused. Better ask John. 
=( --- scripts/daniel_smf_netflix_mhtest.ssc | 11 +++++++--- src/main/scala/BIDMach/models/SMF.scala | 23 +++++++++----------- src/main/scala/BIDMach/updaters/MHTest.scala | 3 ++- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index b8940d13..2f1e1d1f 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -61,7 +61,7 @@ opts.matrixOfScores = true // Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. opts.batchSize = 1000 -opts.npasses = 2 +opts.npasses = 1 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f @@ -72,13 +72,14 @@ opts.lambdam = lambda / 500000 * 20 opts.regmmean = opts.lambdam opts.evalStep = 31 opts.doUsers = false -opts.lsgd = 0.010f +opts.lsgd = 0.010f; // Daniel: what is this? opts.what nn.train val model = nn.model.asInstanceOf[SMF] val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa) // Provide `a` or `ta` as input? +val (mm, mopts) = SMF.predictor1(model, a, xa) // a or ta as input? + mopts.batchSize = 10000 mopts.uiter = 5 mopts.urate = opts.urate @@ -91,3 +92,7 @@ max(pa.contents,1,pa.contents) val diff = ta.contents - pa.contents val rmse = sqrt((diff ddot diff) / diff.length) println("rmse = %f" format rmse.v) +println("size(mm.preds(0)) = " +size(mm.preds(0))) +println("size(mm.preds(1)) = " +size(pa)) +saveFMat("user.fmat.lz4", FMat(mm.preds(0))) +saveSMat("preds.smat.lz4", pa) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 25b3c35a..0c59cb0a 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -76,11 +76,14 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts override def init() = { mats = datasource.next; datasource.reset; + println("Inside SMF initialization:") println("size(mats(0))="+size(mats(0))) + println("mats.length="+mats.length) nfeats = mats(0).nrows; val batchSize = mats(0).ncols; val d = opts.dim; if (refresh) { + println("Inside refresh") mm = normrnd(0,0.01f,d,nfeats); mm = convertMat(mm); avg = mm.zeros(1,1) @@ -239,9 +242,6 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts * minibatch (sdata0). It has an extra option to return a matrix of scores, * which will be useful for minibatch MH test updaters. We need a 1/(2*sigma^2) * if we're assuming a Gaussian error distribution. - * - * Note: it looks scary to subtract iavg+avg from sdata0, but we don't add - * that to preds so we can still directly compare sdata and preds. */ def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val preds = DDS(mm, user, sdata) + (iavg + avg); @@ -257,12 +257,6 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts if (opts.matrixOfScores) { // TODO Temporary but should be OK for now (b/c we almost never increment MB). 
val sigma_sq = variance(diff).dv - - //println("evalfun, sdata.contents.length = " +dc.length) - //println("mean of squared diffs = " +(diff ddot diff)/diff.length) - //println("sigma_sq = " +sigma_sq) - //println("result = " +mean(-(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff))) - -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) } else { val vv = diff ddot diff; @@ -273,23 +267,26 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts /** * The evalfun normally called during testing and predicting. Returns -RMSE on * training data minibatch (sdata). We do not need the matrix of scores here. - * - `sdata` matrix is the predictor input data (we've used train, but could be test) + * - `sdata` matrix is the predictor input data, i.e. a minibatch of the + * training (or testing!) data. * - `preds` matrix should indicate the non-zero *testing* data points. + * The predictions on the TEST set are stored in ogmats(1) which turns into + * the `preds(1)` matrix that we can access outside BIDMach. */ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { + // Predict on *training* then *testing*, filtered by sdata and preds. val spreds = DDS(mm, user, sdata) + (iavg + avg); + val xpreds = DDS(mm, user, preds) + (iavg + avg); val dc = sdata.contents; val pc = spreds.contents; val vv = (dc - pc) ddot (dc - pc); - val xpreds = DDS(mm, user, preds) + (iavg + avg); if (ogmats != null) { ogmats(0) = user; if (ogmats.length > 1) { ogmats(1) = xpreds; } } - println("TESTING evalfun, spreds.nnz="+spreds.nnz+", xpreds.nnz="+xpreds.nnz) - preds.contents <-- xpreds.contents; + preds.contents <-- xpreds.contents; // This doesn't seem necessary. -sqrt(row(vv/sdata.nnz)) } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index e8661db1..792b8c53 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -208,7 +208,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater accept = true } } - + // (Part 4) Reset parameters and use <-- to avoid alias problems. if (accept) { for (i <- 0 until modelmats.length) { @@ -219,6 +219,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. } } + if (newMinibatch) afterEachMinibatch() } From 1d1bb219bf889915b432505bca92f0129738c956 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 15 Mar 2017 11:18:36 -0700 Subject: [PATCH 07/18] Wow, I think I finally get SMF ... well, the main idea. I resolved my earlier questions. Now, let's TRY to get MHTest to work on this ... gulp. --- scripts/daniel_smf_netflix_adagrad.ssc | 4 +- scripts/daniel_smf_netflix_mhtest.ssc | 12 +- src/main/scala/BIDMach/models/SMF.scala | 333 +++++++++++-------- src/main/scala/BIDMach/updaters/MHTest.scala | 2 +- 4 files changed, 201 insertions(+), 150 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index 487acf2b..2a8dba1d 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -18,11 +18,11 @@ val d = 256 val (nn,opts) = SMF.learner1(a, d) // Daniel Seita: I'm not sure if these values are good for netflix. 
-opts.batchSize = 2000 +opts.batchSize = 1000 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f -opts.npasses = 2 +opts.npasses = 1 val lambda = 4f opts.lambdau = lambda opts.regumean = lambda diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 2f1e1d1f..0de96608 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -25,6 +25,8 @@ import BIDMach.models.SMF * (ta == 5).nnz = 2318400 * mean (of nonzeros) = 3.6046705 * sqrt((diff ddot diff) / diff.nn) = 1.0851 // Test RMSE using mean predictor + * + * BTW: (a *@ ta).nnz = 0, which shows that they are completely distinct. */ // Get random seed set up. @@ -55,11 +57,10 @@ opts.exitTheta = false opts.initThetaHere = false opts.burnIn = -1 -// For the SMF -opts.matrixOfScores = true - // Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. +// Also, the four items here that start with `lambda` or `reg` represent priors. +opts.matrixOfScores = true opts.batchSize = 1000 opts.npasses = 1 opts.uiter = 5 @@ -72,13 +73,12 @@ opts.lambdam = lambda / 500000 * 20 opts.regmmean = opts.lambdam opts.evalStep = 31 opts.doUsers = false -opts.lsgd = 0.010f; // Daniel: what is this? opts.what nn.train val model = nn.model.asInstanceOf[SMF] val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa) // a or ta as input? +val (mm, mopts) = SMF.predictor1(model, a, xa) mopts.batchSize = 10000 mopts.uiter = 5 @@ -94,5 +94,3 @@ val rmse = sqrt((diff ddot diff) / diff.length) println("rmse = %f" format rmse.v) println("size(mm.preds(0)) = " +size(mm.preds(0))) println("size(mm.preds(1)) = " +size(pa)) -saveFMat("user.fmat.lz4", FMat(mm.preds(0))) -saveSMat("preds.smat.lz4", pa) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 0c59cb0a..1b752f6e 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -72,39 +72,44 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts var epsilon = 0f; var aopts:ADAGrad.Opts = null; - override def init() = { + // Get dimensions; for Netflix, size(mats(0)) = (17770,batchSize). mats = datasource.next; - datasource.reset; - println("Inside SMF initialization:") - println("size(mats(0))="+size(mats(0))) - println("mats.length="+mats.length) - nfeats = mats(0).nrows; - val batchSize = mats(0).ncols; + datasource.reset; + nfeats = mats(0).nrows; + val batchSize = mats(0).ncols; val d = opts.dim; + if (refresh) { + // Randomly drawing mm, iavg, and avg (the three respective model + // matrices). Note that nfeats is the number of items (e.g. movies). println("Inside refresh") - mm = normrnd(0,0.01f,d,nfeats); - mm = convertMat(mm); - avg = mm.zeros(1,1) - iavg = mm.zeros(nfeats,1); - itemsum = mm.zeros(nfeats, 1); - itemcount = mm.zeros(nfeats, 1); - setmodelmats(Array(mm, iavg, avg)); + mm = normrnd(0,0.01f,d,nfeats); + mm = convertMat(mm); + avg = mm.zeros(1,1) + iavg = mm.zeros(nfeats,1); + itemsum = mm.zeros(nfeats, 1); + itemcount = mm.zeros(nfeats, 1); + setmodelmats(Array(mm, iavg, avg)); } + + // Handle brief logic with GPUs. Careful with aliasing as well!! 
useGPU = opts.useGPU && Mat.hasCUDA > 0; - if (useGPU || useDouble) { - gmats = new Array[Mat](mats.length); - } else { - gmats = mats; - } - - modelmats(0) = convertMat(modelmats(0)); - modelmats(1) = convertMat(modelmats(1)); - modelmats(2) = convertMat(modelmats(2)); - mm = modelmats(0); + if (useGPU || useDouble) { + gmats = new Array[Mat](mats.length); + } else { + gmats = mats; + } + modelmats(0) = convertMat(modelmats(0)); + modelmats(1) = convertMat(modelmats(1)); + modelmats(2) = convertMat(modelmats(2)); + mm = modelmats(0); iavg = modelmats(1); avg = modelmats(2); + + // Here's some confusing stuff. Seems to be "small" stuff about constants. + // uscale, an internal ADAGrad parameter but we use it (!!!). + // cscale, an internal ADAGrad parameter but we ignore it. lamu = mm.ones(d, 1) ∘ opts.lambdau if (opts.doUsers) lamu(0) = opts.regumean; slm = mm.ones(1,1) ∘ (opts.lambdam * batchSize); @@ -114,12 +119,18 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts cscale = mm.ones(d, 1); cscale(0,0) = 0.0001f; if (opts.doUsers) mm(0,?) = 1f + + // The updatemats is the same length as the model matrices. updatemats = new Array[Mat](3); updatemats(2) = mm.zeros(1,1); + + // Set this to null to avoid the internal ADAGrad updater making updates. if (opts.aopts != null) initADAGrad(d, nfeats); - vexp = convertMat(row(0.5f)); + vexp = convertMat(row(0.5f)); // External ADAGrad parameter, OK here. } + + /** An internal ADAGrad updater. Ignore this for our current experiments. */ def initADAGrad(d:Int, m:Int) = { aopts = opts.asInstanceOf[ADAGrad.Opts] firststep = -1f; @@ -132,116 +143,152 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts waitsteps = aopts.waitsteps; epsilon = aopts.epsilon; } - + + + /** + * Performs some number of passes over the minibatch to update the user + * matrix. Try to understand how the user matrix gets updated ... note that + * putBack = -1 by default. I think this is the user matrix update, so we're + * holding the item matrix fixed (it's actually the model matrix, but the same + * point holds) while updating the user by stochastic gradient descent. + * + * We subtract biases, so predictions can be done with DDS(mm,user,sdata) + * *without* re-adding biases. Also, we *do* clear out user here. is this + * because John said we can't really save the entire *full* user matrix (the + * one with size (dim,480189))? ucounts sums up the number of nonzeros in + * each columns of sdata0, then uci is something else on it. b might make + * sense in some way, because the derivative term later is mm*(sdata-preds) + * and the sdata-preds is supposed to be close to each other. + * + * We then update the user matrix several times based on current predictions. + * Actually, this update makes sense because the normal SGD update for x_u + * (user vectors) is x_u minus the following term: + * + * alpha*(data-prediction)*item_vector + lambda*user_vector + * + * and that's what we have here! QUESTION, though, uscale is an integrated + * ADAGrad value. Do we want it here? I'm also not sure why we need du to + * have uscale and uci there ... + * + * NOTE: Upon further inspection, it seems that `user` starts out as a matrix + * of all zeros. So the user.clear with putBack<0 is un-necessary as it is + * already cleared. I suppose in theory we should have some putBack mechanism + * (that way, the user matrix value is stored from prior iterations) but John + * said there's little reason to do that. 
Also, even with putBack=1, I can't + * get the user matrix's values carried over. Hmmm ... + * + * @param sdata0 Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, of size (dim, batchSize). + */ def uupdate(sdata0:Mat, user:Mat, ipass:Int, pos:Long):Unit = { - if (firststep <= 0) firststep = pos.toFloat; - val step = (pos + firststep)/firststep; - val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { - opts.asInstanceOf[Grad.Opts].texp.dv - } else { - opts.asInstanceOf[Grad.Opts].pexp.dv - } - uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) + if (firststep <= 0) firststep = pos.toFloat; + val step = (pos + firststep)/firststep; + val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { + opts.asInstanceOf[Grad.Opts].texp.dv + } else { + opts.asInstanceOf[Grad.Opts].pexp.dv + } + uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) + val sdata = sdata0 - (iavg + avg); - if (putBack < 0) { - user.clear - } - val b = mm * sdata; - val ucounts = sum(sdata0 != 0f); - val uci = (ucounts + 1f) ^ (- vexp); - for (i <- 0 until opts.uiter) { - val preds = DDS(mm, user, sdata); - val deriv = b - mm * preds - (user ∘ lamu); - val du = (deriv ∘ uscale ∘ uci); - if (opts.lsgd >= 0) { - val dpreds = DDS(mm, du, sdata); - accept(sdata, user, du, preds, dpreds, uscale, lamu, false); - } else { - user ~ user + du; - } + if (putBack < 0) { + user.clear + } + val b = mm * sdata; + val ucounts = sum(sdata0 != 0f); + val uci = (ucounts + 1f) ^ (- vexp); - if (opts.traceConverge) { - println("step %d, loss %f" format (i, ((norm(sdata.contents - preds.contents) ^ 2f) + (sum(user dot (user ∘ lamu)))).dv/sdata.nnz)); - } - } + for (i <- 0 until opts.uiter) { + val preds = DDS(mm, user, sdata); + val deriv = b - mm * preds - (user ∘ lamu); + val du = (deriv ∘ uscale ∘ uci); + user ~ user + du; + if (opts.traceConverge) { + println("step %d, loss %f" format (i, ((norm(sdata.contents - preds.contents) ^ 2f) + (sum(user dot (user ∘ lamu)))).dv/sdata.nnz)); + } + } } - + + + /** + * Computes updates to the updatemats. Note again that we subtract (iavg+avg) + * from the sdata, so that predictions are done with DDS(mm,user,sdata), and + * the differences (for gradient update later) are stored. This is for + * updating the item matrix, so we hold the user matrix fixed (it's updated in + * uupdate) and compute updates for the item matrix (and the bias terms, + * actually). Also, this might be why we don't use a user bias term. Note that + * we call predictions once here, since mm and user are fixed for this method; + * ideally, some other updater will use the updatemats we compute (i.e. the + * gradients) to update the item matrix. The item matrix, if it wasn't clear, + * is modelmats(0). + * + * Note that there's some extra work with ipass < 1, I think to get reasonable + * initialization values for our bias terms. Here, avg is the average rating + * across the entire nonzeros of sdata0 (hence, our global bias), and iavg is + * some (scaled) estimate of how much we should boost each individal item. The + * iavg should be smaller since the avg already scales stuff to roughly 3.6. + * + * During predictions, this method is NOT called, hence the biases aren't + * updated. + * + * @param sdata0 Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, of size (dim, + * batchSize). 
The matrix has the same values as the user matrix updated + * from the most recent uupdate method call. + */ def mupdate(sdata0:Mat, user:Mat, ipass:Int, pos:Long):Unit = { val sdata = sdata0 - (iavg + avg); // values to be accumulated val preds = DDS(mm, user, sdata); - val diffs = sdata + 2f; + val diffs = sdata + 2f; // I THINK 2f is only for avoiding aliasing, but why not 0f? diffs.contents ~ sdata.contents - preds.contents; + if (ipass < 1) { - itemsum ~ itemsum + sum(sdata0, 2); - itemcount ~ itemcount + sum(sdata0 != 0f, 2); - avg ~ sum(itemsum) / sum(itemcount); - iavg ~ ((itemsum + avg) / (itemcount + 1)) - avg; + itemsum ~ itemsum + sum(sdata0, 2); // sum horizontally + itemcount ~ itemcount + sum(sdata0 != 0f, 2); // count #nonzeros horizontally + avg ~ sum(itemsum) / sum(itemcount); + iavg ~ ((itemsum + avg) / (itemcount + 1)) - avg; } + + // Compute gradient updates for the biases, and set wuser=user unless we're weighing. val icomp = sdata0 != 0f val icount = sum(sdata0 != 0f, 2); updatemats(1) = (sum(diffs,2) - iavg*mlm) / (icount + 1f); // per-item term estimator updatemats(2) ~ sum(diffs.contents) / (diffs.contents.length + 1f); val wuser = if (opts.weightByUser) { - val iwt = 100f / max(sum(sdata != 0f), 100f); + val iwt = 100f / max(sum(sdata != 0f), 100f); user ∘ iwt; } else { user; } if (firststep <= 0) firststep = pos.toFloat; + + // I get it! This derivative is virtually the same as what we had with the + // user update, except user and mm swap locations, which is expected. if (opts.lsgd >= 0 || opts.aopts == null) { - updatemats(0) = (wuser *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); // simple derivative - if (opts.lsgd >= 0) { - val step = (pos + firststep)/firststep; - uscale.set((lrate.dv * math.pow(step, - texp.dv)).toFloat); - val dm = updatemats(0) ∘ uscale ∘ cscale; - val dpreds = DDS(dm, user, sdata); - accept(sdata, mm, dm, preds, dpreds, uscale, slm, true); - } + updatemats(0) = (wuser *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); // simple derivative } else { - if (texp.asInstanceOf[AnyRef] != null) { - val step = (pos + firststep)/firststep; - ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, texp, vexp, epsilon, step, waitsteps); - } else { - ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, pexp, vexp, epsilon, ipass + 1, waitsteps); - } + if (texp.asInstanceOf[AnyRef] != null) { + val step = (pos + firststep)/firststep; + ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, texp, vexp, epsilon, step, waitsteps); + } else { + ADAGrad.multUpdate(wuser, diffs, modelmats(0), sumsq, null, lrate, pexp, vexp, epsilon, ipass + 1, waitsteps); + } } if (opts.doUsers) mm(0,?) 
= 1f; } - def accept(sdata:Mat, mmod:Mat, du:Mat, preds:Mat, dpreds:Mat, scale:Mat, lambda:Mat, flip:Boolean) = { - // println("sdata " + FMat(sdata.contents)(0->5,0).t) - val diff1 = preds + 0f; - diff1.contents ~ sdata.contents - preds.contents; -// println("sdata %d %s" format (if (flip) 1 else 0, FMat(sdata.contents)(0->5,0).t.toString)); -// println("preds %d %s" format (if (flip) 1 else 0, FMat(preds.contents)(0->5,0).t.toString)); -// println("diff %d %s" format (if (flip) 1 else 0, FMat(diff1.contents)(0->5,0).t.toString)); -// println("sdata "+FMat(sdata.contents)(0->5,0).t.toString); - val diff2 = diff1 + 0f; - diff2.contents ~ diff1.contents - dpreds.contents; - diff1.contents ~ diff1.contents ∘ diff1.contents; - diff2.contents ~ diff2.contents ∘ diff2.contents; - val rmmod = mmod + 1f; - normrnd(0, opts.lsgd, rmmod); - val mmod2 = mmod + du + rmmod ∘ scale; - val loss1 = (if (flip) sum(diff1,2).t else sum(diff1)) + (mmod dot (mmod ∘ lambda)); - val loss2 = (if (flip) sum(diff2,2).t else sum(diff2)) + (mmod2 dot (mmod2 ∘ lambda)); - - val accprob = erfc((loss2 - loss1) /scale); - val rsel = accprob + 0f; - rand(rsel); - val selector = rsel < accprob; - mmod ~ (mmod2 ∘ selector) + (mmod ∘ (1f - selector)); - if (opts.traceConverge) { - println("accepted %d %f %f %f" format (if (flip) 1 else 0, mean(selector).dv, mean(loss1).dv, mean(loss2).dv)); - } - } - + /** * The evalfun normally called during training. Returns -RMSE on training data * minibatch (sdata0). It has an extra option to return a matrix of scores, * which will be useful for minibatch MH test updaters. We need a 1/(2*sigma^2) * if we're assuming a Gaussian error distribution. + * + * @param sdata Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, of size (dim, + * batchSize). The values here are based on the values computed in the most + * recent uupdate call. */ def evalfun(sdata:Mat, user:Mat, ipass:Int, pos:Long):FMat = { val preds = DDS(mm, user, sdata) + (iavg + avg); @@ -251,58 +298,58 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts ogmats(1) = preds; } } - val dc = sdata.contents - val pc = preds.contents - val diff = DMat(dc - pc); - if (opts.matrixOfScores) { - // TODO Temporary but should be OK for now (b/c we almost never increment MB). - val sigma_sq = variance(diff).dv - -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) - } else { - val vv = diff ddot diff; - -sqrt(row(vv/sdata.nnz)) - } + val dc = sdata.contents + val pc = preds.contents + val diff = DMat(dc - pc); + if (opts.matrixOfScores) { + // TODO Temporary but should be OK for now (b/c we almost never increment MB). + val sigma_sq = variance(diff).dv + //println(sqrt((diff ddot diff)/diff.length)) + -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + } else { + val vv = diff ddot diff; + -sqrt(row(vv/sdata.nnz)) + } } + /** - * The evalfun normally called during testing and predicting. Returns -RMSE on - * training data minibatch (sdata). We do not need the matrix of scores here. - * - `sdata` matrix is the predictor input data, i.e. a minibatch of the - * training (or testing!) data. - * - `preds` matrix should indicate the non-zero *testing* data points. - * The predictions on the TEST set are stored in ogmats(1) which turns into - * the `preds(1)` matrix that we can access outside BIDMach. + * The evalfun normally called during TESTING (i.e. PREDICTION). Returns -RMSE + * on the TRAINING minibatch in `sdata`. 
We should also store predictions in + * `ogmats(1)`, which is what we can access externally via `preds(1)`. Thus, + * it's predicting based on both training and testing. + * + * @param sdata Training data minibatch of size (nitems, batchSize). + * @param user Second matrix for computing predictions, size (dim, batchSize). + * @param preds Matrix indicating the non-zero TESTING data points. */ override def evalfun(sdata:Mat, user:Mat, preds:Mat, ipass:Int, pos:Long):FMat = { - // Predict on *training* then *testing*, filtered by sdata and preds. val spreds = DDS(mm, user, sdata) + (iavg + avg); - val xpreds = DDS(mm, user, preds) + (iavg + avg); - val dc = sdata.contents; - val pc = spreds.contents; - val vv = (dc - pc) ddot (dc - pc); - if (ogmats != null) { + val xpreds = DDS(mm, user, preds) + (iavg + avg); + val dc = sdata.contents; + val pc = spreds.contents; + val vv = (dc - pc) ddot (dc - pc); + if (ogmats != null) { ogmats(0) = user; if (ogmats.length > 1) { ogmats(1) = xpreds; } } - preds.contents <-- xpreds.contents; // This doesn't seem necessary. - -sqrt(row(vv/sdata.nnz)) + //preds.contents <-- xpreds.contents; // This doesn't seem necessary. + -sqrt(row(vv/sdata.nnz)) } - } + object SMF { trait Opts extends FactorModel.Opts { var ueps = 1e-10f var uconvg = 1e-3f - var miter = 5 var lambdau = 5f var lambdam = 5f var regumean = 0f var regmmean = 0f var urate = 0.1f - var lsgd = 0.1f var traceConverge = false var doUsers = true var weightByUser = false @@ -310,6 +357,7 @@ object SMF { var minv = 1f; var maxv = 5f; var matrixOfScores = false; + var lsgd = 0f; } class Options extends Opts {} @@ -334,8 +382,9 @@ object SMF { } /** - * Learner with single (training data) matrix as datasource, and ADAGrad Opts. - * We will benchmark this with the learner using MHTest. + * Learner with single (training data) matrix as datasource, and an + * **EXTERNAL** ADAGrad Opts. We will benchmark this with the learner using + * an external MHTest. No internal ADAGrad updater. */ def learner1(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts @@ -346,7 +395,7 @@ object SMF { opts.lrate = 0.1 opts.initUval = 0f; opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - opts.aopts = opts + opts.aopts = null val nn = new Learner( new MatSource(Array(mat0:Mat), opts), new SMF(opts), @@ -360,7 +409,8 @@ object SMF { /** * Learner with single (training data) matrix as datasource, and using our * MHTest updater. Use this for running experiments to benchmark with default - * ADAGrad. + * ADAGrad. For our experiments, we should NOT be using opts.aopts, which is + * the internal ADAGrad updater. So that should be null ... */ def learner2(mat0:Mat, d:Int) = { class xopts extends Learner.Options with SMF.Opts with MatSource.Opts with ADAGrad.Opts with MHTest.Opts @@ -371,7 +421,7 @@ object SMF { opts.lrate = 0.1 opts.initUval = 0f; opts.batchSize = math.min(100000, mat0.ncols/30 + 1) - opts.aopts = opts + opts.aopts = null val nn = new Learner( new MatSource(Array(mat0:Mat), opts), new SMF(opts), @@ -413,7 +463,6 @@ object SMF { val mopts = model.opts.asInstanceOf[SMF.Opts]; nopts.dim = mopts.dim; nopts.uconvg = mopts.uconvg; - nopts.miter = mopts.miter; nopts.lambdau = mopts.lambdau; nopts.lambdam = mopts.lambdam; nopts.regumean = mopts.regumean; @@ -438,19 +487,23 @@ object SMF { * which turns into the second factor matrix. 
It mirrors an SFA predictor code * which also forms this empty matrix into the matrix datasource, with the * only difference being the lack of an Minv option for `newmod`. + * + * @param mat1 The TRAINING DATA matrix. NOT THE TESTING DATA!!! NOT THE + * TESTING DATA!!! + * @param preds The non-zeros of the TESTING data (not training). */ def predictor1(model0:Model, mat1:Mat, preds:Mat) = { val model = model0.asInstanceOf[SMF] val nopts = new PredOpts; nopts.batchSize = math.min(10000, mat1.ncols/30 + 1) nopts.putBack = -1 + nopts.initUval = 0f // Daniel: for consistency with training update. val newmod = new SMF(nopts); newmod.refresh = false newmod.copyFrom(model); val mopts = model.opts.asInstanceOf[SMF.Opts]; nopts.dim = mopts.dim; nopts.uconvg = mopts.uconvg; - nopts.miter = mopts.miter; nopts.lambdau = mopts.lambdau; nopts.lambdam = mopts.lambdam; nopts.regumean = mopts.regumean; @@ -466,4 +519,4 @@ object SMF { nopts) (nn, nopts) } -} \ No newline at end of file +} diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 792b8c53..8202c7ff 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -208,7 +208,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater accept = true } } - + // (Part 4) Reset parameters and use <-- to avoid alias problems. if (accept) { for (i <- 0 until modelmats.length) { From 8bd301867bf7189a2dc95d3dbbf31035755ee742 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 15 Mar 2017 15:27:41 -0700 Subject: [PATCH 08/18] More documentation to myself. Not ready for integration into master --- scripts/daniel_smf_netflix_mhtest.ssc | 8 ++++- src/main/scala/BIDMach/models/SMF.scala | 3 +- src/main/scala/BIDMach/updaters/ADAGrad.scala | 36 ++++++++++++++++--- src/main/scala/BIDMach/updaters/MHTest.scala | 12 ++++++- 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 0de96608..b0dd100c 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -42,7 +42,6 @@ println("size(a)="+size(a)+", with a.nnz="+a.nnz) // Daniel Seita: stuff for the MH Test updater. OH ... and our N is going to be // super-large. Ahhhh... we may need temperature then. -opts.smf = true // IMPORTANT, this affects some of the code. opts.N = a.nnz opts.temp = a.nnz / 1000 opts.Nknown = true @@ -57,6 +56,13 @@ opts.exitTheta = false opts.initThetaHere = false opts.burnIn = -1 +// IMPORTANT +opts.smf = true +opts.useInternalADAGrad = true +opts.momentum = true +opts.nesterov = true +opts.langevin = 0.3f + // Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. // Also, the four items here that start with `lambda` or `reg` represent priors. diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 1b752f6e..f187c2b4 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -129,7 +129,7 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts vexp = convertMat(row(0.5f)); // External ADAGrad parameter, OK here. } - + /** An internal ADAGrad updater. Ignore this for our current experiments. 
*/ def initADAGrad(d:Int, m:Int) = { aopts = opts.asInstanceOf[ADAGrad.Opts] @@ -335,7 +335,6 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts ogmats(1) = xpreds; } } - //preds.contents <-- xpreds.contents; // This doesn't seem necessary. -sqrt(row(vv/sdata.nnz)) } } diff --git a/src/main/scala/BIDMach/updaters/ADAGrad.scala b/src/main/scala/BIDMach/updaters/ADAGrad.scala index bd27ea93..d48faef3 100755 --- a/src/main/scala/BIDMach/updaters/ADAGrad.scala +++ b/src/main/scala/BIDMach/updaters/ADAGrad.scala @@ -12,6 +12,9 @@ import scala.concurrent.ExecutionContext.Implicits.global class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Updater { + // Three other sets of matrices (sumSq, momentum, and randmat) all have (if + // initialized) same length and sizes as the modelmats and updatemats. Is the + // `mu` here a momentum term as well? var firstStep = 0f var modelmats:Array[Mat] = null var updatemats:Array[Mat] = null @@ -19,14 +22,22 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda var stepn:Mat = null var mask:Mat = null var momentum:Array[Mat] = null; - var ve:Mat = null - var pe:Mat = null - var te:Mat = null + var ve:Mat = null // opts.vexp, an ADAGrad parameter (see BIDMach wiki) + var pe:Mat = null // Similar to opts.texp? Some exponent like the others? + var te:Mat = null // opts.texp, an ADAGrad parameter (see BIDMach wiki) var lrate:Mat = null var mu:Mat = null var one:Mat = null var randmat:Array[Mat] = null + + /** + * Initialize ADAGrad model. Note the conditions required to create momentum + * and randmats. I just created opts.momentum = FMat(0) since that's an easy + * way to do it. But we ignore that when initializing `momentum` so it's + * confusing. But I also see later in the code that we *do* use opts.momentum, + * so what's the difference? Same thing with opts.nesterov vs nesterov mats??? + */ override def init(model0:Model) = { model = model0 modelmats = model.modelmats; @@ -57,7 +68,13 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda ve <-- opts.vexp; te <-- opts.texp; } + + /** + * Daniel: I don't know what this is supposed to do. However, there is *no* + * mention of any `update2` in the entire BIDMach repository --- I did a + * search. Hence, I think tihs method can be safely ignored. + */ def update2(ipass:Int, step:Long):Unit = { modelmats = model.modelmats; updatemats = model.updatemats; @@ -99,7 +116,17 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda um.clear; } } + + /** + * Whew, the major heavy-hitting for ADAGrad. Let's try to digest this slowly ... + * + * First, it looks like John is defining nsteps and tscale. Not sure what + * these mean... + * + * Second is the heavy-duty part, looping over each model matrix and the other + * matrices involved. What is opts.policies? I'm confused. + */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { val start = toc; modelmats = model.modelmats @@ -121,7 +148,8 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda } val nw = stepn; val nmats = math.min(modelmats.length, updatemats.length); -// println("u sumsq %g" format mini(sumSq(0)).dv) + + // Does something for each model matrix. 
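Roughly speaking, the per-matrix loop that follows combines three ingredients discussed above: ADAGrad normalization of the gradient, an optional momentum or Nesterov velocity, and optional Langevin noise. Here is a minimal plain-Scala sketch of that update for a single scalar parameter; all names (grad, sumSq, vel, mu, langevin) are hypothetical stand-ins, the sqrt corresponds to a vexp of 0.5, and the exact scaling and epsilon handling in BIDMach may differ. The Nesterov branch differs only in how the velocity enters the parameter update, not in the bookkeeping shown here.

    object AdagradMomentumSketch {
      val rng = new scala.util.Random(0)

      // One step for one parameter.
      //   sumSq    - ADAGrad accumulator (running sum of squared gradients)
      //   vel      - momentum buffer, mu - momentum coefficient
      //   langevin - std-dev of additive Gaussian exploration noise
      def step(w: Double, grad: Double, sumSq: Double, vel: Double,
               lrate: Double, mu: Double, langevin: Double,
               eps: Double = 1e-8): (Double, Double, Double) = {
        val s2 = sumSq + grad * grad
        val scaled = grad / (math.sqrt(s2) + eps)          // ADAGrad-normalized gradient
        val noisy = scaled + langevin * rng.nextGaussian() // Langevin-style noise
        val v2 = mu * vel + lrate * noisy                  // classical (heavy-ball) momentum
        (w + v2, s2, v2)
      }

      def main(args: Array[String]): Unit = {
        var w = 0.0; var s = 0.0; var v = 0.0
        for (_ <- 1 to 3) {
          val (w2, s2, v2) = step(w, 0.5, s, v, lrate = 0.1, mu = 0.9, langevin = 0.0)
          w = w2; s = s2; v = v2
          println(f"w = $w%.4f")
        }
      }
    }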
for (i <- 0 until nmats) { if (opts.policies.asInstanceOf[AnyRef] != null) { if (opts.policies.length > 1) { diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 8202c7ff..b47d86ad 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -4,6 +4,7 @@ import BIDMat.{Mat,SBMat,CMat,DMat,FMat,IMat,HMat,GMat,GIMat,GSMat,SMat,SDMat,TM import BIDMat.MatFunctions._ import BIDMat.SciFunctions._ import BIDMach.models._ +import BIDMach.models.Model._ import edu.berkeley.bid.CUMACH /** @@ -68,6 +69,9 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var sumOfSquares:Float = 0f // \sum_{i=1}^b ((N/T)*log(p(x_i|theta')/p(x_i|theta)))^2. var targetVariance:Float = 0f // The target variance (so we only need one X_corr). + // Daniel: experimental, not sure if that belongs. + var aopts:ADAGrad.Opts = null; + /** * Standard initialization. We have: @@ -75,6 +79,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * - n2ld loads the pre-computed X_c variable distribution. * - {delta,proposed,tmp}Theta initialized to zeros with correct dimensions. * - If desired, initialize modelmats with small values to break symmetry. + * - If desired, initialize an internal ADAGrad updater. * * Note that the file for the norm2logdata should be in the correct directory. */ @@ -116,8 +121,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater modelmats(i) <-- normrnd(0, 0.03f, modelmats(i).nrows, modelmats(i).ncols) } } + + // Experimental ... not sure if this is right ... + // val adagrad = new ADAGrad(opts.asInstanceOf[ADAGrad.Opts]) + // adagrad.init(model) } - + /** * This performs the update and the MH test based on a minibatch of data. The @@ -354,6 +363,7 @@ object MHTest { var initThetaHere = false var burnIn = -1 var smf = false + var useInternalADAGrad = false } class Options extends Opts {} From 0ab7a3e247e1e5064d1cf5830fe099186f9cc7e1 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Fri, 17 Mar 2017 17:51:13 -0700 Subject: [PATCH 09/18] Updated MH Test, I think ADAGrad and SMF work now, but ... two issues: (1) CPU allocation, and (2) we always seem to be accepting, I've only seen one time where it rejected. Does that make sense? Note that I had to cut a threshold of -15 as the log prob for SMF, which should be OK. --- scripts/daniel_smf_netflix_adagrad.ssc | 2 + scripts/daniel_smf_netflix_mhtest.ssc | 16 ++--- src/main/scala/BIDMach/models/SMF.scala | 9 ++- src/main/scala/BIDMach/updaters/ADAGrad.scala | 33 ++++++--- src/main/scala/BIDMach/updaters/MHTest.scala | 69 ++++++++++++++----- 5 files changed, 95 insertions(+), 34 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index 2a8dba1d..98b845a4 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -31,6 +31,8 @@ opts.regmmean = opts.lambdam opts.evalStep = 31 opts.doUsers = false opts.lsgd = 0.010f +opts.momentum = FMat(0.5) +opts.langevin = 0.0f opts.what nn.train diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index b0dd100c..ed245afb 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -43,7 +43,7 @@ println("size(a)="+size(a)+", with a.nnz="+a.nnz) // Daniel Seita: stuff for the MH Test updater. OH ... and our N is going to be // super-large. Ahhhh... 
we may need temperature then. opts.N = a.nnz -opts.temp = a.nnz / 1000 +opts.temp = a.nnz / 10000 opts.Nknown = true opts.n2lsigma = 1.0f opts.nn2l = 4000 @@ -53,22 +53,22 @@ opts.verboseMH = false opts.collectData = false opts.collectDataDir = "tmp/" opts.exitTheta = false -opts.initThetaHere = false +opts.initThetaHere = true opts.burnIn = -1 -// IMPORTANT +// IMPORTANT. setting opts.smf=true means we create an ADAGrad class inside. +// We also want to compare with additive random noise, i.e. Langevin stuff. opts.smf = true -opts.useInternalADAGrad = true -opts.momentum = true -opts.nesterov = true -opts.langevin = 0.3f +opts.langevin = 0.0f +opts.momentum = FMat(0.5) +opts.nesterov = null // Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. // Also, the four items here that start with `lambda` or `reg` represent priors. opts.matrixOfScores = true opts.batchSize = 1000 -opts.npasses = 1 +opts.npasses = 3 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index f187c2b4..a3548888 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -303,9 +303,14 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts val diff = DMat(dc - pc); if (opts.matrixOfScores) { // TODO Temporary but should be OK for now (b/c we almost never increment MB). + // The FMat(diff *@ diff) will make a vector, hence the broadcasting. + // Also, I was getting a handful of *really* negative scores where log + // p(.) went to -10000 or so. To prevent that, set a threshold at -15. + //println(sqrt((diff ddot diff)/diff.length)) // Use for debugging and sanity checks. val sigma_sq = variance(diff).dv - //println(sqrt((diff ddot diff)/diff.length)) - -(1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + val scores = -ln(sqrt(2*math.Pi*sigma_sq)).v - (1.0f/(2*sigma_sq)).v * FMat(diff *@ diff) + max(scores, -15f, scores) + scores } else { val vv = diff ddot diff; -sqrt(row(vv/sdata.nnz)) diff --git a/src/main/scala/BIDMach/updaters/ADAGrad.scala b/src/main/scala/BIDMach/updaters/ADAGrad.scala index d48faef3..ed738171 100755 --- a/src/main/scala/BIDMach/updaters/ADAGrad.scala +++ b/src/main/scala/BIDMach/updaters/ADAGrad.scala @@ -33,10 +33,17 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda /** * Initialize ADAGrad model. Note the conditions required to create momentum - * and randmats. I just created opts.momentum = FMat(0) since that's an easy - * way to do it. But we ignore that when initializing `momentum` so it's - * confusing. But I also see later in the code that we *do* use opts.momentum, - * so what's the difference? Same thing with opts.nesterov vs nesterov mats??? + * and randmats. We have opts.momentum but also momentum mats, similar thing + * with nesterov. I would *guess* that opts.momentum is that hyper-parameter, + * while the momemtum mats gives the update for each parameter (i.e. component + * wise). But then what's the hyperparameter for Nesterov? There isn't any? + * BTW for both opts.momentum and opts.nesterov, we can set them to be one + * value or a vector with one per modelmat. If just one value, then it's + * repeated across all modelmats. ALSO, what value to set for opts.langevin? 
+ * + * EDIT: figured it out, opts.nesterov is the same as opts.momentum, assuming + * that we choose to run one of momentum and nesterov's (and not both, which + * wouldn't make sense). */ override def init(model0:Model) = { model = model0 @@ -125,7 +132,16 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda * these mean... * * Second is the heavy-duty part, looping over each model matrix and the other - * matrices involved. What is opts.policies? I'm confused. + * matrices involved. What is opts.policies? I'm confused. Anyway, look at the + * CPU version of the case methods. The other version is the GPU, which I'll + * ignore now. + * + * The CPU version is more readable. First, it deals with some logic regarding + * the sum of squares (?). Second, it applies the Langevin dynamics. Third, + * and the crucial part, it applies either momentum updates or nesterov + * updates. I don't *think* we should be using both momentum or nesterov, at + * least based on this code logic. OH, I see the learning rate applied, then + * momentum (or nesterov) and indeed it matches the algorithm formulation. */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { val start = toc; @@ -149,7 +165,8 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda val nw = stepn; val nmats = math.min(modelmats.length, updatemats.length); - // Does something for each model matrix. + // This is applied to each model matrix separately. + // NOTE!! I changed momentum to be momentum(i); see my GitHub issue. for (i <- 0 until nmats) { if (opts.policies.asInstanceOf[AnyRef] != null) { if (opts.policies.length > 1) { @@ -170,10 +187,10 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda case (gmm:GMat, gum:GMat, gss:GMat, gve:GMat, gts:GMat, glrate:GMat) => { if (opts.momentum.asInstanceOf[AnyRef] != null) { val mu = if (opts.momentum.length > 1) opts.momentum(i) else opts.momentum(0); - ADAGrad.ADAGradm(gmm, gum, gss, momentum.asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); + ADAGrad.ADAGradm(gmm, gum, gss, momentum(i).asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } else if (opts.nesterov.asInstanceOf[AnyRef] != null) { val mu = if (opts.nesterov.length > 1) opts.nesterov(i) else opts.nesterov(0); - ADAGrad.ADAGradn(gmm, gum, gss, momentum.asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); + ADAGrad.ADAGradn(gmm, gum, gss, momentum(i).asInstanceOf[GMat], mu, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } else { ADAGrad.ADAGradx(gmm, gum, gss, mask.asInstanceOf[GMat], nw.dv.toFloat, gve, gts, glrate, opts.langevin, opts.epsilon, (opts.waitsteps < nsteps)); } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index b47d86ad..07303e13 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -69,9 +69,9 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var sumOfSquares:Float = 0f // \sum_{i=1}^b ((N/T)*log(p(x_i|theta')/p(x_i|theta)))^2. var targetVariance:Float = 0f // The target variance (so we only need one X_corr). 
- // Daniel: experimental, not sure if that belongs. - var aopts:ADAGrad.Opts = null; - + // Daniel: experimental, for the SMF. + var adagrad:ADAGrad = null; + var tmpMomentum:Array[Mat] = null /** * Standard initialization. We have: @@ -84,7 +84,6 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * Note that the file for the norm2logdata should be in the correct directory. */ override def init(model0:Model) = { - setseed(1) model = model0; modelmats = model.modelmats updatemats = model.updatemats @@ -122,9 +121,15 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } } - // Experimental ... not sure if this is right ... - // val adagrad = new ADAGrad(opts.asInstanceOf[ADAGrad.Opts]) - // adagrad.init(model) + if (opts.smf) { + // This should force adagrad.momentum(i) = momentum(i) in the rest of this code. + adagrad = new ADAGrad(opts.asInstanceOf[ADAGrad.Opts]) + adagrad.init(model) + tmpMomentum = new Array[Mat](nmats) + for (i <- 0 until nmats) { + tmpMomentum(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) + } + } } @@ -140,7 +145,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * datasource size and the number of (training) passes. */ override def update(ipass:Int, step:Long, gprogress:Float):Unit = { - if (newMinibatch) beforeEachMinibatch() + if (newMinibatch) beforeEachMinibatch(ipass, step, gprogress) n += 1.0f // (Part 1) Compute scores for theta and theta', scaled by N/T. @@ -168,7 +173,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater sumOfSquares += sum((diff)*@(diff)).v sumOfValues += sum(diff).v val deltaStar = sumOfValues/b.v - logu - val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v + val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd) @@ -226,6 +231,9 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } else { for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. + if (opts.smf) { + adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. + } } } @@ -236,20 +244,48 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** * Stuff we should do before each minibatch. This involves calling the * proposer, resetting some values, and saving the current model matrix into - * `tmpTheta` so we can restore it later when needed. + * `tmpTheta` so we can restore it later when needed. Here, we want to set the + * proposer matrices, so that when we continue in uupdate, we have the current + * and proposed model matrices stored in modelmats and proposedTheta, + * respectively. + * + * Also, we have a different (i.e. better!) proposer with ADAGrad. The update + * *should* affect all of the modelmats(i) due to aliasing (since it changes + * adagrad.modelmats(i)). However, this doesn't put it in proposedTheta, so + * here's a workaround: get the modelmats stored into tmpTheta. Then do the + * update, which will update modelmats to the proposed matrices. Then copy + * those into propsoedTheta, and then get current modelmats back to tmpTheta + * (i.e. so modelmats remains the same before and after, and it's just the + * proposedTheta which changes). 
With momentum, fortunately it's simpler, we + * have that in adagrad.momentum and simply copy the old state into + * tmpMomentum. */ - def beforeEachMinibatch() { + def beforeEachMinibatch(ipass:Int, step:Long, gprogress:Float) { if (opts.verboseMH) println("\n\tNew minibatch!") - randomWalkProposer() + + for (i <- 0 until modelmats.length) { + tmpTheta(i) <-- modelmats(i) + if (opts.smf) { + tmpMomentum(i) <-- adagrad.momentum(i) + } + } + + if (opts.smf) { + adagrad.update(ipass, step, gprogress) + for (i <- 0 until modelmats.length) { + proposedTheta(i) <-- adagrad.modelmats(i) // adagrad.modelmats(i) = modelmats(i) + modelmats(i) <-- tmpTheta(i) // Should make adagrad.modelmats(i) back to what it was before. + } + } else { + randomWalkProposer() + } + logu = ln(rand(1,1)).v newMinibatch = false b = 0 n = 0 sumOfValues = 0f sumOfSquares = 0f - for (i <- 0 until modelmats.length) { - tmpTheta(i) <-- modelmats(i) - } } @@ -337,6 +373,8 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater val s0 = mean(scores0).dv println("b="+b+", n="+n+", logu="+logu) println("mean(scores1) = "+s1+" - mean(scores0) = "+s0+" = "+(s1-s0)) + println("maxi(scores1) = "+maxi(scores1)+", maxi(scores0) = "+maxi(scores0)) + println("mini(scores1) = "+mini(scores1)+", mini(scores0) = "+mini(scores0)) println("sampleVar = " +sampleVariance+ ", delta* = " +deltaStar+ ", numStd = " +numStd) } @@ -363,7 +401,6 @@ object MHTest { var initThetaHere = false var burnIn = -1 var smf = false - var useInternalADAGrad = false } class Options extends Opts {} From b1178e4c083284c0d3e3f3becd1cdfb14de21e1c Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Sat, 18 Mar 2017 15:23:53 -0700 Subject: [PATCH 10/18] OK the memory allocation stuff is fine, not really worried now. And I can track acceptance rates --- bidmach | 2 +- scripts/daniel_smf_netflix_mhtest.ssc | 15 +++-- src/main/scala/BIDMach/models/SMF.scala | 17 +++++ src/main/scala/BIDMach/updaters/MHTest.scala | 65 +++++++++++++++----- 4 files changed, 78 insertions(+), 21 deletions(-) diff --git a/bidmach b/bidmach index db39c8be..9bc38a2c 100755 --- a/bidmach +++ b/bidmach @@ -2,7 +2,7 @@ # export JAVA_HOME="" # Set here if not set in environment # export CUDA_PATH="" # Set here if not set in environment -MEMSIZE="-Xmx14G" +MEMSIZE="-Xmx40G" export JAVA_OPTS="${MEMSIZE} -Xms128M -Dfile.encoding=UTF-8" # Set as much memory as possible BIDMACH_ROOT="${BASH_SOURCE[0]}" if [ ! `uname` = "Darwin" ]; then diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index ed245afb..a67cce05 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -62,13 +62,14 @@ opts.smf = true opts.langevin = 0.0f opts.momentum = FMat(0.5) opts.nesterov = null +opts.saveAcceptRate = true -// Daniel Seita: actually, a batch size of 2000 means we may get 100k "elements" +// Daniel Seita: actually, a batch size of 2000 means we may get 300k "elements" // due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. // Also, the four items here that start with `lambda` or `reg` represent priors. 
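To make the ADAGrad-driven proposer described in beforeEachMinibatch above easier to follow, here is a minimal plain-Scala sketch of the same save/propose/restore dance. It is not BIDMach code: a mutable Array stands in for a model matrix, and the hypothetical inPlaceUpdate stands in for adagrad.update, which mutates the model matrices through aliasing.

    object ProposerSketch {
      // Stand-in for adagrad.update: mutates the parameters in place.
      def inPlaceUpdate(theta: Array[Double]): Unit =
        for (i <- theta.indices) theta(i) += 0.1 * (i + 1)

      // Returns the proposal while leaving `theta` exactly as it was found.
      def propose(theta: Array[Double]): Array[Double] = {
        val saved = theta.clone()                     // tmpTheta <-- modelmats
        inPlaceUpdate(theta)                          // modelmats now hold the proposal
        val proposal = theta.clone()                  // proposedTheta <-- modelmats
        Array.copy(saved, 0, theta, 0, theta.length)  // modelmats <-- tmpTheta (restore)
        proposal
      }

      def main(args: Array[String]): Unit = {
        val theta = Array(1.0, 2.0, 3.0)
        val proposal = propose(theta)
        println(proposal.mkString(" "))  // proposed parameters
        println(theta.mkString(" "))     // current parameters, unchanged
      }
    }

The momentum buffer needs the same treatment, which is what tmpMomentum is for.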
opts.matrixOfScores = true opts.batchSize = 1000 -opts.npasses = 3 +opts.npasses = 4 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f @@ -93,10 +94,12 @@ mopts.lsgd = 0.0f mm.predict val pa = SMat(mm.preds(1)); -min(pa.contents,5,pa.contents) -max(pa.contents,1,pa.contents) +println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) val diff = ta.contents - pa.contents val rmse = sqrt((diff ddot diff) / diff.length) println("rmse = %f" format rmse.v) -println("size(mm.preds(0)) = " +size(mm.preds(0))) -println("size(mm.preds(1)) = " +size(pa)) +min(pa.contents,5,pa.contents) +max(pa.contents,1,pa.contents) +val diff2 = ta.contents - pa.contents +val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) +println("rmse (w/clipping) = %f" format rmse2.v) diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index a3548888..87e9b188 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -72,12 +72,16 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts var epsilon = 0f; var aopts:ADAGrad.Opts = null; + // Daniel: doing this to set MB sizes in MHTest code. + var numNonzerosMB:Int = -1 + override def init() = { // Get dimensions; for Netflix, size(mats(0)) = (17770,batchSize). mats = datasource.next; datasource.reset; nfeats = mats(0).nrows; val batchSize = mats(0).ncols; + numNonzerosMB = mats(0).nnz val d = opts.dim; if (refresh) { @@ -334,6 +338,11 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts val dc = sdata.contents; val pc = spreds.contents; val vv = (dc - pc) ddot (dc - pc); + + println("mean values: "+mean(dc)+" "+mean(pc)+" "+mean(vv)) + println("max values: "+maxi(dc)+" "+maxi(pc)) + println("min values: "+mini(dc)+" "+mini(pc)) + if (ogmats != null) { ogmats(0) = user; if (ogmats.length > 1) { @@ -342,6 +351,14 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts } -sqrt(row(vv/sdata.nnz)) } + + + /** So I can set the MHTest container size appropriately. */ + def getNonzeros():Int = { + return numNonzerosMB + } + + } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 07303e13..8937ef7f 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -70,8 +70,10 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var targetVariance:Float = 0f // The target variance (so we only need one X_corr). // Daniel: experimental, for the SMF. + var currentSizeSMF:Int = -1; var adagrad:ADAGrad = null; var tmpMomentum:Array[Mat] = null + var acceptanceRate:Mat = null /** * Standard initialization. 
We have: @@ -87,9 +89,21 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater model = model0; modelmats = model.modelmats updatemats = model.updatemats - scores0 = zeros(1,model.datasource.opts.batchSize) - scores1 = zeros(1,model.datasource.opts.batchSize) - diff = zeros(1,model.datasource.opts.batchSize) + acceptanceRate = zeros(1, opts.exitThetaAmount * 10) + if (opts.smf) { + val numnnz = model.asInstanceOf[SMF].getNonzeros() + if (numnnz < 0) { + println("Something wrong happened, numnnz="+numnnz) + sys.exit + } + scores0 = zeros(1, numnnz*10) + scores1 = zeros(1, numnnz*10) + diff = zeros(1, numnnz*10) + } else { + scores0 = zeros(1, model.datasource.opts.batchSize) + scores1 = zeros(1, model.datasource.opts.batchSize) + diff = zeros(1, model.datasource.opts.batchSize) + } T = opts.temp if (opts.Nknown) { @@ -138,6 +152,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * original data is split up into equal-sized minibatches in the Learner code. * (The last minibatch is ignored since it generally has a different size.) * + * SMF.scala necessitates extra cases to handle the varying batch sizes. They + * differ across "minibatches" so the scores at "the end" have to be cleared + * (but since scores are only for current MB, just call "clear"), the number + * of nonzeros have to be computed, and then the scores are copied to the + * appropriate interval. EDIT: ugh, never mind, doesn't work ... + * * @param ipass The current pass over the full (training) data. * @param step Progress within the current minibatch, indicated as a numerical * index representing the starting column of this minibatch. @@ -150,28 +170,39 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater // (Part 1) Compute scores for theta and theta', scaled by N/T. if (opts.smf) { - scores0 = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + currentSizeSMF = model.datasource.omats(0).nnz + b += currentSizeSMF + scores0.clear + scores0(0 -> currentSizeSMF) = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)).t } else { scores0 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) + b += scores0.length } - b += scores0.length // With SMF, using scores0.length as the MB size generalizes better. if (scores0.length == 1) { throw new RuntimeException("Need individual scores, but getting a scalar.") } + for (i <- 0 until modelmats.length) { modelmats(i) <-- proposedTheta(i) } if (opts.smf) { - scores1 = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) - diff = scores1 - scores0 + scores1.clear + scores1(0 -> currentSizeSMF) = (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)).t + diff.clear + diff(0 -> currentSizeSMF) = scores1(0 -> currentSizeSMF) - scores0(0 -> currentSizeSMF) } else { scores1 <-- (model.evalbatchg(model.datasource.omats, ipass, step) * (N/T.dv)) diff ~ scores1 - scores0 } // (Part 2) Update our \Delta* and sample variance of \Delta*. 
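As an aside on what Parts 2 and 3 compute from these per-example score differences: the running mean gives Delta* (after subtracting logu), the running second moment gives the sample variance of that mean, and their ratio decides whether the minibatch already determines the outcome, needs more data, or can be finished off with the normal and correction noise. A minimal plain-Scala sketch of that decision follows, with the X_corr draw replaced by a plain Gaussian placeholder instead of the precomputed norm2log table, and with the simpler thresholds used at this point in the series:

    object MhTestSketch {
      val rng = new scala.util.Random(0)

      // diffs: per-example (N/T) * (log p(x_i|theta') - log p(x_i|theta))
      // logu:  the log-uniform draw (renamed psi later in this series)
      // Returns Some(accept) when a decision is possible, None if more data is needed.
      def decide(diffs: Array[Double], logu: Double, targetVariance: Double): Option[Boolean] = {
        val b = diffs.length.toDouble
        val mean = diffs.sum / b
        val meanSq = diffs.map(d => d * d).sum / b
        val deltaStar = mean - logu
        val sampleVariance = (meanSq - mean * mean) / b   // variance of the mean estimate
        val numStd = deltaStar / math.sqrt(sampleVariance)
        if (math.abs(numStd) > 5.0) {
          Some(deltaStar > 0)                             // abnormally clear-cut minibatch
        } else if (sampleVariance >= targetVariance) {
          None                                            // keep growing this minibatch
        } else {
          val xn = math.sqrt(targetVariance - sampleVariance) * rng.nextGaussian()
          val xc = rng.nextGaussian()                     // placeholder for the X_corr draw
          Some(deltaStar + xn + xc > 0)
        }
      }

      def main(args: Array[String]): Unit = {
        val diffs = Array.fill(500)(0.02 + 0.1 * rng.nextGaussian())
        println(decide(diffs, logu = math.log(0.5), targetVariance = 1.0))
      }
    }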
- sumOfSquares += sum((diff)*@(diff)).v - sumOfValues += sum(diff).v + if (opts.smf) { + sumOfSquares += sum((diff(0 -> currentSizeSMF)) *@ (diff(0 -> currentSizeSMF))).v + sumOfValues += sum(diff(0 -> currentSizeSMF)).v + } else { + sumOfSquares += sum((diff)*@(diff)).v + sumOfValues += sum(diff).v + } val deltaStar = sumOfValues/b.v - logu val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) @@ -227,7 +258,8 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater if (accept) { for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) // Now tmpTheta has proposed theta. - } + } + acceptanceRate(t) = 1 } else { for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. @@ -235,6 +267,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. } } + acceptanceRate(t) = 0 } if (newMinibatch) afterEachMinibatch() @@ -312,6 +345,9 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater T = opts.tempAfterBurnin opts.sigmaProposer = opts.sigmaProposerAfterBurnin } + if (opts.saveAcceptRate) { + saveMat("acceptRate.mat.lz4", acceptanceRate) + } } @@ -369,12 +405,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double) { - val s1 = mean(scores1).dv - val s0 = mean(scores0).dv + val s1 = mean(scores1(0 -> b.toInt)).dv + val s0 = mean(scores0(0 -> b.toInt)).dv println("b="+b+", n="+n+", logu="+logu) println("mean(scores1) = "+s1+" - mean(scores0) = "+s0+" = "+(s1-s0)) - println("maxi(scores1) = "+maxi(scores1)+", maxi(scores0) = "+maxi(scores0)) - println("mini(scores1) = "+mini(scores1)+", mini(scores0) = "+mini(scores0)) + println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) + println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) println("sampleVar = " +sampleVariance+ ", delta* = " +deltaStar+ ", numStd = " +numStd) } @@ -401,6 +437,7 @@ object MHTest { var initThetaHere = false var burnIn = -1 var smf = false + var saveAcceptRate = false } class Options extends Opts {} From ec31c767f0e76c907089eb3a273c3edea75aadb5 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Sat, 18 Mar 2017 15:38:26 -0700 Subject: [PATCH 11/18] OK enough debugprints. Now figure out what to do for thep aper --- scripts/daniel_smf_netflix_adagrad.ssc | 15 ++++++++++----- scripts/daniel_smf_netflix_mhtest.ssc | 2 +- src/main/scala/BIDMach/models/SMF.scala | 7 ++++--- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index 98b845a4..744b7c16 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -19,10 +19,10 @@ val (nn,opts) = SMF.learner1(a, d) // Daniel Seita: I'm not sure if these values are good for netflix. 
opts.batchSize = 1000 +opts.npasses = 1 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f -opts.npasses = 1 val lambda = 4f opts.lambdau = lambda opts.regumean = lambda @@ -30,9 +30,11 @@ opts.lambdam = lambda / 500000 * 20 opts.regmmean = opts.lambdam opts.evalStep = 31 opts.doUsers = false -opts.lsgd = 0.010f + opts.momentum = FMat(0.5) +opts.nesterov = null opts.langevin = 0.0f + opts.what nn.train @@ -46,9 +48,12 @@ mopts.lsgd = 0.0f mm.predict val pa = SMat(mm.preds(1)); -min(pa.contents,5,pa.contents) -max(pa.contents,1,pa.contents) +println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) val diff = ta.contents - pa.contents val rmse = sqrt((diff ^* diff) / diff.length) println("rmse = %f" format rmse.v) -//sys.exit +min(pa.contents,5,pa.contents) +max(pa.contents,1,pa.contents) +val diff2 = ta.contents - pa.contents +val rmse2 = sqrt((diff2 ^* diff2) / diff2.length) +println("rmse (w/clipping) = %f" format rmse2.v) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index a67cce05..ba34cbad 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -69,7 +69,7 @@ opts.saveAcceptRate = true // Also, the four items here that start with `lambda` or `reg` represent priors. opts.matrixOfScores = true opts.batchSize = 1000 -opts.npasses = 4 +opts.npasses = 3 opts.uiter = 5 opts.urate = 0.05f opts.lrate = 0.05f diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 87e9b188..c60d5e7f 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -339,9 +339,10 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts val pc = spreds.contents; val vv = (dc - pc) ddot (dc - pc); - println("mean values: "+mean(dc)+" "+mean(pc)+" "+mean(vv)) - println("max values: "+maxi(dc)+" "+maxi(pc)) - println("min values: "+mini(dc)+" "+mini(pc)) + println("mean values (train/t.pred): "+mean(dc)+" "+mean(pc)) + println("std. values (train/t.pred): "+sqrt(variance(dc))+" "+sqrt(variance(pc))) + println("max. values (train/t.pred): "+maxi(dc)+" "+maxi(pc)) + println("min. values (train/t.pred): "+mini(dc)+" "+mini(pc)) if (ogmats != null) { ogmats(0) = user; From ae2472abf7600c8b11abe770a67243fe8dd12702 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Sat, 18 Mar 2017 16:26:34 -0700 Subject: [PATCH 12/18] OK this should be the style of script to look for different values. Now let's wait to see what John thinks of the proposal issue funcitno. Then I can benchmark with different tests but same hyperparameters for withMHTest and noMHTest --- scripts/daniel_smf_netflix_adagrad.ssc | 100 +++++++++++++------------ 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index 744b7c16..f18db6af 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -2,58 +2,64 @@ import BIDMach.models.SMF /** - * Test SMF code on netflix data. This will use default ADAGrad, which already - * gets roughly 0.845 RMSE so I assume we have to beat that. We'll also need to - * run this with different random seeds. + * Test SMF code on netflix data. This will use default ADAGrad. In general, + * RMSEs of roughly 0.83 to 0.85 are "good". */ -// Get random seed set up. -// TODO random seed code - -// Now get back to the real netflix data. 
First, load data and set things up: val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 -val (nn,opts) = SMF.learner1(a, d) - -// Daniel Seita: I'm not sure if these values are good for netflix. -opts.batchSize = 1000 -opts.npasses = 1 -opts.uiter = 5 -opts.urate = 0.05f -opts.lrate = 0.05f -val lambda = 4f -opts.lambdau = lambda -opts.regumean = lambda -opts.lambdam = lambda / 500000 * 20 -opts.regmmean = opts.lambdam -opts.evalStep = 31 -opts.doUsers = false - -opts.momentum = FMat(0.5) -opts.nesterov = null -opts.langevin = 0.0f - -opts.what -nn.train +val lrates = row(0.0001, 0.001, 0.01, 0.1) +val momens = row(0.5, 0.75, 0.9, 0.95) +var bestrmse = 10.0; -val model = nn.model.asInstanceOf[SMF] -val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa) -mopts.batchSize = 10000 -mopts.uiter = 5 -mopts.urate = opts.urate -mopts.lsgd = 0.0f -mm.predict +for (i <- 0 until lrates.length) { + for (j <- 0 until momens.length) { + val (nn,opts) = SMF.learner1(a, d) + opts.batchSize = 1000 + opts.npasses = 2 + opts.momentum = momens(j) + opts.nesterov = null + opts.langevin = 0.0f + + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i) + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.lsgd = 0.0f + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ^* diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ^* diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + } +} -val pa = SMat(mm.preds(1)); -println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) -val diff = ta.contents - pa.contents -val rmse = sqrt((diff ^* diff) / diff.length) -println("rmse = %f" format rmse.v) -min(pa.contents,5,pa.contents) -max(pa.contents,1,pa.contents) -val diff2 = ta.contents - pa.contents -val rmse2 = sqrt((diff2 ^* diff2) / diff2.length) -println("rmse (w/clipping) = %f" format rmse2.v) +println("Best RMSE: "+bestrmse) +sys.exit From 9bb1b30ff465f75880b99821509ff40dc7ad58f8 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Sun, 19 Mar 2017 13:59:44 -0700 Subject: [PATCH 13/18] Let's try this to run these in batch mode. To check out performance at different settings, etc. --- scripts/daniel_smf_netflix_adagrad.ssc | 97 +++++++------ scripts/daniel_smf_netflix_mhtest.ssc | 141 ++++++++++--------- src/main/scala/BIDMach/models/SMF.scala | 8 +- src/main/scala/BIDMach/updaters/MHTest.scala | 37 +++-- 4 files changed, 158 insertions(+), 125 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index f18db6af..246a1e2e 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -6,60 +6,71 @@ import BIDMach.models.SMF * RMSEs of roughly 0.83 to 0.85 are "good". */ +// Same code as in the MHTest+ADAGrad script. 
+setseed(0) val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 val lrates = row(0.0001, 0.001, 0.01, 0.1) -val momens = row(0.5, 0.75, 0.9, 0.95) +val momens = row(0.5, 0.75, 0.9, 0.95, 0.99) +val langs = row(0.0, 0.05, 0.5) var bestrmse = 10.0; +var prettystring = "moment. lrate lang. rmse\n" for (i <- 0 until lrates.length) { for (j <- 0 until momens.length) { - val (nn,opts) = SMF.learner1(a, d) - opts.batchSize = 1000 - opts.npasses = 2 - opts.momentum = momens(j) - opts.nesterov = null - opts.langevin = 0.0f - - opts.uiter = 5 - opts.urate = 0.05f - opts.lrate = lrates(i) - val lambda = 4f - opts.lambdau = lambda - opts.regumean = lambda - opts.lambdam = lambda / 500000 * 20 - opts.regmmean = opts.lambdam - opts.evalStep = 31 - opts.doUsers = false - opts.what - nn.train - - val model = nn.model.asInstanceOf[SMF] - val xa = (ta != 0) - val (mm, mopts) = SMF.predictor1(model, a, xa) - mopts.batchSize = 10000 - mopts.uiter = 5 - mopts.urate = opts.urate - mopts.lsgd = 0.0f - mm.predict - - val pa = SMat(mm.preds(1)); - println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) - val diff = ta.contents - pa.contents - val rmse = sqrt((diff ^* diff) / diff.length) - println("\nrmse = %f" format rmse.v) - min(pa.contents,5,pa.contents) - max(pa.contents,1,pa.contents) - val diff2 = ta.contents - pa.contents - val rmse2 = sqrt((diff2 ^* diff2) / diff2.length) - println("rmse (w/clipping) = %f\n" format rmse2.v) - if (rmse2.v < bestrmse) { - bestrmse = rmse2.v + for (k <- 0 until langs.length) { + val (nn,opts) = SMF.learner1(a, d) + + // Common parameters with the MHTest+ADAGrad version. + opts.batchSize = 1000 + opts.npasses = 2 + opts.momentum = momens(j) + opts.nesterov = null + opts.langevin = langs(k).v + + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) + + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + prettystring += "%1.3f %1.5f %1.3f %1.4f\n" format (momens(j).v,lrates(i).v,langs(k).v,rmse2.v) } } } -println("Best RMSE: "+bestrmse) +println("\nBest RMSE: "+bestrmse+ "\n") +println(prettystring) sys.exit diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index ba34cbad..3d206b46 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -29,77 +29,90 @@ import BIDMach.models.SMF * BTW: (a *@ ta).nnz = 0, which shows that they are completely distinct. */ -// Get random seed set up. -// TODO random seed code - -// Now get back to the real netflix data. First, load data and set things up: +// Same code as in the ADAGrad-only script. 
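Since both grid-search scripts now end by reporting RMSE before and after clipping predictions to the valid 1..5 rating range, here is a small plain-Scala helper with the same arithmetic (hypothetical dense arrays instead of the sparse-matrix contents used in the scripts):

    object RmseSketch {
      def rmse(truth: Array[Double], pred: Array[Double]): Double = {
        require(truth.length == pred.length)
        val se = truth.zip(pred).map { case (t, p) => (t - p) * (t - p) }.sum
        math.sqrt(se / truth.length)
      }

      def clip(x: Double, lo: Double = 1.0, hi: Double = 5.0): Double =
        math.max(lo, math.min(hi, x))

      def main(args: Array[String]): Unit = {
        val truth = Array(5.0, 3.0, 1.0, 4.0)
        val pred  = Array(5.7, 2.9, 0.2, 4.1)
        println(f"rmse              = ${rmse(truth, pred)}%.4f")
        println(f"rmse (w/clipping) = ${rmse(truth, pred.map(clip(_)))}%.4f")
      }
    }

Clipping can never increase the RMSE here, since every true rating already lies inside the clipped range.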
+setseed(0) val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 -val (nn,opts) = SMF.learner2(a, d) -println("size(a)="+size(a)+", with a.nnz="+a.nnz) +val lrates = row(0.0001, 0.001, 0.01) +val momens = row(0.5, 0.75, 0.9, 0.95, 0.99) +val langs = row(0.0, 0.05, 0.5) +var bestrmse = 10.0; +var prettystring = "moment. lrate lang. rmse\n" -// Daniel Seita: stuff for the MH Test updater. OH ... and our N is going to be -// super-large. Ahhhh... we may need temperature then. -opts.N = a.nnz -opts.temp = a.nnz / 10000 -opts.Nknown = true -opts.n2lsigma = 1.0f -opts.nn2l = 4000 -opts.sigmaProposer = 0.01f -opts.continueDespiteFull = false -opts.verboseMH = false -opts.collectData = false -opts.collectDataDir = "tmp/" -opts.exitTheta = false -opts.initThetaHere = true -opts.burnIn = -1 +for (i <- 0 until lrates.length) { + for (j <- 0 until momens.length) { + for (k <- 0 until langs.length) { + val (nn,opts) = SMF.learner2(a, d) -// IMPORTANT. setting opts.smf=true means we create an ADAGrad class inside. -// We also want to compare with additive random noise, i.e. Langevin stuff. -opts.smf = true -opts.langevin = 0.0f -opts.momentum = FMat(0.5) -opts.nesterov = null -opts.saveAcceptRate = true + // Common parameters with the ADAGrad version. + opts.batchSize = 1000 + opts.npasses = 2 + opts.momentum = momens(j) + opts.nesterov = null + opts.langevin = langs(k).v -// Daniel Seita: actually, a batch size of 2000 means we may get 300k "elements" -// due to the sparsity. So I'm thinking we stick to batch sizes of 1000 or less. -// Also, the four items here that start with `lambda` or `reg` represent priors. -opts.matrixOfScores = true -opts.batchSize = 1000 -opts.npasses = 3 -opts.uiter = 5 -opts.urate = 0.05f -opts.lrate = 0.05f -val lambda = 4f -opts.lambdau = lambda -opts.regumean = lambda -opts.lambdam = lambda / 500000 * 20 -opts.regmmean = opts.lambdam -opts.evalStep = 31 -opts.doUsers = false -opts.what -nn.train + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false -val model = nn.model.asInstanceOf[SMF] -val xa = (ta != 0) -val (mm, mopts) = SMF.predictor1(model, a, xa) + // Now some stuff specific for the MHTest+ADAGrad. 
+ opts.smf = true + opts.saveAcceptRate = true + opts.acceptRateDir = "tmp/" + opts.N = a.nnz + opts.temp = a.nnz / 10000 + opts.Nknown = true + opts.n2lsigma = 1.0f + opts.nn2l = 4000 + opts.sigmaProposer = 0.01f + opts.continueDespiteFull = false + opts.verboseMH = false + opts.collectData = false + opts.collectDataDir = "tmp/" + opts.exitTheta = false + opts.initThetaHere = true + opts.burnIn = -1 + opts.matrixOfScores = true + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) -mopts.batchSize = 10000 -mopts.uiter = 5 -mopts.urate = opts.urate -mopts.lsgd = 0.0f -mm.predict + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + prettystring += "%1.3f %1.5f %1.3f %1.4f\n" format (momens(j).v,lrates(i).v,langs(k).v,rmse2.v) + } + } +} -val pa = SMat(mm.preds(1)); -println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) -val diff = ta.contents - pa.contents -val rmse = sqrt((diff ddot diff) / diff.length) -println("rmse = %f" format rmse.v) -min(pa.contents,5,pa.contents) -max(pa.contents,1,pa.contents) -val diff2 = ta.contents - pa.contents -val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) -println("rmse (w/clipping) = %f" format rmse2.v) +println("\nBest RMSE: "+bestrmse+ "\n") +println(prettystring) +sys.exit diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index c60d5e7f..7bd3e00b 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -339,10 +339,10 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts val pc = spreds.contents; val vv = (dc - pc) ddot (dc - pc); - println("mean values (train/t.pred): "+mean(dc)+" "+mean(pc)) - println("std. values (train/t.pred): "+sqrt(variance(dc))+" "+sqrt(variance(pc))) - println("max. values (train/t.pred): "+maxi(dc)+" "+maxi(pc)) - println("min. values (train/t.pred): "+mini(dc)+" "+mini(pc)) + println("mean values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(mean(dc)).v,FMat(mean(pc)).v)) + println("std. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(sqrt(variance(dc))).v,FMat(sqrt(variance(pc))).v)) + println("max. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(maxi(dc)).v,FMat(maxi(pc)).v)) + println("min. values (train/t.pred): \t%1.4f\t%1.4f" format (FMat(mini(dc)).v,FMat(mini(pc)).v)) if (ogmats != null) { ogmats(0) = user; diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 8937ef7f..f852e962 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -5,7 +5,8 @@ import BIDMat.MatFunctions._ import BIDMat.SciFunctions._ import BIDMach.models._ import BIDMach.models.Model._ -import edu.berkeley.bid.CUMACH +import edu.berkeley.bid.CUMACH +import BIDMach.Learner /** * Our fast MH test. 
See: @@ -64,7 +65,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var n:Float = 0f // The *number* of minibatches we are using. var logu:Float = 0f // log u, since we assume a symmetric proposer. var T:Int = 1 // The temperature of the distribution. - var t:Int = 0 // Current number of samples of theta. + var _t:Int = 0 // Current number of samples of theta. var sumOfValues:Float = 0f // \sum_{i=1}^b (N/T)*log(p(x_i|theta')/p(x_i|theta)). var sumOfSquares:Float = 0f // \sum_{i=1}^b ((N/T)*log(p(x_i|theta')/p(x_i|theta)))^2. var targetVariance:Float = 0f // The target variance (so we only need one X_corr). @@ -259,7 +260,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) // Now tmpTheta has proposed theta. } - acceptanceRate(t) = 1 + acceptanceRate(_t) = 1 } else { for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. @@ -267,10 +268,10 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. } } - acceptanceRate(t) = 0 + acceptanceRate(_t) = 0 } - if (newMinibatch) afterEachMinibatch() + if (newMinibatch) afterEachMinibatch(ipass, gprogress) } @@ -328,25 +329,32 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * logic about the burn-in period, and also exit the program if we reach the * desired number of samples. */ - def afterEachMinibatch() { - t += 1 + def afterEachMinibatch(ipass:Int, gprogress:Float) { + _t += 1 if (opts.collectData) { for (i <- 0 until modelmats.length) { - saveFMat(opts.collectDataDir+ "theta_%d_%04d.fmat.lz4" format (i,t), FMat(modelmats(i))) + saveFMat(opts.collectDataDir+ "theta_%d_%04d.fmat.lz4" format (i,_t), FMat(modelmats(i))) } - saveFMat(opts.collectDataDir+ "data_%04d.fmat.lz4" format (t), FMat(b)) + saveFMat(opts.collectDataDir+ "data_%04d.fmat.lz4" format (_t), FMat(b)) } - if (t == opts.exitThetaAmount && opts.exitTheta) { - println("Exiting code now since t=" +t) + if (_t == opts.exitThetaAmount && opts.exitTheta) { + println("Exiting code now since t=" +_t) sys.exit } - if (t == opts.burnIn) { + if (_t == opts.burnIn) { println("ALERT: Past burn-in period. 
Now change temperature, proposer, etc.") T = opts.tempAfterBurnin opts.sigmaProposer = opts.sigmaProposerAfterBurnin } - if (opts.saveAcceptRate) { - saveMat("acceptRate.mat.lz4", acceptanceRate) + if (opts.smf) { + if (opts.saveAcceptRate && (ipass+1) == opts.asInstanceOf[Learner.Options].npasses + && gprogress > 0.99) { + val mom = adagrad.opts.momentum(0).v + val lr = adagrad.opts.lrate.v + val lang = adagrad.opts.langevin(0).v + saveMat(opts.acceptRateDir+"arate_%1.3f_%1.4f_%1.3f.mat.lz4" format (mom,lr,lang), + acceptanceRate(0 -> _t)) + } } } @@ -438,6 +446,7 @@ object MHTest { var burnIn = -1 var smf = false var saveAcceptRate = false + var acceptRateDir = "tmp/" } class Options extends Opts {} From 888d67758c573336cc549c3adb28bdb3c07ac36f Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Tue, 21 Mar 2017 16:01:25 -0700 Subject: [PATCH 14/18] more slight script updates --- scripts/daniel_smf_netflix_mhtest.ssc | 14 +++++++------- src/main/scala/BIDMach/updaters/MHTest.scala | 17 ++++++++++------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 3d206b46..7b172b0f 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -35,9 +35,9 @@ val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 -val lrates = row(0.0001, 0.001, 0.01) -val momens = row(0.5, 0.75, 0.9, 0.95, 0.99) -val langs = row(0.0, 0.05, 0.5) +val lrates = row(0.01) +val momens = row(0.95) +val langs = row(0.05) var bestrmse = 10.0; var prettystring = "moment. lrate lang. rmse\n" @@ -47,7 +47,7 @@ for (i <- 0 until lrates.length) { val (nn,opts) = SMF.learner2(a, d) // Common parameters with the ADAGrad version. - opts.batchSize = 1000 + opts.batchSize = 10 opts.npasses = 2 opts.momentum = momens(j) opts.nesterov = null @@ -66,16 +66,16 @@ for (i <- 0 until lrates.length) { // Now some stuff specific for the MHTest+ADAGrad. opts.smf = true - opts.saveAcceptRate = true + opts.saveAcceptRate = false opts.acceptRateDir = "tmp/" opts.N = a.nnz - opts.temp = a.nnz / 10000 + opts.temp = a.nnz / 1000 opts.Nknown = true opts.n2lsigma = 1.0f opts.nn2l = 4000 opts.sigmaProposer = 0.01f opts.continueDespiteFull = false - opts.verboseMH = false + opts.verboseMH = true opts.collectData = false opts.collectDataDir = "tmp/" opts.exitTheta = false diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index f852e962..8d2eba51 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -208,7 +208,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -226,7 +226,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } } // (Part 3.2) Abnormally good or bad minibatches. 
- else if (math.abs(numStd) > 5.0) { + else if (math.abs(numStd) > 10.0) { if (opts.verboseMH) { println("\tCASE 1: math.abs(numStd) = " +math.abs(numStd)) } @@ -248,7 +248,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater val Xc = normlogrnd(1,1).dv val testStat = deltaStar + Xn + Xc if (opts.verboseMH) { - println("\tCASE 3; with testStat = "+testStat) + println("\tCASE 3; with testStat = %1.4f (Xn = %1.4f, Xc = %1.4f)" format (testStat, Xn, Xc)) } if (testStat > 0) { accept = true @@ -257,11 +257,13 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater // (Part 4) Reset parameters and use <-- to avoid alias problems. if (accept) { + if (opts.verboseMH) println("\tACCEPT") for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) // Now tmpTheta has proposed theta. } acceptanceRate(_t) = 1 } else { + if (opts.verboseMH) println("\treject") for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. if (opts.smf) { @@ -412,14 +414,15 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ - def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double) { + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float) { val s1 = mean(scores1(0 -> b.toInt)).dv val s0 = mean(scores0(0 -> b.toInt)).dv - println("b="+b+", n="+n+", logu="+logu) - println("mean(scores1) = "+s1+" - mean(scores0) = "+s0+" = "+(s1-s0)) + println("b = %d, n = %d, logu = %1.4f" format (b, n.toInt, logu)) + println("mean(scores1) (%1.4f) - mean(scores0) (%1.4f) = %1.4f" format (s1, s0, s1-s0)) println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) - println("sampleVar = " +sampleVariance+ ", delta* = " +deltaStar+ ", numStd = " +numStd) + println("delta^* (%1.4f) = sumDivB (%1.4f) - logu (%1.4f)" format (deltaStar, sumDivB, logu)) + println("sampleVar = %1.4f, numStd = %1.4f" format (sampleVariance, numStd)) } } From 0701ec60679ee7eea6636063aa41f1413afe0c47 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 22 Mar 2017 09:05:17 -0700 Subject: [PATCH 15/18] fixed logu --> psi --- src/main/scala/BIDMach/updaters/MHTest.scala | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 8d2eba51..6d31dad5 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -63,7 +63,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var b:Long = 0 // Current minibatch size (also `b` in the paper). var N:Long = 0 // Maximum minibatch size (i.e. all training data). var n:Float = 0f // The *number* of minibatches we are using. - var logu:Float = 0f // log u, since we assume a symmetric proposer. + var psi:Float = 0f // \psi = log (1 * prop_ratio * prior_ratio) var T:Int = 1 // The temperature of the distribution. var _t:Int = 0 // Current number of samples of theta. var sumOfValues:Float = 0f // \sum_{i=1}^b (N/T)*log(p(x_i|theta')/p(x_i|theta)). @@ -76,6 +76,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var tmpMomentum:Array[Mat] = null var acceptanceRate:Mat = null + /** * Standard initialization. 
We have: * @@ -204,11 +205,11 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater sumOfSquares += sum((diff)*@(diff)).v sumOfValues += sum(diff).v } - val deltaStar = sumOfValues/b.v - logu + val deltaStar = sumOfValues/b.v - psi val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -316,7 +317,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater randomWalkProposer() } - logu = ln(rand(1,1)).v + psi = ln(1).v // WARNING, symmetric proposals ONLY, since \psi(1,\theta,theta')=0. newMinibatch = false b = 0 n = 0 @@ -414,15 +415,14 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ - def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float) { + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double) { val s1 = mean(scores1(0 -> b.toInt)).dv val s0 = mean(scores0(0 -> b.toInt)).dv - println("b = %d, n = %d, logu = %1.4f" format (b, n.toInt, logu)) + println("b = %d, n = %d" format (b, n.toInt)) println("mean(scores1) (%1.4f) - mean(scores0) (%1.4f) = %1.4f" format (s1, s0, s1-s0)) println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) - println("delta^* (%1.4f) = sumDivB (%1.4f) - logu (%1.4f)" format (deltaStar, sumDivB, logu)) - println("sampleVar = %1.4f, numStd = %1.4f" format (sampleVariance, numStd)) + println("delta^* = %1.4f, sampleVar = %1.4f, numStd = %1.4f" format (deltaStar, sampleVariance, numStd)) } } From b0f0aebd0b766eb43b13e48ad7cab027dc53ba11 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 22 Mar 2017 12:02:08 -0700 Subject: [PATCH 16/18] Tried my way with energy function and momentum, wasn't working. =( Now let me switch focus to MALA ... --- scripts/daniel_smf_netflix_adagrad.ssc | 93 ++++++------ scripts/daniel_smf_netflix_mhtest.ssc | 134 +++++++++--------- src/main/scala/BIDMach/updaters/ADAGrad.scala | 5 +- src/main/scala/BIDMach/updaters/MHTest.scala | 54 +++++-- 4 files changed, 155 insertions(+), 131 deletions(-) diff --git a/scripts/daniel_smf_netflix_adagrad.ssc b/scripts/daniel_smf_netflix_adagrad.ssc index 246a1e2e..44a0d391 100755 --- a/scripts/daniel_smf_netflix_adagrad.ssc +++ b/scripts/daniel_smf_netflix_adagrad.ssc @@ -13,61 +13,58 @@ val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 val lrates = row(0.0001, 0.001, 0.01, 0.1) -val momens = row(0.5, 0.75, 0.9, 0.95, 0.99) val langs = row(0.0, 0.05, 0.5) var bestrmse = 10.0; -var prettystring = "moment. lrate lang. rmse\n" +var prettystring = "lrate lang. rmse\n" for (i <- 0 until lrates.length) { - for (j <- 0 until momens.length) { - for (k <- 0 until langs.length) { - val (nn,opts) = SMF.learner1(a, d) + for (k <- 0 until langs.length) { + val (nn,opts) = SMF.learner1(a, d) - // Common parameters with the MHTest+ADAGrad version. 
- opts.batchSize = 1000 - opts.npasses = 2 - opts.momentum = momens(j) - opts.nesterov = null - opts.langevin = langs(k).v - - opts.uiter = 5 - opts.urate = 0.05f - opts.lrate = lrates(i).v - val lambda = 4f - opts.lambdau = lambda - opts.regumean = lambda - opts.lambdam = lambda / 500000 * 20 - opts.regmmean = opts.lambdam - opts.evalStep = 31 - opts.doUsers = false - opts.what - nn.train - - val model = nn.model.asInstanceOf[SMF] - val xa = (ta != 0) - val (mm, mopts) = SMF.predictor1(model, a, xa) - mopts.batchSize = 10000 - mopts.uiter = 5 - mopts.urate = opts.urate - mopts.aopts = null - mm.predict - - val pa = SMat(mm.preds(1)); - println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) - val diff = ta.contents - pa.contents - val rmse = sqrt((diff ddot diff) / diff.length) - println("\nrmse = %f" format rmse.v) - min(pa.contents,5,pa.contents) - max(pa.contents,1,pa.contents) - val diff2 = ta.contents - pa.contents - val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) - println("rmse (w/clipping) = %f\n" format rmse2.v) + // Common parameters with the MHTest+ADAGrad version. + opts.batchSize = 1000 + opts.npasses = 2 + opts.nesterov = null + opts.langevin = langs(k).v + opts.momentum = 1f + + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) - if (rmse2.v < bestrmse) { - bestrmse = rmse2.v - } - prettystring += "%1.3f %1.5f %1.3f %1.4f\n" format (momens(j).v,lrates(i).v,langs(k).v,rmse2.v) + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v } + prettystring += "%1.5f %1.3f %1.4f\n" format (lrates(i).v,langs(k).v,rmse2.v) } } diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 7b172b0f..df6fa691 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -36,80 +36,80 @@ val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 val lrates = row(0.01) -val momens = row(0.95) -val langs = row(0.05) +val langs = row(0.0) var bestrmse = 10.0; -var prettystring = "moment. lrate lang. rmse\n" +var prettystring = "lrate lang. arate rmse\n" for (i <- 0 until lrates.length) { - for (j <- 0 until momens.length) { - for (k <- 0 until langs.length) { - val (nn,opts) = SMF.learner2(a, d) + for (k <- 0 until langs.length) { + val (nn,opts) = SMF.learner2(a, d) - // Common parameters with the ADAGrad version. - opts.batchSize = 10 - opts.npasses = 2 - opts.momentum = momens(j) - opts.nesterov = null - opts.langevin = langs(k).v + // Common parameters with the ADAGrad version. 
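// Illustrative sketch (plain Scala, hypothetical names; not from this patch):
// the clipped-RMSE evaluation used above, i.e. clamp predictions into the
// valid rating range before measuring error against the held-out ratings.
object ClippedRmseSketch {
  def clippedRmse(truth: Array[Double], preds: Array[Double],
                  lo: Double = 1.0, hi: Double = 5.0): Double = {
    val clipped = preds.map(p => math.max(lo, math.min(hi, p)))
    val sqErr = truth.zip(clipped).map { case (t, p) => (t - p) * (t - p) }.sum
    math.sqrt(sqErr / truth.length)
  }
}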
+ opts.batchSize = 1000 + opts.npasses = 2 + opts.nesterov = null + opts.langevin = langs(k).v + opts.momentum = null - opts.uiter = 5 - opts.urate = 0.05f - opts.lrate = lrates(i).v - val lambda = 4f - opts.lambdau = lambda - opts.regumean = lambda - opts.lambdam = lambda / 500000 * 20 - opts.regmmean = opts.lambdam - opts.evalStep = 31 - opts.doUsers = false + opts.uiter = 5 + opts.urate = 0.05f + opts.lrate = lrates(i).v + val lambda = 4f + opts.lambdau = lambda + opts.regumean = lambda + opts.lambdam = lambda / 500000 * 20 + opts.regmmean = opts.lambdam + opts.evalStep = 31 + opts.doUsers = false - // Now some stuff specific for the MHTest+ADAGrad. - opts.smf = true - opts.saveAcceptRate = false - opts.acceptRateDir = "tmp/" - opts.N = a.nnz - opts.temp = a.nnz / 1000 - opts.Nknown = true - opts.n2lsigma = 1.0f - opts.nn2l = 4000 - opts.sigmaProposer = 0.01f - opts.continueDespiteFull = false - opts.verboseMH = true - opts.collectData = false - opts.collectDataDir = "tmp/" - opts.exitTheta = false - opts.initThetaHere = true - opts.burnIn = -1 - opts.matrixOfScores = true - opts.what - nn.train - - val model = nn.model.asInstanceOf[SMF] - val xa = (ta != 0) - val (mm, mopts) = SMF.predictor1(model, a, xa) - mopts.batchSize = 10000 - mopts.uiter = 5 - mopts.urate = opts.urate - mopts.aopts = null - mm.predict - - val pa = SMat(mm.preds(1)); - println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) - val diff = ta.contents - pa.contents - val rmse = sqrt((diff ddot diff) / diff.length) - println("\nrmse = %f" format rmse.v) - min(pa.contents,5,pa.contents) - max(pa.contents,1,pa.contents) - val diff2 = ta.contents - pa.contents - val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) - println("rmse (w/clipping) = %f\n" format rmse2.v) + // Now some stuff specific for the MHTest+ADAGrad. 
+ opts.smf = true + opts.saveAcceptRate = true + opts.acceptRateDir = "tmp/" + opts.N = a.nnz + opts.temp = a.nnz / 10000 + opts.Nknown = true + opts.n2lsigma = 1.0f + opts.nn2l = 4000 + opts.sigmaProposer = 0.01f + opts.continueDespiteFull = false + opts.verboseMH = true + opts.collectData = false + opts.collectDataDir = "tmp/" + opts.exitTheta = false + opts.initThetaHere = true + opts.burnIn = -1 + opts.matrixOfScores = true + opts.what + nn.train + + val model = nn.model.asInstanceOf[SMF] + val xa = (ta != 0) + val (mm, mopts) = SMF.predictor1(model, a, xa) + mopts.batchSize = 10000 + mopts.uiter = 5 + mopts.urate = opts.urate + mopts.aopts = null + mm.predict + + val pa = SMat(mm.preds(1)); + println("Note: max(pa)="+maxi(maxi(pa))+" and min(pa)="+mini(mini(pa))) + val diff = ta.contents - pa.contents + val rmse = sqrt((diff ddot diff) / diff.length) + println("\nrmse = %f" format rmse.v) + min(pa.contents,5,pa.contents) + max(pa.contents,1,pa.contents) + val diff2 = ta.contents - pa.contents + val rmse2 = sqrt((diff2 ddot diff2) / diff2.length) + println("rmse (w/clipping) = %f\n" format rmse2.v) - if (rmse2.v < bestrmse) { - bestrmse = rmse2.v - } - prettystring += "%1.3f %1.5f %1.3f %1.4f\n" format (momens(j).v,lrates(i).v,langs(k).v,rmse2.v) - } + val accepts = loadMat(opts.acceptRateDir+"arate_%1.4f_%1.3f.mat.lz4" format (lrates(i).v,langs(k).v)) + val arate = accepts.nnz / accepts.length.toFloat + + if (rmse2.v < bestrmse) { + bestrmse = rmse2.v + } + prettystring += "%1.5f %1.3f %1.3f %1.4f\n" format (lrates(i).v,langs(k).v,arate,rmse2.v) } } diff --git a/src/main/scala/BIDMach/updaters/ADAGrad.scala b/src/main/scala/BIDMach/updaters/ADAGrad.scala index ed738171..8658b1ee 100755 --- a/src/main/scala/BIDMach/updaters/ADAGrad.scala +++ b/src/main/scala/BIDMach/updaters/ADAGrad.scala @@ -26,6 +26,7 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda var pe:Mat = null // Similar to opts.texp? Some exponent like the others? var te:Mat = null // opts.texp, an ADAGrad parameter (see BIDMach wiki) var lrate:Mat = null + var tscale:Mat = null // Daniel: I had to add this to make it accessible to MHTest. var mu:Mat = null var one:Mat = null var randmat:Array[Mat] = null @@ -46,6 +47,7 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda * wouldn't make sense). */ override def init(model0:Model) = { + tscale = 1 // Daniel: and had to do this here to make it non-null to start. 
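// Illustrative sketch (plain Scala, hypothetical names; not from this patch):
// a generic ADAGrad-style per-coordinate step with a time-decay factor
// tscale = (1/(t+1))^texp, analogous to the tscale value exposed above.
// The exponent and epsilon here are illustrative, not BIDMach's exact ones.
object AdagradStepSketch {
  def step(theta: Array[Double], grad: Array[Double], sumSq: Array[Double],
           t: Int, lrate: Double, texp: Double, eps: Double = 1e-8): Unit = {
    val tscale = math.pow(1.0 / (t + 1), texp)
    for (k <- theta.indices) {
      sumSq(k) += grad(k) * grad(k)                      // accumulated squared gradients
      theta(k) += lrate * tscale * grad(k) / (math.sqrt(sumSq(k)) + eps)
    }
  }
}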
model = model0 modelmats = model.modelmats; updatemats = model.updatemats; @@ -155,7 +157,8 @@ class ADAGrad(override val opts:ADAGrad.Opts = new ADAGrad.Options) extends Upda step / firstStep; } } - val tscale = if (opts.texp.asInstanceOf[AnyRef] != 0) { + // Daniel: had to change this to make it accessible + var tscale = if (opts.texp.asInstanceOf[AnyRef] != 0) { stepn.set(1/(nsteps+1)); stepn ^ te; } else { diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 6d31dad5..5e46a479 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -205,11 +205,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater sumOfSquares += sum((diff)*@(diff)).v sumOfValues += sum(diff).v } - val deltaStar = sumOfValues/b.v - psi + val extraOffset = getExtraOffset() + val deltaStar = sumOfValues/b.v - psi + extraOffset val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v, extraOffset) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -267,16 +268,35 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater if (opts.verboseMH) println("\treject") for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. - if (opts.smf) { - adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. - } + //if (opts.smf) { + // adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. + //} } acceptanceRate(_t) = 0 } - + if (newMinibatch) afterEachMinibatch(ipass, gprogress) } + + /** For the ADAGrad updater. */ + def getExtraOffset():Float = { + if (opts.smf) { + var fullSum = 0f + //for (i <- 0 until modelmats.length) { + // // TODO check that this is right ... I'm really concerned. + // // Then we have to *invert* all of this, right? I do that by dividing by mass ... + // val mass = (adagrad.sumSq(i)^adagrad.ve + adagrad.opts.epsilon) / (adagrad.tscale * adagrad.lrate) + // val momCurrSq = tmpMomentum(i) *@ tmpMomentum(i) + // val momPropSq = adagrad.momentum(i) *@ adagrad.momentum(i) + // val thisSum = sum(sum((momCurrSq-momPropSq)/mass)) + // fullSum += 0.5f * FMat(thisSum).v + //} + fullSum + } else { + 0f + } + } /** * Stuff we should do before each minibatch. This involves calling the @@ -294,17 +314,17 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * those into propsoedTheta, and then get current modelmats back to tmpTheta * (i.e. so modelmats remains the same before and after, and it's just the * proposedTheta which changes). With momentum, fortunately it's simpler, we - * have that in adagrad.momentum and simply copy the old state into - * tmpMomentum. + * have that in adagrad.momentum (that's the "proposed" momentum) and simply + * keep the "current" one in tmpMomentum. 
*/ def beforeEachMinibatch(ipass:Int, step:Long, gprogress:Float) { if (opts.verboseMH) println("\n\tNew minibatch!") for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) - if (opts.smf) { - tmpMomentum(i) <-- adagrad.momentum(i) - } + //if (opts.smf) { + // tmpMomentum(i) <-- adagrad.momentum(i) + //} } if (opts.smf) { @@ -352,10 +372,9 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater if (opts.smf) { if (opts.saveAcceptRate && (ipass+1) == opts.asInstanceOf[Learner.Options].npasses && gprogress > 0.99) { - val mom = adagrad.opts.momentum(0).v val lr = adagrad.opts.lrate.v val lang = adagrad.opts.langevin(0).v - saveMat(opts.acceptRateDir+"arate_%1.3f_%1.4f_%1.3f.mat.lz4" format (mom,lr,lang), + saveMat(opts.acceptRateDir+"arate_%1.4f_%1.3f.mat.lz4" format (lr,lang), acceptanceRate(0 -> _t)) } } @@ -415,14 +434,19 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. */ - def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double) { + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float, extraOffset:Float) { val s1 = mean(scores1(0 -> b.toInt)).dv val s0 = mean(scores0(0 -> b.toInt)).dv println("b = %d, n = %d" format (b, n.toInt)) println("mean(scores1) (%1.4f) - mean(scores0) (%1.4f) = %1.4f" format (s1, s0, s1-s0)) println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) - println("delta^* = %1.4f, sampleVar = %1.4f, numStd = %1.4f" format (deltaStar, sampleVariance, numStd)) + if (opts.smf) { + println("delta^* (%1.4f) = sumDivB (%1.4f) + extraOffset (%1.4f)" format (deltaStar, sumDivB, extraOffset)) + println("sampleVar = %1.4f, numStd = %1.4f" format (sampleVariance, numStd)) + } else { + println("delta^* = %1.4f, sampleVar = %1.4f, numStd = %1.4f" format (deltaStar, sampleVariance, numStd)) + } } } From 54cbc2ce14676fcb28d455eb557a8cb015e49383 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Wed, 22 Mar 2017 17:37:41 -0700 Subject: [PATCH 17/18] Well, got MALA but its not really working .. --- scripts/daniel_smf_netflix_mhtest.ssc | 6 +- src/main/scala/BIDMach/models/SMF.scala | 2 +- src/main/scala/BIDMach/updaters/MHTest.scala | 140 +++++++++++++----- src/main/scala/BIDMach/updaters/Updater.scala | 2 +- 4 files changed, 107 insertions(+), 43 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index df6fa691..7f1a9b2e 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -35,13 +35,13 @@ val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 -val lrates = row(0.01) -val langs = row(0.0) +val lrates = row(0.005) +val langs = sqrt(2f * lrates) // NEW! MALA so it's sqrt(2 * lr). var bestrmse = 10.0; var prettystring = "lrate lang. arate rmse\n" for (i <- 0 until lrates.length) { - for (k <- 0 until langs.length) { + for (k <- 0 until 1) { val (nn,opts) = SMF.learner2(a, d) // Common parameters with the ADAGrad version. 
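// Illustrative sketch (plain Scala, hypothetical names; not from this patch):
// why the Langevin scale above is set to sqrt(2f * lrates). A MALA proposal is
//   theta' = theta + tau * grad(log p)(theta) + sqrt(2 * tau) * xi,  xi ~ N(0, I),
// so when the learning rate plays the role of tau, the injected noise has
// standard deviation sqrt(2 * lrate).
object MalaProposalSketch {
  def propose(theta: Array[Double], gradLogP: Array[Double], tau: Double,
              rng: scala.util.Random): Array[Double] = {
    val noiseStd = math.sqrt(2.0 * tau)
    theta.indices.map(k => theta(k) + tau * gradLogP(k) + noiseStd * rng.nextGaussian()).toArray
  }
}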
diff --git a/src/main/scala/BIDMach/models/SMF.scala b/src/main/scala/BIDMach/models/SMF.scala index 7bd3e00b..7d9a38d6 100755 --- a/src/main/scala/BIDMach/models/SMF.scala +++ b/src/main/scala/BIDMach/models/SMF.scala @@ -192,8 +192,8 @@ class SMF(override val opts:SMF.Opts = new SMF.Options) extends FactorModel(opts } else { opts.asInstanceOf[Grad.Opts].pexp.dv } - uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) + uscale.set(opts.urate * math.pow(ipass+1, - texp).toFloat) val sdata = sdata0 - (iavg + avg); if (putBack < 0) { user.clear diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index 5e46a479..cc4f8f3e 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -75,7 +75,21 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater var adagrad:ADAGrad = null; var tmpMomentum:Array[Mat] = null var acceptanceRate:Mat = null + var currentUpdatemats:Array[Mat] = null + var proposedUpdatemats:Array[Mat] = null + var sdata0:GSMat = null + var user:Mat = null + var mm:Mat = null + var iavg:Mat = null + var avg:Mat = null + var uscale:Mat = null + var vexp:Mat = null + var lamu:Mat = null + var slm:Mat = null + var mlm:Mat = null + var firststep = -1f + /** * Standard initialization. We have: @@ -141,10 +155,24 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater // This should force adagrad.momentum(i) = momentum(i) in the rest of this code. adagrad = new ADAGrad(opts.asInstanceOf[ADAGrad.Opts]) adagrad.init(model) - tmpMomentum = new Array[Mat](nmats) + currentUpdatemats = new Array[Mat](nmats) + proposedUpdatemats = new Array[Mat](nmats) for (i <- 0 until nmats) { - tmpMomentum(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) + currentUpdatemats(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) + proposedUpdatemats(i) = modelmats(i).zeros(modelmats(i).nrows, modelmats(i).ncols) } + + sdata0 = GSMat(model.datasource.omats(0).asInstanceOf[SMat]) + user = FactorModel.reuseuser(sdata0, model.opts.dim, 0f) + mm = proposedTheta(0) + iavg = proposedTheta(1) + avg = proposedTheta(2) + uscale = mm.zeros(1,1) + vexp = GMat(row(0.5f)) + lamu = mm.ones(opts.asInstanceOf[SMF.Opts].dim, 1) ∘ opts.asInstanceOf[SMF.Opts].lambdau + slm = mm.ones(1,1) ∘ (opts.asInstanceOf[SMF.Opts].lambdam * sdata0.ncols); + mlm = mm.ones(1,1) ∘ (opts.asInstanceOf[SMF.Opts].regmmean * sdata0.ncols); + firststep = -1f } } @@ -205,12 +233,11 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater sumOfSquares += sum((diff)*@(diff)).v sumOfValues += sum(diff).v } - val extraOffset = getExtraOffset() - val deltaStar = sumOfValues/b.v - psi + extraOffset + val deltaStar = sumOfValues/b.v - psi val sampleVariance = (sumOfSquares/b.v - ((sumOfValues/b.v)*(sumOfValues/b.v))) / b.v val numStd = deltaStar / math.sqrt(sampleVariance) var accept = false - if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v, extraOffset) + if (opts.verboseMH) debugPrints(sampleVariance, deltaStar, numStd, sumOfValues/b.v, psi) // (Part 3) Run our test! // (Part 3.1) Take care of the full data case; this usually indicates a problem. @@ -268,9 +295,6 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater if (opts.verboseMH) println("\treject") for (i <- 0 until modelmats.length) { modelmats(i) <-- tmpTheta(i) // Now modelmats back to old theta. 
- //if (opts.smf) { - // adagrad.momentum(i) <-- tmpMomentum(i) // Revert ADAGrad momentum. - //} } acceptanceRate(_t) = 0 } @@ -279,25 +303,6 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } - /** For the ADAGrad updater. */ - def getExtraOffset():Float = { - if (opts.smf) { - var fullSum = 0f - //for (i <- 0 until modelmats.length) { - // // TODO check that this is right ... I'm really concerned. - // // Then we have to *invert* all of this, right? I do that by dividing by mass ... - // val mass = (adagrad.sumSq(i)^adagrad.ve + adagrad.opts.epsilon) / (adagrad.tscale * adagrad.lrate) - // val momCurrSq = tmpMomentum(i) *@ tmpMomentum(i) - // val momPropSq = adagrad.momentum(i) *@ adagrad.momentum(i) - // val thisSum = sum(sum((momCurrSq-momPropSq)/mass)) - // fullSum += 0.5f * FMat(thisSum).v - //} - fullSum - } else { - 0f - } - } - /** * Stuff we should do before each minibatch. This involves calling the * proposer, resetting some values, and saving the current model matrix into @@ -317,27 +322,86 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater * have that in adagrad.momentum (that's the "proposed" momentum) and simply * keep the "current" one in tmpMomentum. */ - def beforeEachMinibatch(ipass:Int, step:Long, gprogress:Float) { + def beforeEachMinibatch(ipass:Int, pos:Long, gprogress:Float) { if (opts.verboseMH) println("\n\tNew minibatch!") - for (i <- 0 until modelmats.length) { tmpTheta(i) <-- modelmats(i) - //if (opts.smf) { - // tmpMomentum(i) <-- adagrad.momentum(i) - //} + currentUpdatemats(i) <-- updatemats(i) } + // Important, update the model matrices. Also, compute \psi(1,theta,theta'). + // Did computation by hand. A lot of this depends on whether it's SMF or not. if (opts.smf) { - adagrad.update(ipass, step, gprogress) + + // Now modelmats is DIFFERENT! modelmats is now the proposed stuff. Note + // that this does not change the updatemats; that came from SMF.scala. We + // then make proposedTheta contain the proposed stuff! Think of this as + // only providing us with proposedTheta (and keeping modelmats unchanged). + adagrad.update(ipass, pos, gprogress) for (i <- 0 until modelmats.length) { proposedTheta(i) <-- adagrad.modelmats(i) // adagrad.modelmats(i) = modelmats(i) modelmats(i) <-- tmpTheta(i) // Should make adagrad.modelmats(i) back to what it was before. - } - } else { + } + + // Next, with proposedTheta, we now compute the proposed log gradient. I + // think it's easier to copy the relevant mupdate code here. This is very + // specific to SMF.scala, sorry. I actually have to REDO the calls here + // ... oh boy. But fortunately, the `user` matrix gets reset to 0 each + // time for the real SMF, so I can do it here. ALSO, make sure I don't + // update the biases as I would with a call to mupdate in ipass=0. + + sdata0 = GSMat(model.datasource.omats(0).asInstanceOf[SMat]) + user <-- FactorModel.reuseuser(sdata0, model.opts.dim, 0f) + mm <-- proposedTheta(0) + iavg <-- proposedTheta(1) + avg <-- proposedTheta(2) + + // UUPDATE, which means (for instance) predictions are based on *proposed* theta. 
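// Illustrative sketch (plain Scala, hypothetical names; not from this patch):
// the regularized user-factor step that the uupdate loop below performs for a
// single user, with the per-user count scaling (uci) and the uscale schedule
// omitted. items holds the latent vectors of the items this user rated.
object UserStepSketch {
  def userStep(u: Array[Double], items: Array[Array[Double]], ratings: Array[Double],
               lambdaU: Double, stepSize: Double): Array[Double] = {
    val d = u.length
    val grad = new Array[Double](d)
    for ((m, r) <- items.zip(ratings)) {
      val pred = (0 until d).map(k => m(k) * u(k)).sum   // predicted rating <m, u>
      for (k <- 0 until d) grad(k) += (r - pred) * m(k)  // residual times item factor
    }
    for (k <- 0 until d) grad(k) -= lambdaU * u(k)       // L2 regularization on u
    u.zipWithIndex.map { case (uk, k) => uk + stepSize * grad(k) }
  }
}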
+ if (firststep <= 0) firststep = pos.toFloat; + val step = (pos + firststep)/firststep; + val texp = if (opts.asInstanceOf[Grad.Opts].texp.asInstanceOf[AnyRef] != null) { + opts.asInstanceOf[Grad.Opts].texp.dv + } else { + opts.asInstanceOf[Grad.Opts].pexp.dv + } + uscale.set(opts.asInstanceOf[SMF.Opts].urate * math.pow(ipass+1, - texp).toFloat) + val sdata = sdata0 - GMat(iavg+avg); + val b = mm * sdata; + val ucounts = sum(sdata0 != 0f); + val uci = (ucounts + 1f) ^ (- vexp); + for (i <- 0 until opts.asInstanceOf[SMF.Opts].uiter) { + val preds = DDS(mm, user, sdata); + val deriv = b - mm * preds - (user ∘ lamu); + val du = (deriv ∘ uscale ∘ uci); + user ~ user + du; + } + + // MUPDATE + val preds = DDS(proposedTheta(0), user, sdata); + val diffs = sdata + 2f; + diffs.contents ~ sdata.contents - preds.contents; + val icomp = sdata0 != 0f + val icount = sum(sdata0 != 0f, 2); + proposedUpdatemats(1) = (sum(diffs,2) - iavg*mlm) / (icount + 1f); + proposedUpdatemats(2) ~ sum(diffs.contents) / (diffs.contents.length + 1f); + if (firststep <= 0) firststep = pos.toFloat; + proposedUpdatemats(0) = (user *^ diffs - (mm ∘ slm)) / ((icount + 1).t ^ vexp); + + // Finally, we can get the psi terms. + psi = 0f + val tau = FMat(adagrad.lrate).v + for (i <- 0 until modelmats.length) { + val term1 = modelmats(i) - proposedTheta(i) - tau*proposedUpdatemats(i) + val term2 = proposedTheta(i) - modelmats(i) - tau*currentUpdatemats(i) + psi += (0.25f*tau) * ((term1 ddot term1) - (term2 ddot term2)).v + } + } + else { + // Otherwise, it's easy, just do a random walk and set psi = ln(1) = 0. randomWalkProposer() + psi = 0f } - psi = ln(1).v // WARNING, symmetric proposals ONLY, since \psi(1,\theta,theta')=0. newMinibatch = false b = 0 n = 0 @@ -434,7 +498,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater /** This is for debugging. 
*/ - def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float, extraOffset:Float) { + def debugPrints(sampleVariance:Float, deltaStar:Float, numStd:Double, sumDivB:Float, psi:Float) { val s1 = mean(scores1(0 -> b.toInt)).dv val s0 = mean(scores0(0 -> b.toInt)).dv println("b = %d, n = %d" format (b, n.toInt)) @@ -442,7 +506,7 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater println("maxi(scores1) = "+maxi(scores1(0 -> b.toInt))+", maxi(scores0) = "+maxi(scores0(0 -> b.toInt))) println("mini(scores1) = "+mini(scores1(0 -> b.toInt))+", mini(scores0) = "+mini(scores0(0 -> b.toInt))) if (opts.smf) { - println("delta^* (%1.4f) = sumDivB (%1.4f) + extraOffset (%1.4f)" format (deltaStar, sumDivB, extraOffset)) + println("delta^* (%1.4f) = sumDivB (%1.4f) - psi (%1.4f)" format (deltaStar, sumDivB, psi)) println("sampleVar = %1.4f, numStd = %1.4f" format (sampleVariance, numStd)) } else { println("delta^* = %1.4f, sampleVar = %1.4f, numStd = %1.4f" format (deltaStar, sampleVariance, numStd)) diff --git a/src/main/scala/BIDMach/updaters/Updater.scala b/src/main/scala/BIDMach/updaters/Updater.scala index 614f67e6..fe34b9d6 100755 --- a/src/main/scala/BIDMach/updaters/Updater.scala +++ b/src/main/scala/BIDMach/updaters/Updater.scala @@ -19,7 +19,7 @@ abstract class Updater(val opts:Updater.Opts = new Updater.Options) extends Seri def update(ipass:Int, step:Long):Unit = {} def update(ipass:Int, step:Long, gprogress:Float):Unit = update(ipass, step) - + def updateM(ipass:Int):Unit = { model.updatePass(ipass) } From 685822d3ccc01bc98386173137b409501a9a3d89 Mon Sep 17 00:00:00 2001 From: DanielTakeshi Date: Thu, 23 Mar 2017 14:20:13 -0700 Subject: [PATCH 18/18] Fixed bug, 1/(4*tau). Better to just use sigma which is what I do. --- scripts/daniel_smf_netflix_mhtest.ssc | 10 +++++----- src/main/scala/BIDMach/updaters/MHTest.scala | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/daniel_smf_netflix_mhtest.ssc b/scripts/daniel_smf_netflix_mhtest.ssc index 7f1a9b2e..c87d281e 100755 --- a/scripts/daniel_smf_netflix_mhtest.ssc +++ b/scripts/daniel_smf_netflix_mhtest.ssc @@ -35,10 +35,10 @@ val dir = "/data/netflix/" val a = loadSMat(dir+"newtrain.smat.lz4") val ta = loadSMat(dir+"newtest.smat.lz4") val d = 256 -val lrates = row(0.005) +val lrates = row(0.01) val langs = sqrt(2f * lrates) // NEW! MALA so it's sqrt(2 * lr). var bestrmse = 10.0; -var prettystring = "lrate lang. arate rmse\n" +var prettystring = "lrate lang. arate rmse\n" for (i <- 0 until lrates.length) { for (k <- 0 until 1) { @@ -46,7 +46,7 @@ for (i <- 0 until lrates.length) { // Common parameters with the ADAGrad version. 
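// Illustrative sketch (plain Scala, hypothetical names; not from this patch):
// the MALA proposal log-density ratio behind the 1/(4*tau) vs 1/(2*sigma^2)
// fix in this patch. With q(y | x) = N(y; x + tau*grad(x), 2*tau*I), i.e.
// sigma^2 = 2*tau, the correction term of the MH ratio is
//   log q(theta | theta') - log q(theta' | theta)
//     = -(1/(4*tau)) * ( ||theta - theta' - tau*grad(theta')||^2
//                      - ||theta' - theta - tau*grad(theta)||^2 ).
// How the sign enters psi depends on the convention used in the test above.
object MalaRatioSketch {
  private def sqNorm(v: Array[Double]): Double = v.map(x => x * x).sum
  def logProposalRatio(theta: Array[Double], thetaProp: Array[Double],
                       grad: Array[Double], gradProp: Array[Double],
                       tau: Double): Double = {
    val fwd = theta.indices.map(i => thetaProp(i) - theta(i) - tau * grad(i)).toArray
    val rev = theta.indices.map(i => theta(i) - thetaProp(i) - tau * gradProp(i)).toArray
    (-1.0 / (4.0 * tau)) * (sqNorm(rev) - sqNorm(fwd))
  }
}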
opts.batchSize = 1000 - opts.npasses = 2 + opts.npasses = 1 opts.nesterov = null opts.langevin = langs(k).v opts.momentum = null @@ -67,7 +67,7 @@ for (i <- 0 until lrates.length) { opts.saveAcceptRate = true opts.acceptRateDir = "tmp/" opts.N = a.nnz - opts.temp = a.nnz / 10000 + opts.temp = a.nnz / 1000 opts.Nknown = true opts.n2lsigma = 1.0f opts.nn2l = 4000 @@ -109,7 +109,7 @@ for (i <- 0 until lrates.length) { if (rmse2.v < bestrmse) { bestrmse = rmse2.v } - prettystring += "%1.5f %1.3f %1.3f %1.4f\n" format (lrates(i).v,langs(k).v,arate,rmse2.v) + prettystring += "%1.5f %1.3f %1.4f %1.4f\n" format (lrates(i).v,langs(k).v,arate,rmse2.v) } } diff --git a/src/main/scala/BIDMach/updaters/MHTest.scala b/src/main/scala/BIDMach/updaters/MHTest.scala index cc4f8f3e..cefeda8a 100755 --- a/src/main/scala/BIDMach/updaters/MHTest.scala +++ b/src/main/scala/BIDMach/updaters/MHTest.scala @@ -366,12 +366,12 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater } uscale.set(opts.asInstanceOf[SMF.Opts].urate * math.pow(ipass+1, - texp).toFloat) val sdata = sdata0 - GMat(iavg+avg); - val b = mm * sdata; + val bb = mm * sdata; val ucounts = sum(sdata0 != 0f); val uci = (ucounts + 1f) ^ (- vexp); for (i <- 0 until opts.asInstanceOf[SMF.Opts].uiter) { val preds = DDS(mm, user, sdata); - val deriv = b - mm * preds - (user ∘ lamu); + val deriv = bb - mm * preds - (user ∘ lamu); val du = (deriv ∘ uscale ∘ uci); user ~ user + du; } @@ -390,10 +390,11 @@ class MHTest(override val opts:MHTest.Opts = new MHTest.Options) extends Updater // Finally, we can get the psi terms. psi = 0f val tau = FMat(adagrad.lrate).v + val sigma = FMat(adagrad.opts.langevin).v for (i <- 0 until modelmats.length) { val term1 = modelmats(i) - proposedTheta(i) - tau*proposedUpdatemats(i) val term2 = proposedTheta(i) - modelmats(i) - tau*currentUpdatemats(i) - psi += (0.25f*tau) * ((term1 ddot term1) - (term2 ddot term2)).v + psi += (1f/(2*sigma*sigma)) * ((term1 ddot term1) - (term2 ddot term2)).v } } else {