From 98020a0087b9be42115688ba8e2fd1efbc8d05a2 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 01:57:55 -0800 Subject: [PATCH 1/8] Add in changes that should work for propagating output format based on input format --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- apps/taco_dse/taco_helper.h | 37 +++--- include/taco/format.h | 10 ++ include/taco/index_notation/index_notation.h | 16 +++ include/taco/lower/iterator.h | 5 + include/taco/lower/lower.h | 1 + include/taco/lower/lowerer_impl.h | 2 +- include/taco/lower/lowerer_impl_imperative.h | 10 +- include/taco/tensor.h | 63 ++++++++++ src/index_notation/index_notation.cpp | 104 +++++++++++++++++ src/index_notation/transformations.cpp | 44 ++++--- src/lower/iterator.cpp | 70 ++++++++++- src/lower/lower.cpp | 4 +- src/lower/lowerer_impl_imperative.cpp | 61 ++++++++-- src/tensor.cpp | 115 ++++++++++++++++++- test/test.cpp | 6 + test/test.h | 1 + test/tests-scheduling-eval.cpp | 104 +++++++++++++++++ 18 files changed, 593 insertions(+), 62 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index c98c598fd..25ebf7d11 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -974,7 +974,7 @@ HMObjective calculateObjectiveTTMDense(std::vector &InputPar compute_times = vector(); ttm_handler->set_cold_run(); - taco::Tensor temp_result({ttm_handler->NUM_I, ttm_handler->NUM_J, ttm_handler->NUM_L}, taco::dense); + taco::Tensor temp_result({ttm_handler->NUM_I, ttm_handler->NUM_J, ttm_handler->NUM_L}, {taco::Sparse, taco::Sparse, taco::Dense}); std::vector valid_perm(120, true); std::vector> orders; diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index f6880ba13..0cdc63b3e 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1427,9 +1427,10 @@ class TTV : public tacoOp { // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } - void 
schedule_and_compute(taco::Tensor &result, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, + void schedule_and_compute(taco::Tensor &result_, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=10) { + taco::Tensor result("result", {NUM_I, NUM_J}, taco::dense); result(i, j) = B(i, j, k) * c(k); // std::cout << "Elements: " << std::endl; @@ -1481,23 +1482,6 @@ class TTV : public tacoOp { timer.clear_cache(); - // taco::util::Timer timer; - // timer.clear_cache(); - // result.setAssembleWhileCompute(true); - //result.compile(sched); - //result.setNeedsAssemble(true); - // result.assemble(); - // timer.start(); - // result.compute(); - // timer.stop(); - - // compute_time = timer.getResult().mean; - // if(default_config) { - // default_compute_time = timer.getResult().mean; - // } - // timer.clear_cache(); - - } void compute(bool default_config = false) override @@ -1626,11 +1610,14 @@ class TTM : public tacoOp { } double compute_unscheduled() { - taco::Tensor result({NUM_I, NUM_J, NUM_L}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J, NUM_L}, {taco::Sparse, taco::Sparse, taco::Dense}, B, 2); result(i,j,l) = B(i,j,k) * C(k,l); taco::util::Timer timer; + result.setPreserveNonZero(true); + result.setAssembleWhileCompute(false); + result.setNeedsAssemble(false); result.compile(); - result.assemble(); +// result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1661,7 +1648,7 @@ class TTM : public tacoOp { } void schedule_and_compute(taco::Tensor &result_, int chunk_size, int unroll_factor, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=20) { - taco::Tensor result("result", {NUM_I, NUM_J, NUM_L}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J, NUM_L}, {taco::Sparse, taco::Sparse, 
taco::Dense}, B, 2); result(i,j,l) = B(i,j,k) * C(k,l); taco::IndexStmt sched = result.getAssignment().concretize(); @@ -1680,11 +1667,15 @@ class TTM : public tacoOp { taco::util::Timer timer; std::vector compute_times; timer.clear_cache(); + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); result.compile(sched); - result.setNeedsAssemble(true); - result.assemble(); + +// result.assemble(); for(int i = 0; i < num_reps; i++) { timer.start(); + result.setPreserveNonZero(true); + result.setNeedsCompute(true); result.compute(); timer.stop(); diff --git a/include/taco/format.h b/include/taco/format.h index 81bdadda4..c957649eb 100644 --- a/include/taco/format.h +++ b/include/taco/format.h @@ -157,6 +157,16 @@ class ModeFormat { friend bool operator!=(const ModeFormat&, const ModeFormat&); friend std::ostream& operator<<(std::ostream&, const ModeFormat&); + template + bool is() { + return std::dynamic_pointer_cast(this->impl) != nullptr; + } + template + std::shared_ptr as() { + taco_iassert(this->is()); + return std::dynamic_pointer_cast(this->impl); + } + private: std::shared_ptr impl; diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index d5fe90a97..f03b233e7 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -1317,5 +1317,21 @@ IndexStmt generatePackStmt(TensorVar tensor, IndexStmt generatePackCOOStmt(TensorVar tensor, std::vector indexVars, bool otherIsOnRight); +// preservesNonZeroStructure returns true if the output of the input +// stmt has the same non-zero structure as one of the inputs. If it does, +// it populates the input NonZeroAnalyzerResult with information about +// why the non-zero structure is preserved. 
+struct NonZeroAnalyzerResult { + NonZeroAnalyzerResult() {} + NonZeroAnalyzerResult(std::unique_ptr resultAccess, std::unique_ptr inputAccess) + : resultAccess(std::move(resultAccess)), + inputAccess(std::move(inputAccess)) {} + // Note that these accesses are wrapped in std::unique_ptr so that + // we avoid issues around the overloaed operator= on Access types. + std::unique_ptr resultAccess; + std::unique_ptr inputAccess; +}; +bool preservesNonZeroStructure(IndexStmt stmt, NonZeroAnalyzerResult& res); + } #endif diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h index a838505f6..9886e3900 100644 --- a/include/taco/lower/iterator.h +++ b/include/taco/lower/iterator.h @@ -217,6 +217,11 @@ class Iterator : public util::Comparable { /// backing the index set. Iterator getIndexSetIterator() const; + // getTrackingIterator returns the iterator that the current iterator is tracking. This + // is used to implement an optimization for when the non-zero structure of the output + // is the same as one of the input tensors. + Iterator getTrackingIterator() const; + friend bool operator==(const Iterator&, const Iterator&); friend bool operator<(const Iterator&, const Iterator&); friend std::ostream& operator<<(std::ostream&, const Iterator&); diff --git a/include/taco/lower/lower.h b/include/taco/lower/lower.h index aab025394..b7053612a 100644 --- a/include/taco/lower/lower.h +++ b/include/taco/lower/lower.h @@ -46,6 +46,7 @@ class Lowerer { /// parts of a concrete index notation statement. ir::Stmt lower(IndexStmt stmt, std::string functionName, bool assemble=true, bool compute=true, bool pack=false, bool unpack=false, + bool enablePreserveNonZeros=false, Lowerer lowerer=Lowerer()); /// Check whether the an index statement can be lowered to C code. 
If the diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index f5003b5ee..95f80b9b0 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -56,7 +56,7 @@ class LowererImpl : public util::Uncopyable { /// Lower an index statement to an IR function. virtual ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack) = 0; + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros) = 0; protected: diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index fa97e3cd9..08e6d9065 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -56,7 +56,7 @@ class LowererImplImperative : public LowererImpl { /// Lower an index statement to an IR function. ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack); + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros=false); protected: @@ -501,6 +501,7 @@ class LowererImplImperative : public LowererImpl { private: bool assemble; bool compute; + bool enablePreserveNonZeros = false; bool loopOrderAllowsShortCircuit = false; std::set needCompute; @@ -600,6 +601,13 @@ class LowererImplImperative : public LowererImpl { friend class Visitor; std::shared_ptr visitor; + // These two fields maintain information about if the optimization + // to write into sparse outputs with the same non-zero structure + // as the input tensor is enabled. + bool preservesNonZeros = false; + NonZeroAnalyzerResult nonZeroAnalyzerResult; + + }; } diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 79f91b8db..683bbabc1 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -479,6 +479,8 @@ class TensorBase { /* --- Friend Functions --- */ /// True iff two tensors have the same type and the same values. 
friend bool equals(const TensorBase&, const TensorBase&); + friend bool equalsInt64(const TensorBase&, const TensorBase&); + /// True iff two TensorBase objects refer to the same tensor (TensorBase /// and Tensor objects are references to tensors). @@ -501,6 +503,7 @@ class TensorBase { std::vector getDependentTensors(); void setNeedsCompute(bool needsCompute); void setNeedsAssemble(bool needsAssemble); + void setPreserveNonZero(bool preserveNonZero); private: static std::shared_ptr getHelperFunctions( const Format& format, Datatype ctype, const std::vector& dimensions); @@ -916,6 +919,7 @@ struct TensorBase::Content { bool needsCompile; bool needsAssemble; bool needsCompute; + bool preserveNonZero; std::vector> dependentTensors; unsigned int uniqueId; @@ -1338,5 +1342,64 @@ void taco_set_num_threads(int num_threads); /// computations. This will be replaced by a scheduling language in the future. int taco_get_num_threads(); + +// copyNonZeroStructure copies the non-zero structure of the src tensor into a new +// tensor, but does not copy the values. This method is intended to be used in the case +// where the result tensor of a computation has a sparse output with non-zero structure +// identical to an input tensor's non-zero structure. +template +Tensor copyNonZeroStructure(std::vector resDims, Format format, Tensor src, int srcLevels) { + + // Double check that the result format is a prefix of the source format. 
+ taco_uassert(src.getFormat().getOrder() >= srcLevels); + taco_uassert(format.getOrder() >= src.getFormat().getOrder()); + + for (size_t i = 0; i < (size_t)srcLevels; i++) { + taco_uassert(resDims[i] == src.getDimensions()[i]); + taco_uassert(format.getModeFormats()[i] == src.getFormat().getModeFormats()[i]); + } + + Tensor result(resDims, format); + auto srcIndex = src.getStorage().getIndex(); + auto rFormat = format.getModeFormats(); + std::vector resModeIndex; + for (size_t level = 0; level < (size_t)srcLevels; level++) { + auto srcModeIndex = srcIndex.getModeIndex(level); + resModeIndex.push_back(srcModeIndex); + } + + // Need to keep track of new size starting with size of previous crd array + int denseSize = srcIndex.getModeIndex(srcLevels - 1).getIndexArray(1).getSize(); + for (size_t level = srcLevels; level < (size_t) format.getOrder(); level++) { + taco_uassert(rFormat[level] == taco::Dense) << "Can only have dense levels after sparse levels"; + Type rType = result.getTensorVar().getType(); + denseSize = denseSize * resDims[level]; + int sizeArr [1] = {resDims[level]}; + auto arr = makeArray((int*)sizeArr, (size_t)1); + std::vector denseArr; + denseArr.push_back(arr); + ModeIndex rMI(denseArr); + resModeIndex.push_back(rMI); + } + + // TODO (owhsu): Need to figure out why the dense arrays added afterwards + // don't seem to be correct (i.e. index l for TTM) + // even though when the Array data (above) is printed here, they seem correct. + Index resultIndex(format, resModeIndex); + result.getStorage().setIndex(resultIndex); + + // Perform a similar operation as above but for the values array. + // However, we only need to construct the values, not copy anything into them. 
+ auto srcVals = src.getStorage().getValues(); + auto srcType = srcVals.getType(); + auto srcSize = denseSize; + // Array(Datatype type, void* data, size_t size, Policy policy=Free); + Array resVals = makeArray(srcType, srcSize); + result.getStorage().setValues(resVals); + return result; +} + +// TODO (owhsu): See if we need a function to copy the nonzero structure + } #endif diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index f28b60228..f819dd3be 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -22,6 +22,7 @@ #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_printer.h" #include "taco/ir/ir.h" +#include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "taco/tensor.h" @@ -32,6 +33,8 @@ #include "taco/util/collections.h" #include "taco/util/functions.h" #include "taco/util/env.h" +#include "taco/lower/mode_format_dense.h" +#include "taco/lower/mode_format_compressed.h" using namespace std; @@ -4198,4 +4201,105 @@ IndexStmt generatePackCOOStmt(TensorVar tensor, return generatePackStmt(tensor, tensorName + "_COO", bufferFormat, indexVars, otherIsOnRight); } + +bool preservesNonZeroStructure(IndexStmt stmt, NonZeroAnalyzerResult& res) { + // TODO (rohany): Handle when the statement can contain workspaces. + + // We have to use a unique_ptr here to get around the overloaded operator= + // on Access types. + // std::cout << stmt << std::endl; + std::unique_ptr resultAccess = nullptr; + Where where = nullptr; + // First, let's find the output access. + match(stmt, std::function([&](const AssignmentNode* node){ + // There should only be one output access. + + taco_iassert(!resultAccess); + resultAccess = std::make_unique(node->lhs); + }), + std::function([&](const WhereNode* node, Matcher* ctx){ + // FIXME (owhsu): Patch to handle where statements with workspaces. 
+ // Only need to go to the consumer side since that will + // get the outer-most output access + ctx->match(node->consumer); + })); + // Some expressions don't have assignments, and thus won't have an RHS to consider. + if (resultAccess == nullptr) { + return false; + } + + // Now, there can only be one non-dense tensor in the LHS. + std::vector sparseRHSTensors; + match(stmt, std::function([&](const AssignmentNode* node, Matcher* m) { + // Only visit the RHS to ensure that we count only RHS tensors. + m->match(node->rhs); + }), std::function([&](const AccessNode* node) { + bool allDense = true; + auto formats = node->tensorVar.getFormat().getModeFormats(); + for (size_t i = 0; i < formats.size(); i++) { + if (!formats[i].is()) { + allDense = false; + break; + } + } + if (!allDense) { + sparseRHSTensors.push_back(node); + } + })); + + // If there is more than one sparse tensor in the RHS, then the operation + // does not preserve non-zero structure. + // TODO (rohany): This is a little too strict. A more general policy would allow + // multiple sparse tensors in the RHS as long the merges occurring with those + // sparse tensors occur at a lower level than the + if (sparseRHSTensors.size() != 1) { + return false; + } + + Access inputAccess(sparseRHSTensors[0]); + // Finally, the result access must have the same formats and index + // variable accesses as the same sized prefix of the input. + // TODO (rohany): This is actually too restrictive of a check. This + // will matter more if we have any benchmarks that do this, but the + // real check is as follows: + // * Some prefix of result matches (in variables and formats) to input. + // * After that prefix, the remaining levels of result must be dense. + // This check allows for the following formats: Result({Sparse, Dense}), + // Input({Sparse}). 
In this case, the non-zero structure of result is + // in fact determined by the input, and the remaining dimensions of result + // are dense so they don't matter when considering non-zero structure. + // For reference on how to implement something like this, see + // https://github.com/tensor-compiler/taco/compare/parallel-sparse-results. + auto rVars = resultAccess->getIndexVars(); + auto iVars = inputAccess.getIndexVars(); + auto rFormat = resultAccess->getTensorVar().getFormat().getModeFormats(); + auto iFormat = inputAccess.getTensorVar().getFormat().getModeFormats(); + + // The result tensor must have dimension <= the input dimension. + if (rVars.size() > iVars.size()) { + return false; + } + + for (size_t i = 0; i < rVars.size(); i++) { + if (rVars[i] != iVars[i]) return false; + // The mode formats don't have dynamic type tags right now, so + // this is all that we can do. We can't just check if they are + // exactly equal, because the RectCompressedModeFormats are + // parametrized on their dimensionality. + if (rFormat[i].is()) { + if (!iFormat[i].is()) return false; + } else if (rFormat[i].is()) { + if (!iFormat[i].is()) return false; + } else { + return false; + } + } + + // At this point we are sure that the input statement + // has the same output non-zero pattern as the input, + // so populate the output and return true. 
+ res = NonZeroAnalyzerResult(std::move(resultAccess), std::make_unique(inputAccess)); + return true; +} + } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index cb5437322..1598473aa 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -802,6 +802,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { set reductionIndexVars; set parentParallelUnits; std::string reason = ""; + bool preservesNonZeroOutputStructure = false; IndexStmt rewriteParallel(IndexStmt stmt) { provGraph = ProvenanceGraph(stmt); @@ -824,6 +825,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { assembledByUngroupedInsert.push_back(tensorVars[result]); } + NonZeroAnalyzerResult res; + this->preservesNonZeroOutputStructure = preservesNonZeroStructure(stmt, res); + return rewrite(stmt); } @@ -874,27 +878,29 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { iterators, provGraph, definedIndexVars); - // Precondition 3: Every result iterator must have insert capability - for (Iterator iterator : underivedLattice.results()) { - if (util::contains(assembledByUngroupedInsert, iterator.getTensor())) { - for (Iterator it = iterator; !it.isRoot(); it = it.getParent()) { - if (it.hasInsertCoord() || !it.isYieldPosPure()) { - reason = "Precondition failed: The output tensor does not " - "support parallelized inserts"; - return; - } - } - } else { - while (true) { - if (!iterator.hasInsert()) { - reason = "Precondition failed: The output tensor must support " - "inserts"; - return; + // Precondition 3: Every result iterator must have insert capability, + // or the result tensor preserves the same non-zero structure as the input. 
+ if (!this->preservesNonZeroOutputStructure) { + for (Iterator iterator : lattice.results()) { + if (util::contains(assembledByUngroupedInsert, iterator.getTensor())) { + for (Iterator it = iterator; !it.isRoot(); it = it.getParent()) { + if (it.hasInsertCoord() || !it.isYieldPosPure()) { + reason = "Precondition failed: The output tensor does not " + "support parallelized inserts"; + return; + } } - if (iterator.isLeaf()) { - break; + } else { + while (true) { + if (!iterator.hasInsert()) { + reason = "Precondition failed: The output tensor must allow inserts"; + return; + } + if (iterator.isLeaf()) { + break; + } + iterator = iterator.getChild(); } - iterator = iterator.getChild(); } } } diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index a3f8cface..51058dd14 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -44,8 +44,11 @@ struct Iterator::Content { Window(ir::Expr _lo, ir::Expr _hi, ir::Expr _stride, ir::Expr _windowVar) : windowVar(_windowVar), lo(_lo), hi(_hi), stride(_stride) {}; }; - std::unique_ptr window; + std::shared_ptr window; Iterator indexSetIterator; + + // The iterator that this iterator is tracking, if any + Iterator trackingIterator; }; Iterator::Iterator() : content(nullptr) { @@ -452,7 +455,7 @@ bool Iterator::isStrided() const { void Iterator::setWindowBounds(ir::Expr lo, ir::Expr hi, ir::Expr stride) { auto windowVarName = this->getIndexVar().getName() + this->getMode().getName() + "_window"; auto wvar = ir::Var::make(windowVarName, Int()); - this->content->window = std::make_unique(Content::Window(lo, hi, stride, wvar)); + this->content->window = std::make_shared(Content::Window(lo, hi, stride, wvar)); } bool Iterator::hasIndexSet() const { @@ -467,6 +470,10 @@ void Iterator::setIndexSetIterator(Iterator iter) { this->content->indexSetIterator = iter; } +Iterator Iterator::getTrackingIterator() const { + return this->content->trackingIterator; +} + bool operator==(const Iterator& a, const Iterator& b) { if 
(a.isDimensionIterator() && b.isDimensionIterator()) { return a.getIndexVar() == b.getIndexVar(); @@ -521,6 +528,12 @@ Iterators::Iterators(IndexStmt stmt, const map& tensorVars) ProvenanceGraph provGraph = ProvenanceGraph(stmt); set underivedAdded; set computeVars; + + // Figure out whether we are able to get away with keeping the non-zero + // structure of the input for the output tensor. + NonZeroAnalyzerResult nonZeroResult; + auto preservesNonZeros = preservesNonZeroStructure(stmt, nonZeroResult); + // Create dimension iterators match(stmt, function([&](auto n, auto m) { @@ -562,6 +575,23 @@ Iterators::Iterators(IndexStmt stmt, const map& tensorVars) }) ); + if (preservesNonZeros) { + // If our output preserves the non-zero structure of the input + // tensor, then we'll modify the result iterator to track variables + // of the input tensor that control iteration. However, information + // about the mode etc will be retained. + auto resultTV = nonZeroResult.resultAccess->getTensorVar(); + auto inputTV = nonZeroResult.inputAccess->getTensorVar(); + + for (int i = 0; i < resultTV.getOrder(); i++) { + auto resultIter = this->content->levelIterators[{*nonZeroResult.resultAccess, i+1}]; + auto inputIter = this->content->levelIterators[{*nonZeroResult.inputAccess, i+1}]; + taco_iassert(resultIter.defined()); + taco_iassert(inputIter.defined()); + resultIter.content->trackingIterator = inputIter; + } + } + // Reverse the levelIterators map for fast modeAccess lookup for (auto& iterator : content->levelIterators) { content->modeAccesses.insert({iterator.second, iterator.first}); @@ -582,6 +612,33 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI int level = 1; ModeFormat parentModeType; + + // (owhsu) TODO: See if this needs to be uncommented out + // From rohany patch for nonzero structure support +// // This logic is for setting up iterators for pos splits. 
In a fused position split, +// // the original logic assigned all level iterators to have the index variable of the +// // fused pos split. This logic seems incorrect. Instead, having only the deepest level +// // that the pos split occurred use that index variable seemed like the right thing to +// // satisfy how the code was used in lowerForallFusedPosition. This block attempts to +// // find the deepest match for the position split in the coordinate tree of the tensor. +// int deepestPosMatch = -1; +// { +// int counter = 0; +// for (auto modeTypePack : format.getModeFormatPacks()) { +// taco_iassert(modeTypePack.getModeFormats().size() == 1); +// for (auto& modeType : modeTypePack.getModeFormats()) { +// int modeNumber = format.getModeOrdering()[counter]; +// IndexVar indexVar = access.getIndexVars()[modeNumber]; +// IndexVar iteratorIndexVar; +// if (provGraph.getPosIteratorDescendant(indexVar, &iteratorIndexVar) && +// provGraph.isPosOfAccess(iteratorIndexVar, access)) { +// deepestPosMatch = std::max(deepestPosMatch, modeNumber); +// } +// } +// counter++; +// } +// } + for (ModeFormatPack modeTypePack : format.getModeFormatPacks()) { vector arrays; taco_iassert(modeTypePack.getModeFormats().size() > 0); @@ -604,6 +661,15 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI else if (!provGraph.isPosOfAccess(iteratorIndexVar, access)) { // want to iterate across level as a position variable if has irregular descendant, but otherwise iterate normally iteratorIndexVar = indexVar; + + // TODO (owhsu): See if this needs to be uncommented out + // From rohany support sparse output nonzero structure preserved +// IndexVar posFuseVar; +// IndexVar iteratorIndexVar = indexVar; +// // As described above, only set the deepest position split match to be the fused iterator. 
+// if (provGraph.getPosIteratorDescendant(indexVar, &posFuseVar) && +// provGraph.isPosOfAccess(posFuseVar, access) && modeNumber == deepestPosMatch) { +// iteratorIndexVar = posFuseVar; } Mode mode(tensorIR, dim, level, modeType, modePack, pos, parentModeType); diff --git a/src/lower/lower.cpp b/src/lower/lower.cpp index 511dcb442..d7ad6bbfd 100644 --- a/src/lower/lower.cpp +++ b/src/lower/lower.cpp @@ -45,13 +45,13 @@ std::shared_ptr Lowerer::getLowererImpl() { } ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack, + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros, Lowerer lowerer) { string reason; taco_iassert(isLowerable(stmt, &reason)) << "Not lowerable, because " << reason << ": " << stmt; - ir::Stmt lowered = lowerer.getLowererImpl()->lower(stmt, name, assemble, compute, pack, unpack); + ir::Stmt lowered = lowerer.getLowererImpl()->lower(stmt, name, assemble, compute, pack, unpack, enablePreserveNonZeros); // TODO: re-enable this // std::string messages; diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 6a3baadb3..a7c21c95e 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -232,10 +232,11 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, Stmt LowererImplImperative::lower(IndexStmt stmt, string name, - bool assemble, bool compute, bool pack, bool unpack) + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros) { this->assemble = assemble; this->compute = compute; + this->enablePreserveNonZeros = enablePreserveNonZeros; definedIndexVarsOrdered = {}; definedIndexVars = {}; loopOrderAllowsShortCircuit = allForFreeLoopsBeforeAllReductionLoops(stmt); @@ -297,6 +298,14 @@ LowererImplImperative::lower(IndexStmt stmt, string name, // Create variables for keeping track of result values array capacity createCapacityVars(resultVars, 
&capacityVars); + // Figure out whether we can preserve the non-zero structure + // of an input tensor for the result tensor. However, to ensure + // that we don't break normal TACO usage, we'll only do this + // when enabled. + if (this->enablePreserveNonZeros) { + this->preservesNonZeros = preservesNonZeroStructure(stmt, this->nonZeroAnalyzerResult); + } + // Create iterators iterators = Iterators(stmt, tensorVars); @@ -2075,14 +2084,48 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, const set& reducedAccesses, MergeStrategy mergeStrategy) { - // Inserter positions - Stmt declInserterPosVars = declLocatePosVars(inserters); + // If we're performing the optimization that allows for the non-zero structure + // of an input tensor to be maintained in the result tensor, then we need to + // do some cleanup here before dealing with appenders and inserters. In particular, + // the iterator for the result tensor will be an appender (as it is sparse), but + // will not quite be a inserter or locator, as it is tracking the variables from + // the input tensor it is aligned with. Therefore, we will just ignore generating + // appender related code for the result tensor in this case. + Iterator resultNonZeroIter; + if (this->preservesNonZeros) { + std::vector realAppenders; + for (auto &iter : appenders) { + if (iter.getTensor() != this->tensorVars[this->nonZeroAnalyzerResult.resultAccess->getTensorVar()]) { + realAppenders.push_back(iter); + } else { + resultNonZeroIter = iter; + } + } + appenders = realAppenders; + } + + // There can be overlaps between the inserters and locators, which results in + // duplicate emitting of variable declarations. We'll fix that here. 
+ std::vector itersWithLocators; + for (auto it : inserters) { + if (!util::contains(itersWithLocators, it)) { itersWithLocators.push_back(it); } + } + for (auto it : locators) { + if (!util::contains(itersWithLocators, it)) { itersWithLocators.push_back(it); } + } + auto declPosVars = declLocatePosVars(itersWithLocators); - // Locate positions - Stmt declLocatorPosVars = declLocatePosVars(locators); + Stmt trackPosVars; + if (this->preservesNonZeros && resultNonZeroIter.defined()) { + if (resultNonZeroIter.hasPosIter()) { + auto tracking = resultNonZeroIter.getTrackingIterator(); + taco_iassert(tracking.defined()); + trackPosVars = ir::VarDecl::make(resultNonZeroIter.getPosVar(), tracking.getPosVar()); + } + } if (captureNextLocatePos) { - capturedLocatePos = Block::make(declInserterPosVars, declLocatorPosVars); + capturedLocatePos = declPosVars; captureNextLocatePos = false; } @@ -2111,7 +2154,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, append(stmts, loweredCases); Stmt body = Block::make(stmts); - return Block::make(declInserterPosVars, declLocatorPosVars, body); + return Block::make(declPosVars, trackPosVars, body); } Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); @@ -2137,8 +2180,8 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // TODO: Emit code to insert coordinates return Block::make(initVals, - declInserterPosVars, - declLocatorPosVars, + declPosVars, + trackPosVars, body, appendCoords, incr); diff --git a/src/tensor.cpp b/src/tensor.cpp index 9588bf224..1d83a724c 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -60,7 +60,7 @@ TensorBase::TensorBase(std::string name, Datatype ctype) : TensorBase(name, ctype, {}, Format(), Literal::zero(ctype)) { } -TensorBase::TensorBase(Datatype ctype, vector dimensions, +TensorBase::TensorBase(Datatype ctype, vector dimensions, ModeFormat modeType, Literal fill) : TensorBase(util::uniqueName('A'), ctype, dimensions, 
std::vector(dimensions.size(), modeType), fill) { @@ -225,6 +225,10 @@ void TensorBase::setNeedsCompute(bool needsCompute) { content->needsCompute = needsCompute; } +void TensorBase::setPreserveNonZero(bool preserveNonZero) { + content->preserveNonZero = preserveNonZero; +} + bool TensorBase::neverPacked() { return content->neverPacked; } @@ -672,13 +676,21 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { } } - content->assembleFunc = lower(stmtToCompile, "assemble", true, false); - content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true); + if (content->preserveNonZero) { + content->assembleFunc = lower(stmtToCompile, "assemble", true, false, false, + false, true); + content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true, + false, false, true); + } else { + content->assembleFunc = lower(stmtToCompile, "assemble", true, false); + content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true); + } // If we have to recompile the kernel, we need to create a new Module. Since // the module we are holding on to could have been retrieved from the cache, // we can't modify it. 
content->module = make_shared(); - content->module->addFunction(content->assembleFunc); + if (!content->preserveNonZero) + content->module->addFunction(content->assembleFunc); content->module->addFunction(content->computeFunc); content->module->compile(); cacheComputeKernel(concretizedAssign, content->module); @@ -1064,6 +1076,55 @@ bool scalarEquals(std::complex a, std::complex b) { return true; } +template +bool equalsTypedInt64(const TensorBase& a, const TensorBase& b) { + auto at = iterate(a); + auto bt = iterate(b); + auto ait = at.template beginTyped(); + auto bit = bt.template beginTyped(); + + while (ait != at.template endTyped() && bit != bt.template endTyped()) { + auto acoord = ait->first; + auto bcoord = bit->first; + auto aval = ait->second; + auto bval = bit->second; + + if (acoord != bcoord) { + if (isZero(aval)) { + ++ait; + continue; + } + else if (isZero(bval)) { + ++bit; + continue; + } + + return false; + } + if (!scalarEquals(aval, bval)) { + return false; + } + + ++ait; + ++bit; + } + while (ait != at.template endTyped()) { + auto aval = ait->second; + if (!isZero(aval)) { + return false; + } + ++ait; + } + while (bit != bt.template endTyped()) { + auto bval = bit->second; + if (!isZero(bval)) { + return false; + } + ++bit; + } + return (ait == at.template endTyped() && bit == bt.template endTyped()); +} + template bool equalsTyped(const TensorBase& a, const TensorBase& b) { auto at = iterate(a); @@ -1113,6 +1174,52 @@ bool equalsTyped(const TensorBase& a, const TensorBase& b) { return (ait == at.end() && bit == bt.end()); } +bool equalsInt64(const TensorBase& a, const TensorBase& b) { + // Component type must be the same + if (a.getComponentType() != b.getComponentType()) { + return false; + } + + // Fill values must be the same + if (!equals(a.getFillValue(), b.getFillValue())) { + return false; + } + + // Orders must be the same + if (a.getOrder() != b.getOrder()) { + return false; + } + + // Dimensions must be the same + for (int mode = 0; 
mode < a.getOrder(); mode++) { + if (a.getDimension(mode) != b.getDimension(mode)) { + return false; + } + } + + // Values must be the same + switch(a.getComponentType().getKind()) { + case Datatype::Bool: taco_ierror; return false; + case Datatype::UInt8: return equalsTypedInt64(a, b); + case Datatype::UInt16: return equalsTypedInt64(a, b); + case Datatype::UInt32: return equalsTypedInt64(a, b); + case Datatype::UInt64: return equalsTypedInt64(a, b); + case Datatype::UInt128: return equalsTypedInt64(a, b); + case Datatype::Int8: return equalsTypedInt64(a, b); + case Datatype::Int16: return equalsTypedInt64(a, b); + case Datatype::Int32: return equalsTypedInt64(a, b); + case Datatype::Int64: return equalsTypedInt64(a, b); + case Datatype::Int128: return equalsTypedInt64(a, b); + case Datatype::Float32: return equalsTypedInt64(a, b); + case Datatype::Float64: return equalsTypedInt64(a, b); + case Datatype::Complex64: return equalsTypedInt64>(a, b); + case Datatype::Complex128: return equalsTypedInt64>(a, b); + case Datatype::Undefined: taco_ierror << "Undefined data type"; + } + taco_unreachable; + return false; +} + bool equals(const TensorBase& a, const TensorBase& b) { // Component type must be the same if (a.getComponentType() != b.getComponentType()) { diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..f61381c11 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -32,6 +32,12 @@ void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual) { ASSERT_TRUE(equals(expected, actual)); } +void ASSERT_TENSOR_EQ_INT64(TensorBase expected, TensorBase actual) { + SCOPED_TRACE(string("expected: ") + util::toString(expected)); + SCOPED_TRACE(string(" actual: ") + util::toString(actual)); + ASSERT_TRUE(equalsInt64(expected, actual)); +} + void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { SCOPED_TRACE(string("expected: ") + util::toString(expected)); SCOPED_TRACE(string(" actual: ") + util::toString(actual)); diff --git a/test/test.h b/test/test.h 
index 3302bf81f..d19452ccc 100644 --- a/test/test.h +++ b/test/test.h @@ -61,6 +61,7 @@ void ASSERT_VECTOR_EQ(std::vector expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +void ASSERT_TENSOR_EQ_INT64(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index a16ab11f9..40e123f5f 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -2249,3 +2249,107 @@ TEST(scheduling_eval, DISABLED_bfsPullScheduled) { expected.compute(); ASSERT_TENSOR_EQ(expected, y); } + +TEST(hypermapper, TTM_unsched) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 4; + int NUM_J = 4; + int NUM_K = 4; + int NUM_L = 4; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (i))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (l))); + } + } + + B.pack(); + C.pack(); + + Tensor result("result",{NUM_I, NUM_J, NUM_L}, {Sparse, Sparse, Dense}); + result(i,j,l) = B(i,j,k) * C(k,l); + result.setPreserveNonZero(true); + result.compile(); + result.assemble(); + result.compute(); + + std::cout << "RESULT: " << result << std::endl; + + A(i,j,l) = B(i,j,k) * C(k, l); + A.compile(); + A.assemble(); + A.compute(); + std::cout << "A: " << A << std::endl; + ASSERT_TENSOR_EQ_INT64(A, result); 
+} + +TEST(hypermapper, TTM_unsched_copy) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 4; + int NUM_J = 4; + int NUM_K = 4; + int NUM_L = 4; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (i))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (l))); + } + } + + B.pack(); + C.pack(); + + std::vector dims = {NUM_I, NUM_J, NUM_L}; + Tensor result = copyNonZeroStructure(dims, {Sparse, Sparse, Dense}, B, 2); + result(i,j,l) = B(i,j,k) * C(k,l); + result.setPreserveNonZero(true); + result.compile(); + result.compute(); + + std::cout << "RESULT: " << result << std::endl; + + A(i,j,l) = B(i,j,k) * C(k, l); + A.compile(); + A.assemble(); + A.compute(); + std::cout << "A: " << A << std::endl; + ASSERT_TENSOR_EQ_INT64(A, result); +} \ No newline at end of file From 54a31fa52a07a5d875e951484f70ad05eb54d4eb Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 13:14:11 -0800 Subject: [PATCH 2/8] Add in preserveNonZeros for TTV --- apps/taco_dse/hypermapper_taco_client.cpp | 6 ++++-- apps/taco_dse/taco_helper.h | 14 +++++++++----- include/taco/tensor.h | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 25ebf7d11..099a50b76 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -899,7 +899,8 @@ HMObjective 
calculateObjectiveTTVDense(std::vector &InputPar int temp_chunk_size_k = 1; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); - ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, temp_loop_ordering, temp_omp_scheduling_type, temp_omp_chunk_size, temp_omp_num_threads, false); + ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, + temp_loop_ordering, temp_omp_scheduling_type, temp_omp_chunk_size, temp_omp_num_threads, false, 5); ttv_handler->set_cold_run(); default_config_time = ttv_handler->get_compute_time(); @@ -908,7 +909,8 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar if(!no_sched_init) { try{ - ttv_handler->schedule_and_compute(temp_result, chunk_size_i, chunk_size_fpos, chunk_size_k, loop_ordering, omp_scheduling_type, omp_chunk_size, omp_num_threads, false); + ttv_handler->schedule_and_compute(temp_result, chunk_size_i, chunk_size_fpos, chunk_size_k, + loop_ordering, omp_scheduling_type, omp_chunk_size, omp_num_threads, false, 5); ttv_handler->set_cold_run(); double compute_time = ttv_handler->get_compute_time(); Obj.compute_time = compute_time; diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 0cdc63b3e..7855aebf7 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1379,11 +1379,14 @@ class TTV : public tacoOp { int get_num_j() { return NUM_J; } double compute_unscheduled() { - taco::Tensor result({NUM_I, NUM_J}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J}, {taco::Sparse, taco::Sparse}, B, 2); result(i, j) = B(i, j, k) * c(k); taco::util::Timer timer; + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); + result.setAssembleWhileCompute(false); result.compile(); - result.assemble(); + //result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1430,7 +1433,7 @@ class TTV : 
public tacoOp { void schedule_and_compute(taco::Tensor &result_, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=10) { - taco::Tensor result("result", {NUM_I, NUM_J}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J}, {taco::Sparse, taco::Sparse}, B, 2); result(i, j) = B(i, j, k) * c(k); // std::cout << "Elements: " << std::endl; @@ -1456,9 +1459,10 @@ class TTV : public tacoOp { taco::util::Timer timer; std::vector compute_times; timer.clear_cache(); + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); result.compile(sched); - result.setNeedsAssemble(true); - result.assemble(); +// result.assemble(); for(int i = 0; i < num_reps; i++) { timer.start(); result.setNeedsCompute(true); diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 683bbabc1..bb8b8e12d 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -1352,7 +1352,7 @@ Tensor copyNonZeroStructure(std::vector resDims, Format format, Tensor= srcLevels); - taco_uassert(format.getOrder() >= src.getFormat().getOrder()); + taco_uassert(format.getOrder() >= srcLevels); for (size_t i = 0; i < (size_t)srcLevels; i++) { taco_uassert(resDims[i] == src.getDimensions()[i]); From 74579a2066416d7d59a7b39797d7bb0d0d16d6e7 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 13:41:14 -0800 Subject: [PATCH 3/8] Add in changes to TTV to get the schedule faster --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- apps/taco_dse/taco_helper.h | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 099a50b76..595a122c7 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -896,7 +896,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar int 
temp_omp_chunk_size = 1; int temp_chunk_size_i = 1; int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 1; + int temp_chunk_size_k = 16; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 7855aebf7..41215eeb3 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1272,7 +1272,7 @@ class TTV : public tacoOp { taco::Tensor B; taco::Tensor c; taco::IndexStmt stmt; - taco::IndexVar f, fpos, chunk, fpos2, k1, k2, i0, i1; + taco::IndexVar f, fpos, chunk, fpos2, k1, k2, kpos, kpos1, kpos2, i0, i1; int run_mode, num_reps; TTV(int mode, int NUM_I = 1000, int NUM_J = 1000, int NUM_K = 1000, float SPARSITY = .3) : NUM_I{NUM_I}, NUM_J{NUM_J}, @@ -1288,7 +1288,8 @@ class TTV : public tacoOp { { } TTV() : run_mode(1), initialized{false}, cold_run{true}, - f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), k1("k1"), k2("k2"), i0("i0"), i1("i1") {} + f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), k1("k1"), k2("k2"), i0("i0"), i1("i1"), + kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"){} float get_sparsity() { return (run_mode == 0) ? 
SPARSITY : inputCache.get_sparsity(); } void set_cold_run() { cold_run = true; } void initialize_data(int mode = RANDOM) override @@ -1348,7 +1349,7 @@ class TTV : public tacoOp { B.pack(); c.pack(); - std::vector reorder_{i0, chunk, fpos2, k1, k2}; + std::vector reorder_{i0, chunk, fpos2, kpos1, kpos2}; compute_reordering(reorder_); // Avoid duplicate reinitialize initialized = true; @@ -1386,7 +1387,7 @@ class TTV : public tacoOp { result.setNeedsAssemble(false); result.setAssembleWhileCompute(false); result.compile(); - //result.assemble(); + // result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1413,9 +1414,12 @@ class TTV : public tacoOp { return sched.split(i, i0, i1, chunk_size_i).fuse(i1, j, f) .pos(f, fpos, B(i,j,k)) .split(fpos, chunk, fpos2, chunk_size_fpos) - .split(k, k1, k2, chunk_size_k) + .pos(k, kpos, B(i,j,k)) + .split(kpos, kpos1, kpos2, chunk_size_k) .reorder(reorder) - .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos2, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // return stmt.fuse(i, j, f) // .pos(f, fpos, B(i,j,k)) From dd4cf5fb5b6089973bf98b471692e9eaef3547ae Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Tue, 28 Feb 2023 16:26:47 -0800 Subject: [PATCH 4/8] Add in fix to predictor value --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 595a122c7..08d264536 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -1336,7 +1336,7 @@ int main(int argc, char **argv) { int omp_chunk_size = program.get("--omp_chunk_size"); bool Predictor = false; if (test_name == "TTV" || test_name == "MTTKRP" || test_name == "TTM") { - Predictor = false; + Predictor = true; } std::string log_file_ = 
"hypermapper_taco_log.csv"; From adffc62c06ee7fa8f0834055e3746eaf924535ae Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 00:43:33 +0000 Subject: [PATCH 5/8] Update starting time for runs --- apps/taco_dse/hypermapper_taco_client.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 08d264536..938a8726f 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -874,7 +874,8 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar timer = ttv_handler->compute_unscheduled(); compute_times.push_back(timer); } - Obj.compute_time = median(compute_times); + // Obj.compute_time = median(compute_times); + no_sched_time = median(compute_times); no_sched_init = true; cout << "computed unscheduled" << endl; } @@ -896,7 +897,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar int temp_omp_chunk_size = 1; int temp_chunk_size_i = 1; int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 16; + int temp_chunk_size_k = 8; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, @@ -904,6 +905,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar ttv_handler->set_cold_run(); default_config_time = ttv_handler->get_compute_time(); + Obj.compute_time = default_config_time; logger << ttv_handler->get_num_i() << "," << ttv_handler->get_num_j() << "," << default_config_time << "," << no_sched_time << std::endl; } From b8b52ae0be05073643a1899399963ce8b9362ef4 Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 14:47:29 -0800 Subject: [PATCH 6/8] pushing randomized ttv --- apps/taco_dse/hypermapper_taco_client.cpp | 20 +++--- apps/taco_dse/taco_helper.h | 78 ++++++++++++++--- 2 files changed, 60 insertions(+), 38 deletions(-)
diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 938a8726f..5afcbd4ea 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -846,9 +846,9 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar std::vector loop_ordering = static_cast>*>(InputParams[6])->getVal(); std::vector default_ordering{0,1,2,3,4}; - int NUM_I = 10000; - int NUM_J = 10000; - int NUM_K = 1000; + int NUM_I = 1000; + int NUM_J = 100; + int NUM_K = 100; std::vector compute_times; @@ -858,7 +858,11 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar cout << "INITIALIZING" << endl; ttv_handler = new TTV(); ttv_handler->matrix_name = matrix_name; - ttv_handler->initialize_data(1); + ttv_handler->SPARSITY = 0.1; + ttv_handler->NUM_I = NUM_I; + ttv_handler->NUM_J = NUM_J; + ttv_handler->NUM_K = NUM_K; + ttv_handler->initialize_data(0); initialized = true; // sparsity = ttv_handler->get_sparsity(); num_i = ttv_handler->NUM_I; @@ -894,10 +898,10 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar // int temp_unroll_factor = 8; std::vector temp_loop_ordering{0,1,2,3,4}; int temp_omp_scheduling_type = 0; - int temp_omp_chunk_size = 1; - int temp_chunk_size_i = 1; - int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 8; + int temp_omp_chunk_size = 16; + int temp_chunk_size_i = 16; + int temp_chunk_size_fpos = 16; + int temp_chunk_size_k = 16; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 41215eeb3..287fe0eb6 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -170,7 +170,7 @@ struct UfuncInputCache { } template - taco::Tensor getTensor(std::string path, U format, bool countNNZ = false, float sparsity=0.3, int 
num_k = 1000, bool includeThird = false) { + taco::Tensor getTensor(std::string path, U format, bool shift_dim=false, bool countNNZ = false, float sparsity=0.3, int num_k = 1000, bool includeThird = false) { // See if the paths match. if (this->lastPath == path) { // TODO (rohany): Not worrying about whether the format was the same as what was asked for. @@ -191,6 +191,17 @@ struct UfuncInputCache { this->num_j = this->inputTensor.getDimensions()[1]; this->num_k = this->inputTensor.getDimensions()[2]; + int last_dim = 0; + if (shift_dim) { + last_dim = this->inputTensor.getDimensions()[3]; + } + + taco::Tensor copy("test", {this->num_i, this->num_k, last_dim}, taco::Sparse); + + // for (auto component : this->inputTensor) { + + // } + if (countNNZ) { this->nnz = 0; #ifdef TACO_DEFAULT_INTEGER_TYPE @@ -1302,39 +1313,46 @@ class TTV : public tacoOp { return; srand(9536); - // for (int i = 0; i < NUM_I; i++) - // { - // for (int j = 0; j < NUM_J; j++) - // { - // for (int k = 0; k < NUM_K; k++) - // { - // float rand_float = (float)rand() / (float)(RAND_MAX); - // if (rand_float < SPARSITY) - // { - // B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); - // } - // } - // } - // } - auto ssPath = std::getenv("FROST_PATH"); - if(ssPath == nullptr) { - std::cout << "Environment variable FROST_PATH not set\n"; + if (mode == RANDOM) { + taco::Tensor res("res", {NUM_I, NUM_J, NUM_K}, taco::Sparse); + B = res; + for (int i = 0; i < NUM_I; i++) + { + for (int j = 0; j < NUM_J; j++) + { + for (int k = 0; k < NUM_K; k++) + { + float rand_float = (float)rand() / (float)(RAND_MAX); + if (rand_float < SPARSITY) + { + B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); + } + } + } + } } - std::string ssPathStr = std::string(ssPath); - char sep = '/'; - std::string matrix_path; - if (ssPathStr[ssPathStr.length()] == sep) { - matrix_path = ssPathStr + matrix_name; - } else { - matrix_path = ssPathStr + "/" + matrix_name; - } + else { + auto ssPath = 
std::getenv("FROST_PATH"); + if(ssPath == nullptr) { + std::cout << "Environment variable FROST_PATH not set\n"; + } + std::string ssPathStr = std::string(ssPath); - B = inputCache.getTensor(matrix_path, Sparse, true); - NUM_I = inputCache.num_i; - NUM_J = inputCache.num_j; - NUM_K = inputCache.num_k; + char sep = '/'; + std::string matrix_path; + if (ssPathStr[ssPathStr.length()] == sep) { + matrix_path = ssPathStr + matrix_name; + } else { + matrix_path = ssPathStr + "/" + matrix_name; + } + + B = inputCache.getTensor(matrix_path, Sparse, true); + NUM_I = inputCache.num_i; + NUM_J = inputCache.num_j; + NUM_K = inputCache.num_k; + } std::cout << "Dimensions: " << NUM_I << ", " << NUM_J << ", " << NUM_K << std::endl; From 11ff6466a9ddcdff3aa4a6ba9c91dd69769cefc3 Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 14:56:53 -0800 Subject: [PATCH 7/8] adding changes --- apps/taco_dse/taco_helper.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 287fe0eb6..47fe97b04 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1314,6 +1314,7 @@ class TTV : public tacoOp { srand(9536); + int nnz = 0; if (mode == RANDOM) { taco::Tensor res("res", {NUM_I, NUM_J, NUM_K}, taco::Sparse); B = res; @@ -1327,6 +1328,7 @@ class TTV : public tacoOp { if (rand_float < SPARSITY) { B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); + nnz++; } } } @@ -1355,6 +1357,7 @@ class TTV : public tacoOp { } std::cout << "Dimensions: " << NUM_I << ", " << NUM_J << ", " << NUM_K << std::endl; + std::cout << "NNZ: " << nnz << std::endl; taco::Tensor c_("c", {NUM_K}, taco::Format{taco::ModeFormat::Dense}); c = c_; From 2a7be92e22634e8bc3e46b7e996351d8e5d5087a Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 15:02:54 -0800 Subject: [PATCH 8/8] fix minor bug --- apps/taco_dse/hypermapper_taco_client.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 5afcbd4ea..0c0bdcbc1 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -887,6 +887,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar //Initiate scheduling passing in chunk_size (param to optimize) bool default_config = (chunk_size_i == 16); bool valid = true; + Obj.valid = valid; compute_times = vector(); ttv_handler->set_cold_run();