From 98020a0087b9be42115688ba8e2fd1efbc8d05a2 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 01:57:55 -0800 Subject: [PATCH 1/8] Add in changes that should work for propagating output format based on input format --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- apps/taco_dse/taco_helper.h | 37 +++--- include/taco/format.h | 10 ++ include/taco/index_notation/index_notation.h | 16 +++ include/taco/lower/iterator.h | 5 + include/taco/lower/lower.h | 1 + include/taco/lower/lowerer_impl.h | 2 +- include/taco/lower/lowerer_impl_imperative.h | 10 +- include/taco/tensor.h | 63 ++++++++++ src/index_notation/index_notation.cpp | 104 +++++++++++++++++ src/index_notation/transformations.cpp | 44 ++++--- src/lower/iterator.cpp | 70 ++++++++++- src/lower/lower.cpp | 4 +- src/lower/lowerer_impl_imperative.cpp | 61 ++++++++-- src/tensor.cpp | 115 ++++++++++++++++++- test/test.cpp | 6 + test/test.h | 1 + test/tests-scheduling-eval.cpp | 104 +++++++++++++++++ 18 files changed, 593 insertions(+), 62 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index c98c598fd..25ebf7d11 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -974,7 +974,7 @@ HMObjective calculateObjectiveTTMDense(std::vector &InputPar compute_times = vector(); ttm_handler->set_cold_run(); - taco::Tensor temp_result({ttm_handler->NUM_I, ttm_handler->NUM_J, ttm_handler->NUM_L}, taco::dense); + taco::Tensor temp_result({ttm_handler->NUM_I, ttm_handler->NUM_J, ttm_handler->NUM_L}, {taco::Sparse, taco::Sparse, taco::Dense}); std::vector valid_perm(120, true); std::vector> orders; diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index f6880ba13..0cdc63b3e 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1427,9 +1427,10 @@ class TTV : public tacoOp { // .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); } - void 
schedule_and_compute(taco::Tensor &result, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, + void schedule_and_compute(taco::Tensor &result_, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=10) { + taco::Tensor result("result", {NUM_I, NUM_J}, taco::dense); result(i, j) = B(i, j, k) * c(k); // std::cout << "Elements: " << std::endl; @@ -1481,23 +1482,6 @@ class TTV : public tacoOp { timer.clear_cache(); - // taco::util::Timer timer; - // timer.clear_cache(); - // result.setAssembleWhileCompute(true); - //result.compile(sched); - //result.setNeedsAssemble(true); - // result.assemble(); - // timer.start(); - // result.compute(); - // timer.stop(); - - // compute_time = timer.getResult().mean; - // if(default_config) { - // default_compute_time = timer.getResult().mean; - // } - // timer.clear_cache(); - - } void compute(bool default_config = false) override @@ -1626,11 +1610,14 @@ class TTM : public tacoOp { } double compute_unscheduled() { - taco::Tensor result({NUM_I, NUM_J, NUM_L}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J, NUM_L}, {taco::Sparse, taco::Sparse, taco::Dense}, B, 2); result(i,j,l) = B(i,j,k) * C(k,l); taco::util::Timer timer; + result.setPreserveNonZero(true); + result.setAssembleWhileCompute(false); + result.setNeedsAssemble(false); result.compile(); - result.assemble(); +// result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1661,7 +1648,7 @@ class TTM : public tacoOp { } void schedule_and_compute(taco::Tensor &result_, int chunk_size, int unroll_factor, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=20) { - taco::Tensor result("result", {NUM_I, NUM_J, NUM_L}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J, NUM_L}, {taco::Sparse, taco::Sparse, 
taco::Dense}, B, 2); result(i,j,l) = B(i,j,k) * C(k,l); taco::IndexStmt sched = result.getAssignment().concretize(); @@ -1680,11 +1667,15 @@ class TTM : public tacoOp { taco::util::Timer timer; std::vector compute_times; timer.clear_cache(); + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); result.compile(sched); - result.setNeedsAssemble(true); - result.assemble(); + +// result.assemble(); for(int i = 0; i < num_reps; i++) { timer.start(); + result.setPreserveNonZero(true); + result.setNeedsCompute(true); result.compute(); timer.stop(); diff --git a/include/taco/format.h b/include/taco/format.h index 81bdadda4..c957649eb 100644 --- a/include/taco/format.h +++ b/include/taco/format.h @@ -157,6 +157,16 @@ class ModeFormat { friend bool operator!=(const ModeFormat&, const ModeFormat&); friend std::ostream& operator<<(std::ostream&, const ModeFormat&); + template + bool is() { + return std::dynamic_pointer_cast(this->impl) != nullptr; + } + template + std::shared_ptr as() { + taco_iassert(this->is()); + return std::dynamic_pointer_cast(this->impl); + } + private: std::shared_ptr impl; diff --git a/include/taco/index_notation/index_notation.h b/include/taco/index_notation/index_notation.h index d5fe90a97..f03b233e7 100644 --- a/include/taco/index_notation/index_notation.h +++ b/include/taco/index_notation/index_notation.h @@ -1317,5 +1317,21 @@ IndexStmt generatePackStmt(TensorVar tensor, IndexStmt generatePackCOOStmt(TensorVar tensor, std::vector indexVars, bool otherIsOnRight); +// preservesNonZeroStructure returns true if the output of the input +// stmt has the same non-zero structure as one of the inputs. If it does, +// it populates the input NonZeroAnalyzerResult with information about +// why the non-zero structure is preserved. 
+struct NonZeroAnalyzerResult { + NonZeroAnalyzerResult() {} + NonZeroAnalyzerResult(std::unique_ptr resultAccess, std::unique_ptr inputAccess) + : resultAccess(std::move(resultAccess)), + inputAccess(std::move(inputAccess)) {} + // Note that these accesses are wrapped in std::unique_ptr so that + // we avoid issues around the overloaed operator= on Access types. + std::unique_ptr resultAccess; + std::unique_ptr inputAccess; +}; +bool preservesNonZeroStructure(IndexStmt stmt, NonZeroAnalyzerResult& res); + } #endif diff --git a/include/taco/lower/iterator.h b/include/taco/lower/iterator.h index a838505f6..9886e3900 100644 --- a/include/taco/lower/iterator.h +++ b/include/taco/lower/iterator.h @@ -217,6 +217,11 @@ class Iterator : public util::Comparable { /// backing the index set. Iterator getIndexSetIterator() const; + // getTrackingIterator returns the iterator that the current iterator is tracking. This + // is used to implement an optimization for when the non-zero structure of the output + // is the same as one of the input tensors. + Iterator getTrackingIterator() const; + friend bool operator==(const Iterator&, const Iterator&); friend bool operator<(const Iterator&, const Iterator&); friend std::ostream& operator<<(std::ostream&, const Iterator&); diff --git a/include/taco/lower/lower.h b/include/taco/lower/lower.h index aab025394..b7053612a 100644 --- a/include/taco/lower/lower.h +++ b/include/taco/lower/lower.h @@ -46,6 +46,7 @@ class Lowerer { /// parts of a concrete index notation statement. ir::Stmt lower(IndexStmt stmt, std::string functionName, bool assemble=true, bool compute=true, bool pack=false, bool unpack=false, + bool enablePreserveNonZeros=false, Lowerer lowerer=Lowerer()); /// Check whether the an index statement can be lowered to C code. 
If the diff --git a/include/taco/lower/lowerer_impl.h b/include/taco/lower/lowerer_impl.h index f5003b5ee..95f80b9b0 100644 --- a/include/taco/lower/lowerer_impl.h +++ b/include/taco/lower/lowerer_impl.h @@ -56,7 +56,7 @@ class LowererImpl : public util::Uncopyable { /// Lower an index statement to an IR function. virtual ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack) = 0; + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros) = 0; protected: diff --git a/include/taco/lower/lowerer_impl_imperative.h b/include/taco/lower/lowerer_impl_imperative.h index fa97e3cd9..08e6d9065 100644 --- a/include/taco/lower/lowerer_impl_imperative.h +++ b/include/taco/lower/lowerer_impl_imperative.h @@ -56,7 +56,7 @@ class LowererImplImperative : public LowererImpl { /// Lower an index statement to an IR function. ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack); + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros=false); protected: @@ -501,6 +501,7 @@ class LowererImplImperative : public LowererImpl { private: bool assemble; bool compute; + bool enablePreserveNonZeros = false; bool loopOrderAllowsShortCircuit = false; std::set needCompute; @@ -600,6 +601,13 @@ class LowererImplImperative : public LowererImpl { friend class Visitor; std::shared_ptr visitor; + // These two fields maintain information about if the optimization + // to write into sparse outputs with the same non-zero structure + // as the input tensor is enabled. + bool preservesNonZeros = false; + NonZeroAnalyzerResult nonZeroAnalyzerResult; + + }; } diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 79f91b8db..683bbabc1 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -479,6 +479,8 @@ class TensorBase { /* --- Friend Functions --- */ /// True iff two tensors have the same type and the same values. 
friend bool equals(const TensorBase&, const TensorBase&); + friend bool equalsInt64(const TensorBase&, const TensorBase&); + /// True iff two TensorBase objects refer to the same tensor (TensorBase /// and Tensor objects are references to tensors). @@ -501,6 +503,7 @@ class TensorBase { std::vector getDependentTensors(); void setNeedsCompute(bool needsCompute); void setNeedsAssemble(bool needsAssemble); + void setPreserveNonZero(bool preserveNonZero); private: static std::shared_ptr getHelperFunctions( const Format& format, Datatype ctype, const std::vector& dimensions); @@ -916,6 +919,7 @@ struct TensorBase::Content { bool needsCompile; bool needsAssemble; bool needsCompute; + bool preserveNonZero; std::vector> dependentTensors; unsigned int uniqueId; @@ -1338,5 +1342,64 @@ void taco_set_num_threads(int num_threads); /// computations. This will be replaced by a scheduling language in the future. int taco_get_num_threads(); + +// copyNonZeroStructure copies the non-zero structure of the src tensor into a new +// tensor, but does not copy the values. This method is intended to be used in the case +// where the result tensor of a computation has a sparse output with non-zero structure +// identical to an input tensor's non-zero structure. +template +Tensor copyNonZeroStructure(std::vector resDims, Format format, Tensor src, int srcLevels) { + + // Double check that the result format is a prefix of the source format. 
+ taco_uassert(src.getFormat().getOrder() >= srcLevels); + taco_uassert(format.getOrder() >= src.getFormat().getOrder()); + + for (size_t i = 0; i < (size_t)srcLevels; i++) { + taco_uassert(resDims[i] == src.getDimensions()[i]); + taco_uassert(format.getModeFormats()[i] == src.getFormat().getModeFormats()[i]); + } + + Tensor result(resDims, format); + auto srcIndex = src.getStorage().getIndex(); + auto rFormat = format.getModeFormats(); + std::vector resModeIndex; + for (size_t level = 0; level < (size_t)srcLevels; level++) { + auto srcModeIndex = srcIndex.getModeIndex(level); + resModeIndex.push_back(srcModeIndex); + } + + // Need to keep track of new size starting with size of previous crd array + int denseSize = srcIndex.getModeIndex(srcLevels - 1).getIndexArray(1).getSize(); + for (size_t level = srcLevels; level < (size_t) format.getOrder(); level++) { + taco_uassert(rFormat[level] == taco::Dense) << "Can only have dense levels after sparse levels"; + Type rType = result.getTensorVar().getType(); + denseSize = denseSize * resDims[level]; + int sizeArr [1] = {resDims[level]}; + auto arr = makeArray((int*)sizeArr, (size_t)1); + std::vector denseArr; + denseArr.push_back(arr); + ModeIndex rMI(denseArr); + resModeIndex.push_back(rMI); + } + + // TODO (owhsu): Need to figure out why the dense arrays added afterwards + // don't seem to be correct (i.e. index l for TTM) + // even though when the Array data (above) is printed here, they seem correct. + Index resultIndex(format, resModeIndex); + result.getStorage().setIndex(resultIndex); + + // Perform a similar operation as above but for the values array. + // However, we only need to construct the values, not copy anything into them. 
+ auto srcVals = src.getStorage().getValues(); + auto srcType = srcVals.getType(); + auto srcSize = denseSize; + // Array(Datatype type, void* data, size_t size, Policy policy=Free); + Array resVals = makeArray(srcType, srcSize); + result.getStorage().setValues(resVals); + return result; +} + +// TODO (owhsu): See if we need a function to copy the nonzero structure + } #endif diff --git a/src/index_notation/index_notation.cpp b/src/index_notation/index_notation.cpp index f28b60228..f819dd3be 100644 --- a/src/index_notation/index_notation.cpp +++ b/src/index_notation/index_notation.cpp @@ -22,6 +22,7 @@ #include "taco/index_notation/index_notation_rewriter.h" #include "taco/index_notation/index_notation_printer.h" #include "taco/ir/ir.h" +#include "taco/lower/lower.h" #include "taco/codegen/module.h" #include "taco/tensor.h" @@ -32,6 +33,8 @@ #include "taco/util/collections.h" #include "taco/util/functions.h" #include "taco/util/env.h" +#include "taco/lower/mode_format_dense.h" +#include "taco/lower/mode_format_compressed.h" using namespace std; @@ -4198,4 +4201,105 @@ IndexStmt generatePackCOOStmt(TensorVar tensor, return generatePackStmt(tensor, tensorName + "_COO", bufferFormat, indexVars, otherIsOnRight); } + +bool preservesNonZeroStructure(IndexStmt stmt, NonZeroAnalyzerResult& res) { + // TODO (rohany): Handle when the statement can contain workspaces. + + // We have to use a unique_ptr here to get around the overloaded operator= + // on Access types. + // std::cout << stmt << std::endl; + std::unique_ptr resultAccess = nullptr; + Where where = nullptr; + // First, let's find the output access. + match(stmt, std::function([&](const AssignmentNode* node){ + // There should only be one output access. + + taco_iassert(!resultAccess); + resultAccess = std::make_unique(node->lhs); + }), + std::function([&](const WhereNode* node, Matcher* ctx){ + // FIXME (owhsu): Patch to handle where statements with workspaces. 
+ // Only need to go to the consumer side since that will + // get the outer-most output access + ctx->match(node->consumer); + })); + // Some expressions don't have assignments, and thus won't have an RHS to consider. + if (resultAccess == nullptr) { + return false; + } + + // Now, there can only be one non-dense tensor in the LHS. + std::vector sparseRHSTensors; + match(stmt, std::function([&](const AssignmentNode* node, Matcher* m) { + // Only visit the RHS to ensure that we count only RHS tensors. + m->match(node->rhs); + }), std::function([&](const AccessNode* node) { + bool allDense = true; + auto formats = node->tensorVar.getFormat().getModeFormats(); + for (size_t i = 0; i < formats.size(); i++) { + if (!formats[i].is()) { + allDense = false; + break; + } + } + if (!allDense) { + sparseRHSTensors.push_back(node); + } + })); + + // If there is more than one sparse tensor in the RHS, then the operation + // does not preserve non-zero structure. + // TODO (rohany): This is a little too strict. A more general policy would allow + // multiple sparse tensors in the RHS as long the merges occurring with those + // sparse tensors occur at a lower level than the + if (sparseRHSTensors.size() != 1) { + return false; + } + + Access inputAccess(sparseRHSTensors[0]); + // Finally, the result access must have the same formats and index + // variable accesses as the same sized prefix of the input. + // TODO (rohany): This is actually too restrictive of a check. This + // will matter more if we have any benchmarks that do this, but the + // real check is as follows: + // * Some prefix of result matches (in variables and formats) to input. + // * After that prefix, the remaining levels of result must be dense. + // This check allows for the following formats: Result({Sparse, Dense}), + // Input({Sparse}). 
In this case, the non-zero structure of result is + // in fact determined by the input, and the remaining dimensions of result + // are dense so they don't matter when considering non-zero structure. + // For reference on how to implement something like this, see + // https://github.com/tensor-compiler/taco/compare/parallel-sparse-results. + auto rVars = resultAccess->getIndexVars(); + auto iVars = inputAccess.getIndexVars(); + auto rFormat = resultAccess->getTensorVar().getFormat().getModeFormats(); + auto iFormat = inputAccess.getTensorVar().getFormat().getModeFormats(); + + // The result tensor must have dimension <= the input dimension. + if (rVars.size() > iVars.size()) { + return false; + } + + for (size_t i = 0; i < rVars.size(); i++) { + if (rVars[i] != iVars[i]) return false; + // The mode formats don't have dynamic type tags right now, so + // this is all that we can do. We can't just check if they are + // exactly equal, because the RectCompressedModeFormats are + // parametrized on their dimensionality. + if (rFormat[i].is()) { + if (!iFormat[i].is()) return false; + } else if (rFormat[i].is()) { + if (!iFormat[i].is()) return false; + } else { + return false; + } + } + + // At this point we are sure that the input statement + // has the same output non-zero pattern as the input, + // so populate the output and return true. 
+ res = NonZeroAnalyzerResult(std::move(resultAccess), std::make_unique(inputAccess)); + return true; +} + } diff --git a/src/index_notation/transformations.cpp b/src/index_notation/transformations.cpp index cb5437322..1598473aa 100644 --- a/src/index_notation/transformations.cpp +++ b/src/index_notation/transformations.cpp @@ -802,6 +802,7 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { set reductionIndexVars; set parentParallelUnits; std::string reason = ""; + bool preservesNonZeroOutputStructure = false; IndexStmt rewriteParallel(IndexStmt stmt) { provGraph = ProvenanceGraph(stmt); @@ -824,6 +825,9 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { assembledByUngroupedInsert.push_back(tensorVars[result]); } + NonZeroAnalyzerResult res; + this->preservesNonZeroOutputStructure = preservesNonZeroStructure(stmt, res); + return rewrite(stmt); } @@ -874,27 +878,29 @@ IndexStmt Parallelize::apply(IndexStmt stmt, std::string* reason) const { iterators, provGraph, definedIndexVars); - // Precondition 3: Every result iterator must have insert capability - for (Iterator iterator : underivedLattice.results()) { - if (util::contains(assembledByUngroupedInsert, iterator.getTensor())) { - for (Iterator it = iterator; !it.isRoot(); it = it.getParent()) { - if (it.hasInsertCoord() || !it.isYieldPosPure()) { - reason = "Precondition failed: The output tensor does not " - "support parallelized inserts"; - return; - } - } - } else { - while (true) { - if (!iterator.hasInsert()) { - reason = "Precondition failed: The output tensor must support " - "inserts"; - return; + // Precondition 3: Every result iterator must have insert capability, + // or the result tensor preserves the same non-zero structure as the input. 
+ if (!this->preservesNonZeroOutputStructure) { + for (Iterator iterator : lattice.results()) { + if (util::contains(assembledByUngroupedInsert, iterator.getTensor())) { + for (Iterator it = iterator; !it.isRoot(); it = it.getParent()) { + if (it.hasInsertCoord() || !it.isYieldPosPure()) { + reason = "Precondition failed: The output tensor does not " + "support parallelized inserts"; + return; + } } - if (iterator.isLeaf()) { - break; + } else { + while (true) { + if (!iterator.hasInsert()) { + reason = "Precondition failed: The output tensor must allow inserts"; + return; + } + if (iterator.isLeaf()) { + break; + } + iterator = iterator.getChild(); } - iterator = iterator.getChild(); } } } diff --git a/src/lower/iterator.cpp b/src/lower/iterator.cpp index a3f8cface..51058dd14 100644 --- a/src/lower/iterator.cpp +++ b/src/lower/iterator.cpp @@ -44,8 +44,11 @@ struct Iterator::Content { Window(ir::Expr _lo, ir::Expr _hi, ir::Expr _stride, ir::Expr _windowVar) : windowVar(_windowVar), lo(_lo), hi(_hi), stride(_stride) {}; }; - std::unique_ptr window; + std::shared_ptr window; Iterator indexSetIterator; + + // The iterator that this iterator is tracking, if any + Iterator trackingIterator; }; Iterator::Iterator() : content(nullptr) { @@ -452,7 +455,7 @@ bool Iterator::isStrided() const { void Iterator::setWindowBounds(ir::Expr lo, ir::Expr hi, ir::Expr stride) { auto windowVarName = this->getIndexVar().getName() + this->getMode().getName() + "_window"; auto wvar = ir::Var::make(windowVarName, Int()); - this->content->window = std::make_unique(Content::Window(lo, hi, stride, wvar)); + this->content->window = std::make_shared(Content::Window(lo, hi, stride, wvar)); } bool Iterator::hasIndexSet() const { @@ -467,6 +470,10 @@ void Iterator::setIndexSetIterator(Iterator iter) { this->content->indexSetIterator = iter; } +Iterator Iterator::getTrackingIterator() const { + return this->content->trackingIterator; +} + bool operator==(const Iterator& a, const Iterator& b) { if 
(a.isDimensionIterator() && b.isDimensionIterator()) { return a.getIndexVar() == b.getIndexVar(); @@ -521,6 +528,12 @@ Iterators::Iterators(IndexStmt stmt, const map& tensorVars) ProvenanceGraph provGraph = ProvenanceGraph(stmt); set underivedAdded; set computeVars; + + // Figure out whether we are able to get away with keeping the non-zero + // structure of the input for the output tensor. + NonZeroAnalyzerResult nonZeroResult; + auto preservesNonZeros = preservesNonZeroStructure(stmt, nonZeroResult); + // Create dimension iterators match(stmt, function([&](auto n, auto m) { @@ -562,6 +575,23 @@ Iterators::Iterators(IndexStmt stmt, const map& tensorVars) }) ); + if (preservesNonZeros) { + // If our output preserves the non-zero structure of the input + // tensor, then we'll modify the result iterator to track variables + // of the input tensor that control iteration. However, information + // about the mode etc will be retained. + auto resultTV = nonZeroResult.resultAccess->getTensorVar(); + auto inputTV = nonZeroResult.inputAccess->getTensorVar(); + + for (int i = 0; i < resultTV.getOrder(); i++) { + auto resultIter = this->content->levelIterators[{*nonZeroResult.resultAccess, i+1}]; + auto inputIter = this->content->levelIterators[{*nonZeroResult.inputAccess, i+1}]; + taco_iassert(resultIter.defined()); + taco_iassert(inputIter.defined()); + resultIter.content->trackingIterator = inputIter; + } + } + // Reverse the levelIterators map for fast modeAccess lookup for (auto& iterator : content->levelIterators) { content->modeAccesses.insert({iterator.second, iterator.first}); @@ -582,6 +612,33 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI int level = 1; ModeFormat parentModeType; + + // (owhsu) TODO: See if this needs to be uncommented out + // From rohany patch for nonzero structure support +// // This logic is for setting up iterators for pos splits. 
In a fused position split, +// // the original logic assigned all level iterators to have the index variable of the +// // fused pos split. This logic seems incorrect. Instead, having only the deepest level +// // that the pos split occurred use that index variable seemed like the right thing to +// // satisfy how the code was used in lowerForallFusedPosition. This block attempts to +// // find the deepest match for the position split in the coordinate tree of the tensor. +// int deepestPosMatch = -1; +// { +// int counter = 0; +// for (auto modeTypePack : format.getModeFormatPacks()) { +// taco_iassert(modeTypePack.getModeFormats().size() == 1); +// for (auto& modeType : modeTypePack.getModeFormats()) { +// int modeNumber = format.getModeOrdering()[counter]; +// IndexVar indexVar = access.getIndexVars()[modeNumber]; +// IndexVar iteratorIndexVar; +// if (provGraph.getPosIteratorDescendant(indexVar, &iteratorIndexVar) && +// provGraph.isPosOfAccess(iteratorIndexVar, access)) { +// deepestPosMatch = std::max(deepestPosMatch, modeNumber); +// } +// } +// counter++; +// } +// } + for (ModeFormatPack modeTypePack : format.getModeFormatPacks()) { vector arrays; taco_iassert(modeTypePack.getModeFormats().size() > 0); @@ -604,6 +661,15 @@ void Iterators::createAccessIterators(Access access, Format format, Expr tensorI else if (!provGraph.isPosOfAccess(iteratorIndexVar, access)) { // want to iterate across level as a position variable if has irregular descendant, but otherwise iterate normally iteratorIndexVar = indexVar; + + // TODO (owhsu): See if this needs to be uncommented out + // From rohany support sparse output nonzero structure preserved +// IndexVar posFuseVar; +// IndexVar iteratorIndexVar = indexVar; +// // As described above, only set the deepest position split match to be the fused iterator. 
+// if (provGraph.getPosIteratorDescendant(indexVar, &posFuseVar) && +// provGraph.isPosOfAccess(posFuseVar, access) && modeNumber == deepestPosMatch) { +// iteratorIndexVar = posFuseVar; } Mode mode(tensorIR, dim, level, modeType, modePack, pos, parentModeType); diff --git a/src/lower/lower.cpp b/src/lower/lower.cpp index 511dcb442..d7ad6bbfd 100644 --- a/src/lower/lower.cpp +++ b/src/lower/lower.cpp @@ -45,13 +45,13 @@ std::shared_ptr Lowerer::getLowererImpl() { } ir::Stmt lower(IndexStmt stmt, std::string name, - bool assemble, bool compute, bool pack, bool unpack, + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros, Lowerer lowerer) { string reason; taco_iassert(isLowerable(stmt, &reason)) << "Not lowerable, because " << reason << ": " << stmt; - ir::Stmt lowered = lowerer.getLowererImpl()->lower(stmt, name, assemble, compute, pack, unpack); + ir::Stmt lowered = lowerer.getLowererImpl()->lower(stmt, name, assemble, compute, pack, unpack, enablePreserveNonZeros); // TODO: re-enable this // std::string messages; diff --git a/src/lower/lowerer_impl_imperative.cpp b/src/lower/lowerer_impl_imperative.cpp index 6a3baadb3..a7c21c95e 100644 --- a/src/lower/lowerer_impl_imperative.cpp +++ b/src/lower/lowerer_impl_imperative.cpp @@ -232,10 +232,11 @@ static std::set hasSparseInserts(IndexStmt stmt, Iterators iterators, Stmt LowererImplImperative::lower(IndexStmt stmt, string name, - bool assemble, bool compute, bool pack, bool unpack) + bool assemble, bool compute, bool pack, bool unpack, bool enablePreserveNonZeros) { this->assemble = assemble; this->compute = compute; + this->enablePreserveNonZeros = enablePreserveNonZeros; definedIndexVarsOrdered = {}; definedIndexVars = {}; loopOrderAllowsShortCircuit = allForFreeLoopsBeforeAllReductionLoops(stmt); @@ -297,6 +298,14 @@ LowererImplImperative::lower(IndexStmt stmt, string name, // Create variables for keeping track of result values array capacity createCapacityVars(resultVars, 
&capacityVars); + // Figure out whether we can preserve the non-zero structure + // of an input tensor for the result tensor. However, to ensure + // that we don't break normal TACO usage, we'll only do this + // when enabled. + if (this->enablePreserveNonZeros) { + this->preservesNonZeros = preservesNonZeroStructure(stmt, this->nonZeroAnalyzerResult); + } + // Create iterators iterators = Iterators(stmt, tensorVars); @@ -2075,14 +2084,48 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, const set& reducedAccesses, MergeStrategy mergeStrategy) { - // Inserter positions - Stmt declInserterPosVars = declLocatePosVars(inserters); + // If we're performing the optimization that allows for the non-zero structure + // of an input tensor to be maintained in the result tensor, then we need to + // do some cleanup here before dealing with appenders and inserters. In particular, + // the iterator for the result tensor will be an appender (as it is sparse), but + // will not quite be a inserter or locator, as it is tracking the variables from + // the input tensor it is aligned with. Therefore, we will just ignore generating + // appender related code for the result tensor in this case. + Iterator resultNonZeroIter; + if (this->preservesNonZeros) { + std::vector realAppenders; + for (auto &iter : appenders) { + if (iter.getTensor() != this->tensorVars[this->nonZeroAnalyzerResult.resultAccess->getTensorVar()]) { + realAppenders.push_back(iter); + } else { + resultNonZeroIter = iter; + } + } + appenders = realAppenders; + } + + // There can be overlaps between the inserters and locators, which results in + // duplicate emitting of variable declarations. We'll fix that here. 
+ std::vector itersWithLocators; + for (auto it : inserters) { + if (!util::contains(itersWithLocators, it)) { itersWithLocators.push_back(it); } + } + for (auto it : locators) { + if (!util::contains(itersWithLocators, it)) { itersWithLocators.push_back(it); } + } + auto declPosVars = declLocatePosVars(itersWithLocators); - // Locate positions - Stmt declLocatorPosVars = declLocatePosVars(locators); + Stmt trackPosVars; + if (this->preservesNonZeros && resultNonZeroIter.defined()) { + if (resultNonZeroIter.hasPosIter()) { + auto tracking = resultNonZeroIter.getTrackingIterator(); + taco_iassert(tracking.defined()); + trackPosVars = ir::VarDecl::make(resultNonZeroIter.getPosVar(), tracking.getPosVar()); + } + } if (captureNextLocatePos) { - capturedLocatePos = Block::make(declInserterPosVars, declLocatorPosVars); + capturedLocatePos = declPosVars; captureNextLocatePos = false; } @@ -2111,7 +2154,7 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, append(stmts, loweredCases); Stmt body = Block::make(stmts); - return Block::make(declInserterPosVars, declLocatorPosVars, body); + return Block::make(declPosVars, trackPosVars, body); } Stmt initVals = resizeAndInitValues(appenders, reducedAccesses); @@ -2137,8 +2180,8 @@ Stmt LowererImplImperative::lowerForallBody(Expr coordinate, IndexStmt stmt, // TODO: Emit code to insert coordinates return Block::make(initVals, - declInserterPosVars, - declLocatorPosVars, + declPosVars, + trackPosVars, body, appendCoords, incr); diff --git a/src/tensor.cpp b/src/tensor.cpp index 9588bf224..1d83a724c 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -60,7 +60,7 @@ TensorBase::TensorBase(std::string name, Datatype ctype) : TensorBase(name, ctype, {}, Format(), Literal::zero(ctype)) { } -TensorBase::TensorBase(Datatype ctype, vector dimensions, +TensorBase::TensorBase(Datatype ctype, vector dimensions, ModeFormat modeType, Literal fill) : TensorBase(util::uniqueName('A'), ctype, dimensions, 
std::vector(dimensions.size(), modeType), fill) { @@ -225,6 +225,10 @@ void TensorBase::setNeedsCompute(bool needsCompute) { content->needsCompute = needsCompute; } +void TensorBase::setPreserveNonZero(bool preserveNonZero) { + content->preserveNonZero = preserveNonZero; +} + bool TensorBase::neverPacked() { return content->neverPacked; } @@ -672,13 +676,21 @@ void TensorBase::compile(taco::IndexStmt stmt, bool assembleWhileCompute) { } } - content->assembleFunc = lower(stmtToCompile, "assemble", true, false); - content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true); + if (content->preserveNonZero) { + content->assembleFunc = lower(stmtToCompile, "assemble", true, false, false, + false, true); + content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true, + false, false, true); + } else { + content->assembleFunc = lower(stmtToCompile, "assemble", true, false); + content->computeFunc = lower(stmtToCompile, "compute", assembleWhileCompute, true); + } // If we have to recompile the kernel, we need to create a new Module. Since // the module we are holding on to could have been retrieved from the cache, // we can't modify it. 
content->module = make_shared(); - content->module->addFunction(content->assembleFunc); + if (!content->preserveNonZero) + content->module->addFunction(content->assembleFunc); content->module->addFunction(content->computeFunc); content->module->compile(); cacheComputeKernel(concretizedAssign, content->module); @@ -1064,6 +1076,55 @@ bool scalarEquals(std::complex a, std::complex b) { return true; } +template +bool equalsTypedInt64(const TensorBase& a, const TensorBase& b) { + auto at = iterate(a); + auto bt = iterate(b); + auto ait = at.template beginTyped(); + auto bit = bt.template beginTyped(); + + while (ait != at.template endTyped() && bit != bt.template endTyped()) { + auto acoord = ait->first; + auto bcoord = bit->first; + auto aval = ait->second; + auto bval = bit->second; + + if (acoord != bcoord) { + if (isZero(aval)) { + ++ait; + continue; + } + else if (isZero(bval)) { + ++bit; + continue; + } + + return false; + } + if (!scalarEquals(aval, bval)) { + return false; + } + + ++ait; + ++bit; + } + while (ait != at.template endTyped()) { + auto aval = ait->second; + if (!isZero(aval)) { + return false; + } + ++ait; + } + while (bit != bt.template endTyped()) { + auto bval = bit->second; + if (!isZero(bval)) { + return false; + } + ++bit; + } + return (ait == at.template endTyped() && bit == bt.template endTyped()); +} + template bool equalsTyped(const TensorBase& a, const TensorBase& b) { auto at = iterate(a); @@ -1113,6 +1174,52 @@ bool equalsTyped(const TensorBase& a, const TensorBase& b) { return (ait == at.end() && bit == bt.end()); } +bool equalsInt64(const TensorBase& a, const TensorBase& b) { + // Component type must be the same + if (a.getComponentType() != b.getComponentType()) { + return false; + } + + // Fill values must be the same + if (!equals(a.getFillValue(), b.getFillValue())) { + return false; + } + + // Orders must be the same + if (a.getOrder() != b.getOrder()) { + return false; + } + + // Dimensions must be the same + for (int mode = 0; 
mode < a.getOrder(); mode++) { + if (a.getDimension(mode) != b.getDimension(mode)) { + return false; + } + } + + // Values must be the same + switch(a.getComponentType().getKind()) { + case Datatype::Bool: taco_ierror; return false; + case Datatype::UInt8: return equalsTypedInt64(a, b); + case Datatype::UInt16: return equalsTypedInt64(a, b); + case Datatype::UInt32: return equalsTypedInt64(a, b); + case Datatype::UInt64: return equalsTypedInt64(a, b); + case Datatype::UInt128: return equalsTypedInt64(a, b); + case Datatype::Int8: return equalsTypedInt64(a, b); + case Datatype::Int16: return equalsTypedInt64(a, b); + case Datatype::Int32: return equalsTypedInt64(a, b); + case Datatype::Int64: return equalsTypedInt64(a, b); + case Datatype::Int128: return equalsTypedInt64(a, b); + case Datatype::Float32: return equalsTypedInt64(a, b); + case Datatype::Float64: return equalsTypedInt64(a, b); + case Datatype::Complex64: return equalsTypedInt64>(a, b); + case Datatype::Complex128: return equalsTypedInt64>(a, b); + case Datatype::Undefined: taco_ierror << "Undefined data type"; + } + taco_unreachable; + return false; +} + bool equals(const TensorBase& a, const TensorBase& b) { // Component type must be the same if (a.getComponentType() != b.getComponentType()) { diff --git a/test/test.cpp b/test/test.cpp index a49f10ff7..f61381c11 100644 --- a/test/test.cpp +++ b/test/test.cpp @@ -32,6 +32,12 @@ void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual) { ASSERT_TRUE(equals(expected, actual)); } +void ASSERT_TENSOR_EQ_INT64(TensorBase expected, TensorBase actual) { + SCOPED_TRACE(string("expected: ") + util::toString(expected)); + SCOPED_TRACE(string(" actual: ") + util::toString(actual)); + ASSERT_TRUE(equalsInt64(expected, actual)); +} + void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual) { SCOPED_TRACE(string("expected: ") + util::toString(expected)); SCOPED_TRACE(string(" actual: ") + util::toString(actual)); diff --git a/test/test.h b/test/test.h 
index 3302bf81f..d19452ccc 100644 --- a/test/test.h +++ b/test/test.h @@ -61,6 +61,7 @@ void ASSERT_VECTOR_EQ(std::vector expected, void ASSERT_STORAGE_EQ(TensorStorage expected, TensorStorage actual); void ASSERT_TENSOR_EQ(TensorBase expected, TensorBase actual); +void ASSERT_TENSOR_EQ_INT64(TensorBase expected, TensorBase actual); template void ASSERT_COMPONENTS_EQUALS(vector>> expectedIndices, diff --git a/test/tests-scheduling-eval.cpp b/test/tests-scheduling-eval.cpp index a16ab11f9..40e123f5f 100644 --- a/test/tests-scheduling-eval.cpp +++ b/test/tests-scheduling-eval.cpp @@ -2249,3 +2249,107 @@ TEST(scheduling_eval, DISABLED_bfsPullScheduled) { expected.compute(); ASSERT_TENSOR_EQ(expected, y); } + +TEST(hypermapper, TTM_unsched) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 4; + int NUM_J = 4; + int NUM_K = 4; + int NUM_L = 4; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (i))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (l))); + } + } + + B.pack(); + C.pack(); + + Tensor result("result",{NUM_I, NUM_J, NUM_L}, {Sparse, Sparse, Dense}); + result(i,j,l) = B(i,j,k) * C(k,l); + result.setPreserveNonZero(true); + result.compile(); + result.assemble(); + result.compute(); + + std::cout << "RESULT: " << result << std::endl; + + A(i,j,l) = B(i,j,k) * C(k, l); + A.compile(); + A.assemble(); + A.compute(); + std::cout << "A: " << A << std::endl; + ASSERT_TENSOR_EQ_INT64(A, result); 
+} + +TEST(hypermapper, TTM_unsched_copy) { + if (should_use_CUDA_codegen()) { + return; + } + int NUM_I = 4; + int NUM_J = 4; + int NUM_K = 4; + int NUM_L = 4; + float SPARSITY = .1; + Tensor A("A", {NUM_I, NUM_J, NUM_L}, {Dense, Dense, Dense}); // TODO: change to sparse outputs + Tensor B("B", {NUM_I, NUM_J, NUM_K}, {Sparse, Sparse, Sparse}); + Tensor C("C", {NUM_K, NUM_L}, {Dense, Dense}); + + srand(935); + for (int i = 0; i < NUM_I; i++) { + for (int j = 0; j < NUM_J; j++) { + for (int k = 0; k < NUM_K; k++) { + float rand_float = (float) rand() / (float) (RAND_MAX); + if (rand_float < SPARSITY) { + B.insert({i, j, k}, (double) ((int) (i))); + } + } + } + } + + for (int k = 0; k < NUM_K; k++) { + for (int l = 0; l < NUM_L; l++) { + float rand_float = (float)rand()/(float)(RAND_MAX); + C.insert({k, l}, (double) ((int) (l))); + } + } + + B.pack(); + C.pack(); + + std::vector dims = {NUM_I, NUM_J, NUM_L}; + Tensor result = copyNonZeroStructure(dims, {Sparse, Sparse, Dense}, B, 2); + result(i,j,l) = B(i,j,k) * C(k,l); + result.setPreserveNonZero(true); + result.compile(); + result.compute(); + + std::cout << "RESULT: " << result << std::endl; + + A(i,j,l) = B(i,j,k) * C(k, l); + A.compile(); + A.assemble(); + A.compute(); + std::cout << "A: " << A << std::endl; + ASSERT_TENSOR_EQ_INT64(A, result); +} \ No newline at end of file From 54a31fa52a07a5d875e951484f70ad05eb54d4eb Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 13:14:11 -0800 Subject: [PATCH 2/8] Add in preserveNonZeros for TTV --- apps/taco_dse/hypermapper_taco_client.cpp | 6 ++++-- apps/taco_dse/taco_helper.h | 14 +++++++++----- include/taco/tensor.h | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 25ebf7d11..099a50b76 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -899,7 +899,8 @@ HMObjective 
calculateObjectiveTTVDense(std::vector &InputPar int temp_chunk_size_k = 1; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); - ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, temp_loop_ordering, temp_omp_scheduling_type, temp_omp_chunk_size, temp_omp_num_threads, false); + ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, + temp_loop_ordering, temp_omp_scheduling_type, temp_omp_chunk_size, temp_omp_num_threads, false, 5); ttv_handler->set_cold_run(); default_config_time = ttv_handler->get_compute_time(); @@ -908,7 +909,8 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar if(!no_sched_init) { try{ - ttv_handler->schedule_and_compute(temp_result, chunk_size_i, chunk_size_fpos, chunk_size_k, loop_ordering, omp_scheduling_type, omp_chunk_size, omp_num_threads, false); + ttv_handler->schedule_and_compute(temp_result, chunk_size_i, chunk_size_fpos, chunk_size_k, + loop_ordering, omp_scheduling_type, omp_chunk_size, omp_num_threads, false, 5); ttv_handler->set_cold_run(); double compute_time = ttv_handler->get_compute_time(); Obj.compute_time = compute_time; diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 0cdc63b3e..7855aebf7 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1379,11 +1379,14 @@ class TTV : public tacoOp { int get_num_j() { return NUM_J; } double compute_unscheduled() { - taco::Tensor result({NUM_I, NUM_J}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J}, {taco::Sparse, taco::Sparse}, B, 2); result(i, j) = B(i, j, k) * c(k); taco::util::Timer timer; + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); + result.setAssembleWhileCompute(false); result.compile(); - result.assemble(); + //result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1430,7 +1433,7 @@ class TTV : 
public tacoOp { void schedule_and_compute(taco::Tensor &result_, int chunk_size_i, int chunk_size_fpos, int chunk_size_k, std::vector order, int omp_scheduling_type=0, int omp_chunk_size=0, int num_threads=32, bool default_config=false, int num_reps=10) { - taco::Tensor result("result", {NUM_I, NUM_J}, taco::dense); + taco::Tensor result = copyNonZeroStructure({NUM_I, NUM_J}, {taco::Sparse, taco::Sparse}, B, 2); result(i, j) = B(i, j, k) * c(k); // std::cout << "Elements: " << std::endl; @@ -1456,9 +1459,10 @@ class TTV : public tacoOp { taco::util::Timer timer; std::vector compute_times; timer.clear_cache(); + result.setPreserveNonZero(true); + result.setNeedsAssemble(false); result.compile(sched); - result.setNeedsAssemble(true); - result.assemble(); +// result.assemble(); for(int i = 0; i < num_reps; i++) { timer.start(); result.setNeedsCompute(true); diff --git a/include/taco/tensor.h b/include/taco/tensor.h index 683bbabc1..bb8b8e12d 100644 --- a/include/taco/tensor.h +++ b/include/taco/tensor.h @@ -1352,7 +1352,7 @@ Tensor copyNonZeroStructure(std::vector resDims, Format format, Tensor= srcLevels); - taco_uassert(format.getOrder() >= src.getFormat().getOrder()); + taco_uassert(format.getOrder() >= srcLevels); for (size_t i = 0; i < (size_t)srcLevels; i++) { taco_uassert(resDims[i] == src.getDimensions()[i]); From 74579a2066416d7d59a7b39797d7bb0d0d16d6e7 Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Sun, 26 Feb 2023 13:41:14 -0800 Subject: [PATCH 3/8] Add in changes to TTV to get the schedule faster --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- apps/taco_dse/taco_helper.h | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 099a50b76..595a122c7 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -896,7 +896,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar int 
temp_omp_chunk_size = 1; int temp_chunk_size_i = 1; int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 1; + int temp_chunk_size_k = 16; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 7855aebf7..41215eeb3 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1272,7 +1272,7 @@ class TTV : public tacoOp { taco::Tensor B; taco::Tensor c; taco::IndexStmt stmt; - taco::IndexVar f, fpos, chunk, fpos2, k1, k2, i0, i1; + taco::IndexVar f, fpos, chunk, fpos2, k1, k2, kpos, kpos1, kpos2, i0, i1; int run_mode, num_reps; TTV(int mode, int NUM_I = 1000, int NUM_J = 1000, int NUM_K = 1000, float SPARSITY = .3) : NUM_I{NUM_I}, NUM_J{NUM_J}, @@ -1288,7 +1288,8 @@ class TTV : public tacoOp { { } TTV() : run_mode(1), initialized{false}, cold_run{true}, - f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), k1("k1"), k2("k2"), i0("i0"), i1("i1") {} + f("f"), fpos("fpos"), chunk("chunk"), fpos2("fpos2"), k1("k1"), k2("k2"), i0("i0"), i1("i1"), + kpos("kpos"), kpos1("kpos1"), kpos2("kpos2"){} float get_sparsity() { return (run_mode == 0) ? 
SPARSITY : inputCache.get_sparsity(); } void set_cold_run() { cold_run = true; } void initialize_data(int mode = RANDOM) override @@ -1348,7 +1349,7 @@ class TTV : public tacoOp { B.pack(); c.pack(); - std::vector reorder_{i0, chunk, fpos2, k1, k2}; + std::vector reorder_{i0, chunk, fpos2, kpos1, kpos2}; compute_reordering(reorder_); // Avoid duplicate reinitialize initialized = true; @@ -1386,7 +1387,7 @@ class TTV : public tacoOp { result.setNeedsAssemble(false); result.setAssembleWhileCompute(false); result.compile(); - //result.assemble(); + // result.assemble(); timer.start(); result.compute(); timer.stop(); @@ -1413,9 +1414,12 @@ class TTV : public tacoOp { return sched.split(i, i0, i1, chunk_size_i).fuse(i1, j, f) .pos(f, fpos, B(i,j,k)) .split(fpos, chunk, fpos2, chunk_size_fpos) - .split(k, k1, k2, chunk_size_k) + .pos(k, kpos, B(i,j,k)) + .split(kpos, kpos1, kpos2, chunk_size_k) .reorder(reorder) - .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces); + .parallelize(chunk, ParallelUnit::CPUThread, OutputRaceStrategy::NoRaces) + .parallelize(kpos2, ParallelUnit::CPUVector, OutputRaceStrategy::ParallelReduction); + // return stmt.fuse(i, j, f) // .pos(f, fpos, B(i,j,k)) From dd4cf5fb5b6089973bf98b471692e9eaef3547ae Mon Sep 17 00:00:00 2001 From: Olivia Hsu Date: Tue, 28 Feb 2023 16:26:47 -0800 Subject: [PATCH 4/8] Add in fix to predictor value --- apps/taco_dse/hypermapper_taco_client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 595a122c7..08d264536 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -1336,7 +1336,7 @@ int main(int argc, char **argv) { int omp_chunk_size = program.get("--omp_chunk_size"); bool Predictor = false; if (test_name == "TTV" || test_name == "MTTKRP" || test_name == "TTM") { - Predictor = false; + Predictor = true; } std::string log_file_ = 
"hypermapper_taco_log.csv"; From adffc62c06ee7fa8f0834055e3746eaf924535ae Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 00:43:33 +0000 Subject: [PATCH 5/8] Update starting time for runs --- apps/taco_dse/hypermapper_taco_client.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 08d264536..938a8726f 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -874,7 +874,8 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar timer = ttv_handler->compute_unscheduled(); compute_times.push_back(timer); } - Obj.compute_time = median(compute_times); + // Obj.compute_time = median(compute_times); + no_sched_time = median(compute_times); no_sched_init = true; cout << "computed unscheduled" << endl; } @@ -896,7 +897,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar int temp_omp_chunk_size = 1; int temp_chunk_size_i = 1; int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 16; + int temp_chunk_size_k = 8; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, @@ -904,6 +905,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar ttv_handler->set_cold_run(); default_config_time = ttv_handler->get_compute_time(); + Obj.compute_time = default_config_time; logger << ttv_handler->get_num_i() << "," << ttv_handler->get_num_j() << "," << default_config_time << "," << no_sched_time << std::endl; } From b8b52ae0be05073643a1899399963ce8b9362ef4 Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 14:47:29 -0800 Subject: [PATCH 6/8] pushing randomized ttv --- apps/taco_dse/hypermapper_taco_client.cpp | 20 +++--- apps/taco_dse/taco_helper.h | 78 ++++++++++++++--- 2 files changed, 60 insertions(+), 38 deletions(-)
diff --git a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 938a8726f..5afcbd4ea 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -846,9 +846,9 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar std::vector loop_ordering = static_cast>*>(InputParams[6])->getVal(); std::vector default_ordering{0,1,2,3,4}; - int NUM_I = 10000; - int NUM_J = 10000; - int NUM_K = 1000; + int NUM_I = 1000; + int NUM_J = 100; + int NUM_K = 100; std::vector compute_times; @@ -858,7 +858,11 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar cout << "INITIALIZING" << endl; ttv_handler = new TTV(); ttv_handler->matrix_name = matrix_name; - ttv_handler->initialize_data(1); + ttv_handler->SPARSITY = 0.1; + ttv_handler->NUM_I = NUM_I; + ttv_handler->NUM_J = NUM_J; + ttv_handler->NUM_K = NUM_K; + ttv_handler->initialize_data(0); initialized = true; // sparsity = ttv_handler->get_sparsity(); num_i = ttv_handler->NUM_I; @@ -894,10 +898,10 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar // int temp_unroll_factor = 8; std::vector temp_loop_ordering{0,1,2,3,4}; int temp_omp_scheduling_type = 0; - int temp_omp_chunk_size = 1; - int temp_chunk_size_i = 1; - int temp_chunk_size_fpos = 1; - int temp_chunk_size_k = 8; + int temp_omp_chunk_size = 16; + int temp_chunk_size_i = 16; + int temp_chunk_size_fpos = 16; + int temp_chunk_size_k = 16; int temp_omp_num_threads = 32; // default_config_time = ttv_handler->get_default_compute_time(); ttv_handler->schedule_and_compute(temp_result, temp_chunk_size_i, temp_chunk_size_fpos, temp_chunk_size_k, diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 41215eeb3..287fe0eb6 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -170,7 +170,7 @@ struct UfuncInputCache { } template - taco::Tensor getTensor(std::string path, U format, bool countNNZ = false, float sparsity=0.3, int 
num_k = 1000, bool includeThird = false) { + taco::Tensor getTensor(std::string path, U format, bool shift_dim=false, bool countNNZ = false, float sparsity=0.3, int num_k = 1000, bool includeThird = false) { // See if the paths match. if (this->lastPath == path) { // TODO (rohany): Not worrying about whether the format was the same as what was asked for. @@ -191,6 +191,17 @@ struct UfuncInputCache { this->num_j = this->inputTensor.getDimensions()[1]; this->num_k = this->inputTensor.getDimensions()[2]; + int last_dim = 0; + if (shift_dim) { + last_dim = this->inputTensor.getDimensions()[3]; + } + + taco::Tensor copy("test", {this->num_i, this->num_k, last_dim}, taco::Sparse); + + // for (auto component : this->inputTensor) { + + // } + if (countNNZ) { this->nnz = 0; #ifdef TACO_DEFAULT_INTEGER_TYPE @@ -1302,39 +1313,46 @@ class TTV : public tacoOp { return; srand(9536); - // for (int i = 0; i < NUM_I; i++) - // { - // for (int j = 0; j < NUM_J; j++) - // { - // for (int k = 0; k < NUM_K; k++) - // { - // float rand_float = (float)rand() / (float)(RAND_MAX); - // if (rand_float < SPARSITY) - // { - // B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); - // } - // } - // } - // } - auto ssPath = std::getenv("FROST_PATH"); - if(ssPath == nullptr) { - std::cout << "Environment variable FROST_PATH not set\n"; + if (mode == RANDOM) { + taco::Tensor res("res", {NUM_I, NUM_J, NUM_K}, taco::Sparse); + B = res; + for (int i = 0; i < NUM_I; i++) + { + for (int j = 0; j < NUM_J; j++) + { + for (int k = 0; k < NUM_K; k++) + { + float rand_float = (float)rand() / (float)(RAND_MAX); + if (rand_float < SPARSITY) + { + B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); + } + } + } + } } - std::string ssPathStr = std::string(ssPath); - char sep = '/'; - std::string matrix_path; - if (ssPathStr[ssPathStr.length()] == sep) { - matrix_path = ssPathStr + matrix_name; - } else { - matrix_path = ssPathStr + "/" + matrix_name; - } + else { + auto ssPath = 
std::getenv("FROST_PATH"); + if(ssPath == nullptr) { + std::cout << "Environment variable FROST_PATH not set\n"; + } + std::string ssPathStr = std::string(ssPath); - B = inputCache.getTensor(matrix_path, Sparse, true); - NUM_I = inputCache.num_i; - NUM_J = inputCache.num_j; - NUM_K = inputCache.num_k; + char sep = '/'; + std::string matrix_path; + if (ssPathStr[ssPathStr.length()] == sep) { + matrix_path = ssPathStr + matrix_name; + } else { + matrix_path = ssPathStr + "/" + matrix_name; + } + + B = inputCache.getTensor(matrix_path, Sparse, true); + NUM_I = inputCache.num_i; + NUM_J = inputCache.num_j; + NUM_K = inputCache.num_k; + } std::cout << "Dimensions: " << NUM_I << ", " << NUM_J << ", " << NUM_K << std::endl; From 11ff6466a9ddcdff3aa4a6ba9c91dd69769cefc3 Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 14:56:53 -0800 Subject: [PATCH 7/8] adding changes --- apps/taco_dse/taco_helper.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/taco_dse/taco_helper.h b/apps/taco_dse/taco_helper.h index 287fe0eb6..47fe97b04 100644 --- a/apps/taco_dse/taco_helper.h +++ b/apps/taco_dse/taco_helper.h @@ -1314,6 +1314,7 @@ class TTV : public tacoOp { srand(9536); + int nnz = 0; if (mode == RANDOM) { taco::Tensor res("res", {NUM_I, NUM_J, NUM_K}, taco::Sparse); B = res; @@ -1327,6 +1328,7 @@ class TTV : public tacoOp { if (rand_float < SPARSITY) { B.insert({i, j, k}, (double)((int)(rand_float * 3 / SPARSITY))); + nnz++; } } } @@ -1355,6 +1357,7 @@ class TTV : public tacoOp { } std::cout << "Dimensions: " << NUM_I << ", " << NUM_J << ", " << NUM_K << std::endl; + std::cout << "NNZ: " << nnz << std::endl; taco::Tensor c_("c", {NUM_K}, taco::Format{taco::ModeFormat::Dense}); c = c_; From 2a7be92e22634e8bc3e46b7e996351d8e5d5087a Mon Sep 17 00:00:00 2001 From: lrubens Date: Wed, 1 Mar 2023 15:02:54 -0800 Subject: [PATCH 8/8] fix minor bug --- apps/taco_dse/hypermapper_taco_client.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/apps/taco_dse/hypermapper_taco_client.cpp b/apps/taco_dse/hypermapper_taco_client.cpp index 5afcbd4ea..0c0bdcbc1 100644 --- a/apps/taco_dse/hypermapper_taco_client.cpp +++ b/apps/taco_dse/hypermapper_taco_client.cpp @@ -887,6 +887,7 @@ HMObjective calculateObjectiveTTVDense(std::vector &InputPar //Initiate scheduling passing in chunk_size (param to optimize) bool default_config = (chunk_size_i == 16); bool valid = true; + Obj.valid = valid; compute_times = vector(); ttv_handler->set_cold_run();