diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4523040..a017baae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,7 @@ project(MACIS VERSION 0.1 LANGUAGES C CXX)
 option( MACIS_ENABLE_MPI    "Enable MPI Bindings"    ON )
 option( MACIS_ENABLE_OPENMP "Enable OpenMP Bindings" ON )
 option( MACIS_ENABLE_BOOST  "Enable Boost" ON )
+option( MACIS_ENABLE_TREXIO "Enable TrexIO" ON )
 
 # CMake Modules
 list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake )
diff --git a/include/macis/asci/alpha_constraint.hpp b/include/macis/asci/alpha_constraint.hpp
new file mode 100644
index 00000000..ae12ca3f
--- /dev/null
+++ b/include/macis/asci/alpha_constraint.hpp
@@ -0,0 +1,82 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#pragma once
+
+#include <macis/wfn/raw_bitset.hpp>
+namespace macis {
+
+template <typename WfnTraits>
+class alpha_constraint {
+ public:
+  using wfn_traits = WfnTraits;
+  using wfn_type = typename WfnTraits::wfn_type;
+  using spin_wfn_type = spin_wfn_t<wfn_type>;
+
+  using constraint_type = spin_wfn_type;
+  using constraint_traits = wavefunction_traits<spin_wfn_type>;
+
+ private:
+  constraint_type C_;
+  constraint_type B_;
+  uint32_t C_min_;
+  uint32_t count_;
+
+ public:
+  alpha_constraint(constraint_type C, constraint_type B, uint32_t C_min)
+      : C_(C), B_(B), C_min_(C_min), count_(constraint_traits::count(C)) {}
+
+  alpha_constraint(const alpha_constraint&) = default;
+  alpha_constraint& operator=(const alpha_constraint&) = default;
+
+  alpha_constraint(alpha_constraint&& other) noexcept = default;
+  alpha_constraint& operator=(alpha_constraint&&) noexcept = default;
+
+  inline auto C() const { return C_; }
+  inline auto B() const { return B_; }
+  inline auto C_min() const { return C_min_; }
+  inline auto count() const { return count_; }
+
+  inline spin_wfn_type c_mask_union(spin_wfn_type state) const {
+    return state & C_;
+  }
+  inline spin_wfn_type b_mask_union(spin_wfn_type state) const {
+    return state & B_;
+  }
+
+  inline spin_wfn_type symmetric_difference(spin_wfn_type state) const {
+    return state ^ C_;
+  }
+  // inline spin_wfn_type symmetric_difference(wfn_type state) const {
+  //   return symmetric_difference(wfn_traits::alpha_string(state));
+  // }
+
+  template <typename WfnType>
+  inline auto overlap(WfnType state) const {
+    return constraint_traits::count(c_mask_union(state));
+  }
+
+  template <typename WfnType>
+  inline bool satisfies_constraint(WfnType state) const {
+    return overlap(state) == count_ and
+           constraint_traits::count(symmetric_difference(state) >> C_min_) == 0;
+  }
+
+  static alpha_constraint make_triplet(unsigned i, unsigned j, unsigned k) {
+    constraint_type C = 0;
+    C.flip(i).flip(j).flip(k);
+    // constraint_type B = 1;
+    // static_assert(B.size() <= 64, "ULLONG NOT POSSIBLE HERE");
+    // B <<= k;
+    // B = B.to_ullong() - 1;
+    constraint_type B = full_mask<B.size()>(k);
+    return alpha_constraint(C, B, k);
+  }
+};
+
+}  // namespace macis
diff --git a/include/macis/asci/determinant_contributions.hpp b/include/macis/asci/determinant_contributions.hpp
index 7b3315f3..3de8de56 100644
--- a/include/macis/asci/determinant_contributions.hpp
+++ b/include/macis/asci/determinant_contributions.hpp
@@ -16,22 +16,27 @@ namespace macis {
 template <typename WfnT>
 struct asci_contrib {
   WfnT state;
-  double rv;
+  double c_times_matel;
+  double h_diag;
+
+  auto rv() const { return c_times_matel / h_diag; }
+  auto pt2() const { return rv() * c_times_matel; }
 };
 
 template <typename WfnT>
 using asci_contrib_container = std::vector<asci_contrib<WfnT>>;
 
-template <size_t N, size_t NShift>
+template <Spin Sigma, typename WfnType, typename SpinWfnType>
 void append_singles_asci_contributions(
-    double coeff, wfn_t<2 * N> state_full, wfn_t<N> state_same,
+    double coeff, WfnType state_full, SpinWfnType state_same,
     const std::vector<uint32_t>& occ_same,
     const std::vector<uint32_t>& vir_same,
     const std::vector<uint32_t>& occ_othr, const double* eps_same,
     const double* T_pq, const size_t LDT, const double* G_kpq, const size_t LDG,
     const double* V_kpq, const size_t LDV, double h_el_tol, double root_diag,
-    double E0, HamiltonianGenerator<2 * N>& ham_gen,
-    asci_contrib_container<wfn_t<2 * N>>& asci_contributions) {
+    double E0, const HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   const auto LDG2 = LDG * LDG;
   const auto LDV2 = LDV * LDV;
   for(auto i : occ_same)
@@ -44,11 +49,11 @@ void append_singles_asci_contributions(
       for(auto p : occ_othr) h_el += V_ov[p];
 
       // Early Exit
-      if(std::abs(h_el) < h_el_tol) continue;
+      if(std::abs(coeff * h_el) < h_el_tol) continue;
 
       // Calculate Excited Determinant
-      auto ex_det = state_full;
-      ex_det.flip(i + NShift).flip(a + NShift);
+      auto ex_det = wfn_traits::template single_excitation_no_check<Sigma>(
+          state_full, i, a);
 
       // Calculate Excitation Sign in a Canonical Way
       auto sign = single_excitation_sign(state_same, a, i);
@@ -57,22 +62,25 @@ void append_singles_asci_contributions(
       // Calculate fast diagonal matrix element
       auto h_diag =
           ham_gen.fast_diag_single(eps_same[i], eps_same[a], i, a, root_diag);
-      h_el /= (E0 - h_diag);
+      // h_el /= (E0 - h_diag);
 
       // Append to return values
-      asci_contributions.push_back({ex_det, coeff * h_el});
+      asci_contributions.push_back({ex_det, coeff * h_el, E0 - h_diag});
 
     }  // Loop over single extitations
 }
 
-template <size_t N, size_t NShift>
+template <Spin Sigma, typename WfnType, typename SpinWfnType>
 void append_ss_doubles_asci_contributions(
-    double coeff, wfn_t<2 * N> state_full, wfn_t<N> state_spin,
-    const std::vector<uint32_t>& ss_occ, const std::vector<uint32_t>& vir,
-    const std::vector<uint32_t>& os_occ, const double* eps_same,
-    const double* G, size_t LDG, double h_el_tol, double root_diag, double E0,
-    HamiltonianGenerator<2 * N>& ham_gen,
-    asci_contrib_container<wfn_t<2 * N>>& asci_contributions) {
+    double coeff, WfnType state_full, SpinWfnType state_same,
+    SpinWfnType state_other, const std::vector<uint32_t>& ss_occ,
+    const std::vector<uint32_t>& vir, const std::vector<uint32_t>& os_occ,
+    const double* eps_same, const double* G, size_t LDG, double h_el_tol,
+    double root_diag, double E0,
+    const HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using spin_wfn_traits = wavefunction_traits<SpinWfnType>;
   const size_t nocc = ss_occ.size();
   const size_t nvir = vir.size();
 
@@ -90,8 +98,9 @@ void append_ss_doubles_asci_contributions(
           const auto jb = b + j * LDG;
           const auto G_aibj = G_ai[jb];
 
-          if(std::abs(G_aibj) < h_el_tol) continue;
+          if(std::abs(coeff * G_aibj) < h_el_tol) continue;
 
+#if 0
           // Calculate excited determinant string (spin)
           const auto full_ex_spin = wfn_t<N>(0).flip(i).flip(j).flip(a).flip(b);
           auto ex_det_spin = state_spin ^ full_ex_spin;
@@ -102,6 +111,21 @@ void append_ss_doubles_asci_contributions(
           // Calculate full excited determinant
           const auto full_ex = expand_bitset<2 * N>(full_ex_spin) << NShift;
           auto ex_det = state_full ^ full_ex;
+#else
+          // TODO: Can this be made faster since the orbital indices are known
+          //       in advance?
+          // Compute excited determinant (spin)
+          const auto full_ex_spin = spin_wfn_traits::double_excitation_no_check(
+              SpinWfnType(0), i, j, a, b);
+          const auto ex_det_spin = state_same ^ full_ex_spin;
+
+          // Calculate the sign in a canonical way
+          double sign = doubles_sign(state_same, ex_det_spin, full_ex_spin);
+
+          // Calculate full excited determinant
+          auto ex_det =
+              wfn_traits::template from_spin<Sigma>(ex_det_spin, state_other);
+#endif
 
           // Update sign of matrix element
           auto h_el = sign * G_aibj;
@@ -110,25 +134,27 @@ void append_ss_doubles_asci_contributions(
           auto h_diag =
               ham_gen.fast_diag_ss_double(eps_same[i], eps_same[j], eps_same[a],
                                           eps_same[b], i, j, a, b, root_diag);
-          h_el /= (E0 - h_diag);
+          // h_el /= (E0 - h_diag);
 
           // Append {det, c*h_el}
-          asci_contributions.push_back({ex_det, coeff * h_el});
+          asci_contributions.push_back({ex_det, coeff * h_el, E0 - h_diag});
 
         }  // Restricted BJ loop
     }      // AI Loop
 }
 
-template <size_t N>
+template <typename WfnType, typename SpinWfnType>
 void append_os_doubles_asci_contributions(
-    double coeff, wfn_t<2 * N> state_full, wfn_t<N> state_alpha,
-    wfn_t<N> state_beta, const std::vector<uint32_t>& occ_alpha,
+    double coeff, WfnType state_full, SpinWfnType state_alpha,
+    SpinWfnType state_beta, const std::vector<uint32_t>& occ_alpha,
     const std::vector<uint32_t>& occ_beta,
     const std::vector<uint32_t>& vir_alpha,
     const std::vector<uint32_t>& vir_beta, const double* eps_alpha,
     const double* eps_beta, const double* V, size_t LDV, double h_el_tol,
-    double root_diag, double E0, HamiltonianGenerator<2 * N>& ham_gen,
-    asci_contrib_container<wfn_t<2 * N>>& asci_contributions) {
+    double root_diag, double E0,
+    const HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   const size_t LDV2 = LDV * LDV;
   for(auto i : occ_alpha)
     for(auto a : vir_alpha) {
@@ -140,21 +166,26 @@ void append_os_doubles_asci_contributions(
           const auto jb = b + j * LDV;
           const auto V_aibj = V_ai[jb * LDV2];
 
-          if(std::abs(V_aibj) < h_el_tol) continue;
+          if(std::abs(coeff * V_aibj) < h_el_tol) continue;
 
           double sign_beta = single_excitation_sign(state_beta, b, j);
           double sign = sign_alpha * sign_beta;
-          auto ex_det = state_full;
-          ex_det.flip(a).flip(i).flip(j + N).flip(b + N);
+          // auto ex_det = state_full;
+          // ex_det.flip(a).flip(i).flip(j + N).flip(b + N);
+          auto ex_det =
+              wfn_traits::template single_excitation_no_check<Spin::Alpha>(
+                  state_full, a, i);
+          ex_det = wfn_traits::template single_excitation_no_check<Spin::Beta>(
+              ex_det, b, j);
           auto h_el = sign * V_aibj;
 
           // Evaluate fast diagonal element
           auto h_diag = ham_gen.fast_diag_os_double(eps_alpha[i], eps_beta[j],
                                                     eps_alpha[a], eps_beta[b],
                                                     i, j, a, b, root_diag);
-          h_el /= (E0 - h_diag);
+          // h_el /= (E0 - h_diag);
 
-          asci_contributions.push_back({ex_det, coeff * h_el});
+          asci_contributions.push_back({ex_det, coeff * h_el, E0 - h_diag});
         }  // BJ loop
     }      // AI loop
 }
diff --git a/include/macis/asci/determinant_search.hpp b/include/macis/asci/determinant_search.hpp
index 7d66813c..7b1bae34 100644
--- a/include/macis/asci/determinant_search.hpp
+++ b/include/macis/asci/determinant_search.hpp
@@ -7,6 +7,7 @@
  */
 
 #pragma once
+#include <omp.h>
 #include <spdlog/sinks/null_sink.h>
 #include <spdlog/sinks/stdout_color_sinks.h>
 #include <spdlog/spdlog.h>
@@ -27,7 +28,7 @@ template <typename WfnT>
 struct asci_contrib_topk_comparator {
   using type = asci_contrib<WfnT>;
   constexpr bool operator()(const type& a, const type& b) const {
-    return std::abs(a.rv) > std::abs(b.rv);
+    return std::abs(a.rv()) > std::abs(b.rv());
   }
 };
 
@@ -38,6 +39,18 @@ struct ASCISettings {
   double h_el_tol = 1e-8;
   double rv_prune_tol = 1e-8;
   size_t pair_size_max = 5e8;
+
+  double pt2_tol = 1e-16;
+  size_t pt2_reserve_count = 70000000;
+  bool pt2_prune = false;
+  bool pt2_precompute_eps = false;
+  bool pt2_precompute_idx = false;
+  bool pt2_print_progress = false;
+  size_t pt2_bigcon_thresh = 250;
+
+  size_t nxtval_bcount_thresh = 1000;
+  size_t nxtval_bcount_inc = 10;
+
   bool just_singles = false;
   size_t grow_factor = 8;
   size_t max_refine_iter = 6;
@@ -48,6 +61,9 @@ struct ASCISettings {
 
   // bool dist_triplet_random = false;
   int constraint_level = 2;  // Up To Quints
+  int pt2_max_constraint_level = 5;
+  int pt2_min_constraint_level = 0;
+  int64_t pt2_constraint_refine_force = 0;
 };
 
 template <size_t N>
@@ -56,7 +72,10 @@ asci_contrib_container<wfn_t<N>> asci_contributions_standard(
     wavefunction_iterator_t<N> cdets_end, const double E_ASCI,
     const std::vector<double>& C, size_t norb, const double* T_pq,
     const double* G_red, const double* V_red, const double* G_pqrs,
-    const double* V_pqrs, HamiltonianGenerator<N>& ham_gen) {
+    const double* V_pqrs, HamiltonianGenerator<wfn_t<N>>& ham_gen) {
+  using wfn_traits = wavefunction_traits<wfn_t<N>>;
+  using spin_wfn_type = spin_wfn_t<wfn_t<N>>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
   auto logger = spdlog::get("asci_search");
 
   const size_t ncdets = std::distance(cdets_begin, cdets_end);
@@ -68,13 +87,13 @@ asci_contrib_container<wfn_t<N>> asci_contributions_standard(
   for(size_t i = 0; i < ncdets; ++i) {
     // Alias state data
     auto state = *(cdets_begin + i);
-    auto state_alpha = bitset_lo_word(state);
-    auto state_beta = bitset_hi_word(state);
+    auto state_alpha = wfn_traits::alpha_string(state);
+    auto state_beta = wfn_traits::beta_string(state);
     auto coeff = C[i];
 
     // Get occupied and virtual indices
-    bitset_to_occ_vir(norb, state_alpha, occ_alpha, vir_alpha);
-    bitset_to_occ_vir(norb, state_beta, occ_beta, vir_beta);
+    spin_wfn_traits::state_to_occ_vir(norb, state_alpha, occ_alpha, vir_alpha);
+    spin_wfn_traits::state_to_occ_vir(norb, state_beta, occ_beta, vir_beta);
 
     // Precompute orbital energies
     auto eps_alpha = ham_gen.single_orbital_ens(norb, occ_alpha, occ_beta);
@@ -86,27 +105,27 @@ asci_contrib_container<wfn_t<N>> asci_contributions_standard(
     const double h_el_tol = asci_settings.h_el_tol;
 
     // Singles - AA
-    append_singles_asci_contributions<(N / 2), 0>(
+    append_singles_asci_contributions<Spin::Alpha>(
         coeff, state, state_alpha, occ_alpha, vir_alpha, occ_beta,
         eps_alpha.data(), T_pq, norb, G_red, norb, V_red, norb, h_el_tol,
         h_diag, E_ASCI, ham_gen, asci_pairs);
 
     // Singles - BB
-    append_singles_asci_contributions<(N / 2), (N / 2)>(
+    append_singles_asci_contributions<Spin::Beta>(
         coeff, state, state_beta, occ_beta, vir_beta, occ_alpha,
         eps_beta.data(), T_pq, norb, G_red, norb, V_red, norb, h_el_tol, h_diag,
         E_ASCI, ham_gen, asci_pairs);
 
     if(not asci_settings.just_singles) {
       // Doubles - AAAA
-      append_ss_doubles_asci_contributions<N / 2, 0>(
-          coeff, state, state_alpha, occ_alpha, vir_alpha, occ_beta,
+      append_ss_doubles_asci_contributions<Spin::Alpha>(
+          coeff, state, state_alpha, state_beta, occ_alpha, vir_alpha, occ_beta,
           eps_alpha.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI, ham_gen,
           asci_pairs);
 
       // Doubles - BBBB
-      append_ss_doubles_asci_contributions<N / 2, N / 2>(
-          coeff, state, state_beta, occ_beta, vir_beta, occ_alpha,
+      append_ss_doubles_asci_contributions<Spin::Beta>(
+          coeff, state, state_beta, state_alpha, occ_beta, vir_beta, occ_alpha,
           eps_beta.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI, ham_gen,
           asci_pairs);
 
@@ -122,7 +141,7 @@ asci_contrib_container<wfn_t<N>> asci_contributions_standard(
       // Remove small contributions
       auto it = std::partition(
           asci_pairs.begin(), asci_pairs.end(), [=](const auto& x) {
-            return std::abs(x.rv) > asci_settings.rv_prune_tol;
+            return std::abs(x.rv()) > asci_settings.rv_prune_tol;
           });
       asci_pairs.erase(it, asci_pairs.end());
       logger->info("  * Pruning at DET = {} NSZ = {}", i, asci_pairs.size());
@@ -143,37 +162,31 @@ asci_contrib_container<wfn_t<N>> asci_contributions_standard(
 #ifdef MACIS_ENABLE_MPI
 template <size_t N>
 asci_contrib_container<wfn_t<N>> asci_contributions_constraint(
-    ASCISettings asci_settings, wavefunction_iterator_t<N> cdets_begin,
+    ASCISettings asci_settings, const size_t ntdets,
+    wavefunction_iterator_t<N> cdets_begin,
     wavefunction_iterator_t<N> cdets_end, const double E_ASCI,
     const std::vector<double>& C, size_t norb, const double* T_pq,
     const double* G_red, const double* V_red, const double* G_pqrs,
-    const double* V_pqrs, HamiltonianGenerator<N>& ham_gen, MPI_Comm comm) {
+    const double* V_pqrs, HamiltonianGenerator<wfn_t<N>>& ham_gen,
+    MPI_Comm comm) {
   using clock_type = std::chrono::high_resolution_clock;
   using duration_type = std::chrono::duration<double, std::milli>;
+  using wfn_traits = wavefunction_traits<wfn_t<N>>;
+  using spin_wfn_type = spin_wfn_t<wfn_t<N>>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
+  using wfn_comp = typename wfn_traits::spin_comparator;
+  if(!std::is_sorted(cdets_begin, cdets_end, wfn_comp{}))
+    throw std::runtime_error("ASCI Search Only Works with Sorted Wfns");
 
   auto logger = spdlog::get("asci_search");
   const size_t ncdets = std::distance(cdets_begin, cdets_end);
 
-  asci_contrib_container<wfn_t<N>> asci_pairs;
-  std::vector<uint32_t> occ_alpha, vir_alpha;
-  std::vector<uint32_t> occ_beta, vir_beta;
-
-  // Get unique alpha strings
-  std::vector<wfn_t<N>> uniq_alpha_wfn(cdets_begin, cdets_end);
-  std::transform(uniq_alpha_wfn.begin(), uniq_alpha_wfn.end(),
-                 uniq_alpha_wfn.begin(),
-                 [=](const auto& w) { return w & full_mask<N / 2, N>(); });
-  std::sort(uniq_alpha_wfn.begin(), uniq_alpha_wfn.end(),
-            bitset_less_comparator<N>{});
-  {
-    auto it = std::unique(uniq_alpha_wfn.begin(), uniq_alpha_wfn.end());
-    uniq_alpha_wfn.erase(it, uniq_alpha_wfn.end());
-  }
-  const size_t nuniq_alpha = uniq_alpha_wfn.size();
+  auto world_rank = comm_rank(comm);
+  auto world_size = comm_size(comm);
 
   // For each unique alpha, create a list of beta string and store metadata
   struct beta_coeff_data {
-    wfn_t<N> beta_string;
+    spin_wfn_type beta_string;
     std::vector<uint32_t> occ_beta;
     std::vector<uint32_t> vir_beta;
     std::vector<double> orb_ens_alpha;
@@ -183,19 +196,16 @@ asci_contrib_container<wfn_t<N>> asci_contributions_constraint(
 
     beta_coeff_data(double c, size_t norb,
                     const std::vector<uint32_t>& occ_alpha, wfn_t<N> w,
-                    const HamiltonianGenerator<N>& ham_gen) {
+                    const HamiltonianGenerator<wfn_t<N>>& ham_gen) {
       coeff = c;
 
-      // Compute Beta string
-      const auto beta_shift = w >> N / 2;
-      // Reduce the number of times things shift in inner loop
-      beta_string = beta_shift << N / 2;
+      beta_string = wfn_traits::beta_string(w);
 
       // Compute diagonal matrix element
       h_diag = ham_gen.matrix_element(w, w);
 
       // Compute occ/vir for beta string
-      bitset_to_occ_vir(norb, beta_shift, occ_beta, vir_beta);
+      spin_wfn_traits::state_to_occ_vir(norb, beta_string, occ_beta, vir_beta);
 
       // Precompute orbital energies
       orb_ens_alpha = ham_gen.single_orbital_ens(norb, occ_alpha, occ_beta);
@@ -203,32 +213,40 @@ asci_contrib_container<wfn_t<N>> asci_contributions_constraint(
     }
   };
 
-  struct unique_alpha_data {
-    std::vector<beta_coeff_data> bcd;
-  };
+  auto uniq_alpha = get_unique_alpha(cdets_begin, cdets_end);
+  const size_t nuniq_alpha = uniq_alpha.size();
+  std::vector<wfn_t<N>> uniq_alpha_wfn(nuniq_alpha);
+  std::transform(
+      uniq_alpha.begin(), uniq_alpha.end(), uniq_alpha_wfn.begin(),
+      [](const auto& p) { return wfn_traits::from_spin(p.first, 0); });
 
+  using unique_alpha_data = std::vector<beta_coeff_data>;
   std::vector<unique_alpha_data> uad(nuniq_alpha);
-  for(auto i = 0; i < nuniq_alpha; ++i) {
-    const auto wfn_a = uniq_alpha_wfn[i];
+  for(auto i = 0, iw = 0; i < nuniq_alpha; ++i) {
     std::vector<uint32_t> occ_alpha, vir_alpha;
-    bitset_to_occ_vir(norb, wfn_a, occ_alpha, vir_alpha);
-    for(auto j = 0; j < ncdets; ++j) {
-      const auto w = *(cdets_begin + j);
-      if((w & full_mask<N / 2, N>()) == wfn_a) {
-        uad[i].bcd.emplace_back(C[j], norb, occ_alpha, w, ham_gen);
-      }
+    spin_wfn_traits::state_to_occ_vir(norb, uniq_alpha[i].first, occ_alpha,
+                                      vir_alpha);
+
+    const auto nbeta = uniq_alpha[i].second;
+    uad[i].reserve(nbeta);
+    for(auto j = 0; j < nbeta; ++j, ++iw) {
+      const auto& w = *(cdets_begin + iw);
+      uad[i].emplace_back(C[iw], norb, occ_alpha, w, ham_gen);
     }
   }
 
-  auto world_rank = comm_rank(comm);
-  auto world_size = comm_size(comm);
-
-  const auto n_occ_alpha = uniq_alpha_wfn[0].count();
+  // const auto n_occ_alpha = wfn_traits::count(uniq_alpha_wfn[0]);
+  const auto n_occ_alpha = spin_wfn_traits::count(uniq_alpha[0].first);
   const auto n_vir_alpha = norb - n_occ_alpha;
   const auto n_sing_alpha = n_occ_alpha * n_vir_alpha;
   const auto n_doub_alpha = (n_sing_alpha * (n_sing_alpha - norb + 1)) / 4;
 
-  logger->info("  * NS = {} ND = {}", n_sing_alpha, n_doub_alpha);
+  const auto n_occ_beta = cdets_begin->count() - n_occ_alpha;
+  const auto n_vir_beta = norb - n_occ_beta;
+  const auto n_sing_beta = n_occ_beta * n_vir_beta;
+  const auto n_doub_beta = (n_sing_beta * (n_sing_beta - norb + 1)) / 4;
+
+  // logger->info("  * NS = {} ND = {}", n_sing_alpha, n_doub_alpha);
 
   // Generate mask constraints
   if(!world_rank) {
@@ -255,139 +273,152 @@ asci_contrib_container<wfn_t<N>> asci_contributions_constraint(
   }
 
   auto gen_c_st = clock_type::now();
-  auto constraints =
-      dist_constraint_general(asci_settings.constraint_level, norb,
-                              n_sing_alpha, n_doub_alpha, uniq_alpha_wfn, comm);
+  auto constraints = gen_constraints_general<wfn_t<N>>(
+      asci_settings.constraint_level, norb, n_sing_beta, n_doub_beta,
+      uniq_alpha, world_size);
   auto gen_c_en = clock_type::now();
   duration_type gen_c_dur = gen_c_en - gen_c_st;
   logger->info("  * GEN_DUR = {:.2e} ms", gen_c_dur.count());
 
-  size_t max_size = std::min(asci_settings.pair_size_max,
-                             ncdets * (2 * n_sing_alpha +  // AA + BB
-                                       2 * n_doub_alpha +  // AAAA + BBBB
-                                       n_sing_alpha * n_sing_alpha  // AABB
-                                       ));
-  asci_pairs.reserve(max_size);
-
-  // Process ASCI pair contributions for each constraint
-  for(auto con : constraints) {
-    auto size_before = asci_pairs.size();
-
-    const double h_el_tol = asci_settings.h_el_tol;
-    const auto& [C, B, C_min] = con;
-    wfn_t<N> O = full_mask<N>(norb);
-
-    // Loop over unique alpha strings
-    for(size_t i_alpha = 0; i_alpha < nuniq_alpha; ++i_alpha) {
-      const auto& det = uniq_alpha_wfn[i_alpha];
-      const auto occ_alpha = bits_to_indices(det);
-
-      // AA excitations
-      for(const auto& bcd : uad[i_alpha].bcd) {
-        const auto& beta = bcd.beta_string;
-        const auto& coeff = bcd.coeff;
-        const auto& h_diag = bcd.h_diag;
-        const auto& occ_beta = bcd.occ_beta;
-        const auto& orb_ens_alpha = bcd.orb_ens_alpha;
-        generate_constraint_singles_contributions_ss(
-            coeff, det, C, O, B, beta, occ_alpha, occ_beta,
-            orb_ens_alpha.data(), T_pq, norb, G_red, norb, V_red, norb,
-            h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
-      }
+  size_t max_size =
+      std::min(std::min(ntdets, asci_settings.pair_size_max),
+               ncdets * (n_sing_alpha + n_sing_beta +  // AA + BB
+                         n_doub_alpha + n_doub_beta +  // AAAA + BBBB
+                         n_sing_alpha * n_sing_beta    // AABB
+                         ));
 
-      // AAAA excitations
-      for(const auto& bcd : uad[i_alpha].bcd) {
-        const auto& beta = bcd.beta_string;
-        const auto& coeff = bcd.coeff;
-        const auto& h_diag = bcd.h_diag;
-        const auto& occ_beta = bcd.occ_beta;
-        const auto& orb_ens_alpha = bcd.orb_ens_alpha;
-        generate_constraint_doubles_contributions_ss(
-            coeff, det, C, O, B, beta, occ_alpha, occ_beta,
-            orb_ens_alpha.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI,
-            ham_gen, asci_pairs);
-      }
+  const size_t ncon_total = constraints.size();
 
-      // AABB excitations
-      for(const auto& bcd : uad[i_alpha].bcd) {
-        const auto& beta = bcd.beta_string;
-        const auto& coeff = bcd.coeff;
-        const auto& h_diag = bcd.h_diag;
-        const auto& occ_beta = bcd.occ_beta;
-        const auto& vir_beta = bcd.vir_beta;
-        const auto& orb_ens_alpha = bcd.orb_ens_alpha;
-        const auto& orb_ens_beta = bcd.orb_ens_beta;
-        generate_constraint_doubles_contributions_os(
-            coeff, det, C, O, B, beta, occ_alpha, occ_beta, vir_beta,
-            orb_ens_alpha.data(), orb_ens_beta.data(), V_pqrs, norb, h_el_tol,
-            h_diag, E_ASCI, ham_gen, asci_pairs);
-      }
+  // Global atomic task-id counter
+  global_atomic<size_t> nxtval(comm);
 
-      // If the alpha determinant satisfies the constraint,
-      // append BB and BBBB excitations
-      if(satisfies_constraint(det, C, C_min)) {
-        for(const auto& bcd : uad[i_alpha].bcd) {
-          const auto& beta = bcd.beta_string;
-          const auto& coeff = bcd.coeff;
-          const auto& h_diag = bcd.h_diag;
-          const auto& occ_beta = bcd.occ_beta;
-          const auto& vir_beta = bcd.vir_beta;
-          const auto& eps_beta = bcd.orb_ens_beta;
-
-          const auto state = det | beta;
-          const auto state_beta = bitset_hi_word(beta);
-          // BB Excitations
-          append_singles_asci_contributions<(N / 2), (N / 2)>(
-              coeff, state, state_beta, occ_beta, vir_beta, occ_alpha,
-              eps_beta.data(), T_pq, norb, G_red, norb, V_red, norb, h_el_tol,
-              h_diag, E_ASCI, ham_gen, asci_pairs);
-
-          // BBBB Excitations
-          append_ss_doubles_asci_contributions<N / 2, N / 2>(
-              coeff, state, state_beta, occ_beta, vir_beta, occ_alpha,
-              eps_beta.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI, ham_gen,
-              asci_pairs);
-
-        }  // Beta Loop
-      }    // Triplet Check
-
-      // Prune Down Contributions
-      if(asci_pairs.size() > asci_settings.pair_size_max) {
-        // Remove small contributions
-        auto it = std::partition(
-            asci_pairs.begin(), asci_pairs.end(), [=](const auto& x) {
-              return std::abs(x.rv) > asci_settings.rv_prune_tol;
-            });
-        asci_pairs.erase(it, asci_pairs.end());
-
-        auto c_indices = bits_to_indices(C);
-        std::string c_string;
-        for(int i = 0; i < c_indices.size(); ++i)
-          c_string += std::to_string(c_indices[i]) + " ";
-        logger->info("  * Pruning at CON = {}, NSZ = {}", c_string,
-                     asci_pairs.size());
-
-        // Extra Pruning if not sufficient
-        if(asci_pairs.size() > asci_settings.pair_size_max) {
-          logger->info("    * Removing Duplicates");
+  asci_contrib_container<wfn_t<N>> asci_pairs_total;
+#pragma omp parallel
+  {
+    // Process ASCI pair contributions for each constraint
+    asci_contrib_container<wfn_t<N>> asci_pairs;
+    asci_pairs.reserve(max_size);
+
+    size_t ic = 0;
+    while(ic < ncon_total) {
+      auto size_before = asci_pairs.size();
+      const double h_el_tol = asci_settings.h_el_tol;
+
+      // Atomically get the next task ID and increment for other
+      // MPI ranks and threads
+      size_t ntake = ic < asci_settings.nxtval_bcount_thresh
+                         ? 1
+                         : asci_settings.nxtval_bcount_inc;
+      ic = nxtval.fetch_and_add(ntake);
+
+      // Loop over assigned tasks
+      const size_t c_end = std::min(ncon_total, ic + ntake);
+      for(; ic < c_end; ++ic) {
+        const auto& con = constraints[ic].first;
+        // printf("[rank %4d tid:%4d] %10lu / %10lu\n", world_rank,
+        //        omp_get_thread_num(), ic, ncon_total);
+
+        for(size_t i_alpha = 0, iw = 0; i_alpha < nuniq_alpha; ++i_alpha) {
+          const auto& alpha_det = uniq_alpha[i_alpha].first;
+          const auto occ_alpha = bits_to_indices(alpha_det);
+          const bool alpha_satisfies_con = satisfies_constraint(alpha_det, con);
+
+          const auto& bcd = uad[i_alpha];
+          const size_t nbeta = bcd.size();
+          for(size_t j_beta = 0; j_beta < nbeta; ++j_beta, ++iw) {
+            const auto w = *(cdets_begin + iw);
+            const auto c = C[iw];
+            const auto& beta_det = bcd[j_beta].beta_string;
+            const auto h_diag = bcd[j_beta].h_diag;
+            const auto& occ_beta = bcd[j_beta].occ_beta;
+            const auto& vir_beta = bcd[j_beta].vir_beta;
+            const auto& orb_ens_alpha = bcd[j_beta].orb_ens_alpha;
+            const auto& orb_ens_beta = bcd[j_beta].orb_ens_beta;
+
+            // AA excitations
+            generate_constraint_singles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), T_pq,
+                norb, G_red, norb, V_red, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            // AAAA excitations
+            generate_constraint_doubles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), G_pqrs,
+                norb, h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+            // AABB excitations
+            generate_constraint_doubles_contributions_os(
+                c, w, con, occ_alpha, occ_beta, vir_beta, orb_ens_alpha.data(),
+                orb_ens_beta.data(), V_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            if(alpha_satisfies_con) {
+              // BB excitations
+              append_singles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), T_pq, norb, G_red, norb, V_red, norb,
+                  h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+              // BBBB excitations
+              append_ss_doubles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, alpha_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                  ham_gen, asci_pairs);
+
+              // No excitation (push inf to remove from list)
+              asci_pairs.push_back(
+                  {w, std::numeric_limits<double>::infinity(), 1.0});
+            }
+          }
+
+          // Prune Down Contributions
+          if(asci_pairs.size() > asci_settings.pair_size_max) {
+            logger->info("  * PRUNING AT CON = {} IALPHA = {}", ic, i_alpha);
+            auto uit = sort_and_accumulate_asci_pairs(
+                asci_pairs.begin() + size_before, asci_pairs.end());
+            asci_pairs.erase(uit, asci_pairs.end());
+            if(asci_pairs.size() > asci_settings.pair_size_max)
+              throw std::runtime_error("DIE DIE DIE");
+          }
+
+        }  // Unique Alpha Loop
+
+        // Local S&A for each quad
+        {
+          if(size_before > asci_pairs.size())
+            throw std::runtime_error("DIE DIE DIE");
           auto uit = sort_and_accumulate_asci_pairs(
               asci_pairs.begin() + size_before, asci_pairs.end());
           asci_pairs.erase(uit, asci_pairs.end());
-          logger->info("    * NSZ = {}", asci_pairs.size());
-        }
 
-      }  // Pruning
-    }    // Unique Alpha Loop
+          // Remove small contributions
+          uit = std::partition(asci_pairs.begin() + size_before,
+                               asci_pairs.end(), [=](const auto& x) {
+                                 return std::abs(x.rv()) >
+                                        asci_settings.rv_prune_tol;
+                               });
+          asci_pairs.erase(uit, asci_pairs.end());
+        }
+      }  // Loc constraint loop
+    }    // Constraint Loop
 
-    // Local S&A for each quad
+// Insert into list
+#pragma omp critical
     {
-      auto uit = sort_and_accumulate_asci_pairs(
-          asci_pairs.begin() + size_before, asci_pairs.end());
-      asci_pairs.erase(uit, asci_pairs.end());
+      if(asci_pairs_total.size()) {
+        // Preallocate space for insertion
+        asci_pairs_total.reserve(asci_pairs.size() + asci_pairs_total.size());
+        asci_pairs_total.insert(asci_pairs_total.end(), asci_pairs.begin(),
+                                asci_pairs.end());
+      } else {
+        asci_pairs_total = std::move(asci_pairs);
+      }
+      asci_contrib_container<wfn_t<N>>().swap(asci_pairs);
     }
-  }  // Constraint Loop
 
-  return asci_pairs;
+  }  // OpenMP
+
+  return asci_pairs_total;
 }
 #endif
 
@@ -399,7 +430,7 @@ std::vector<wfn_t<N>> asci_search(
     const std::vector<double>& C, size_t norb, const double* T_pq,
     const double* G_red, const double* V_red, const double* G_pqrs,
     const double* V_pqrs,
-    HamiltonianGenerator<N>& ham_gen MACIS_MPI_CODE(, MPI_Comm comm)) {
+    HamiltonianGenerator<wfn_t<N>>& ham_gen MACIS_MPI_CODE(, MPI_Comm comm)) {
   using clock_type = std::chrono::high_resolution_clock;
   using duration_type = std::chrono::duration<double>;
 
@@ -437,6 +468,9 @@ std::vector<wfn_t<N>> asci_search(
       ncdets, ndets_max, asci_settings.h_el_tol, asci_settings.rv_prune_tol);
   logger->info("  MAX_RV_SIZE = {}, JUST_SINGLES = {}",
                asci_settings.pair_size_max, asci_settings.just_singles);
+  logger->info("  CDET_SUM = {:.2e}",
+               std::accumulate(C.begin(), C.begin() + ncdets, 0.0,
+                               [](auto s, auto c) { return s + c * c; }));
 
   MACIS_MPI_CODE(MPI_Barrier(comm);)
   auto asci_search_st = clock_type::now();
@@ -444,16 +478,16 @@ std::vector<wfn_t<N>> asci_search(
   // Expand Search Space with Connected ASCI Contributions
   auto pairs_st = clock_type::now();
   asci_contrib_container<wfn_t<N>> asci_pairs;
-  if(world_size == 1)
-    asci_pairs = asci_contributions_standard(
-        asci_settings, cdets_begin, cdets_end, E_ASCI, C, norb, T_pq, G_red,
-        V_red, G_pqrs, V_pqrs, ham_gen);
-#ifdef MACIS_ENABLE_MPI
-  else
-    asci_pairs = asci_contributions_constraint(
-        asci_settings, cdets_begin, cdets_end, E_ASCI, C, norb, T_pq, G_red,
-        V_red, G_pqrs, V_pqrs, ham_gen MACIS_MPI_CODE(, comm));
-#endif
+  // if(world_size == 1)
+  //   asci_pairs = asci_contributions_standard(
+  //       asci_settings, cdets_begin, cdets_end, E_ASCI, C, norb, T_pq, G_red,
+  //       V_red, G_pqrs, V_pqrs, ham_gen);
+  // #ifdef MACIS_ENABLE_MPI
+  // else
+  asci_pairs = asci_contributions_constraint(
+      asci_settings, ndets_max, cdets_begin, cdets_end, E_ASCI, C, norb, T_pq,
+      G_red, V_red, G_pqrs, V_pqrs, ham_gen MACIS_MPI_CODE(, comm));
+  // #endif
   auto pairs_en = clock_type::now();
 
   {
@@ -463,6 +497,8 @@ std::vector<wfn_t<N>> asci_search(
     size_t npairs = asci_pairs.size();
 #endif
     logger->info("  * ASCI Kept {} Pairs", npairs);
+    if(npairs < ndets_max)
+      logger->info("    * WARNING: Kept ASCI pairs less than requested TDETS");
 
 #ifdef MACIS_ENABLE_MPI
     if(world_size > 1) {
@@ -526,8 +562,12 @@ std::vector<wfn_t<N>> asci_search(
   }
 
   auto keep_large_st = clock_type::now();
+#if 0
   // Finalize scores
-  for(auto& x : asci_pairs) x.rv = -std::abs(x.rv);
+  for(auto& x : asci_pairs) {
+    x.c_times_matel = -std::abs(x.c_times_matel);
+    x.h_diag = std::abs(x.h_diag);
+  }
 
   // Insert all dets with their coefficients as seeds
   for(size_t i = 0; i < ncdets; ++i) {
@@ -540,11 +580,18 @@ std::vector<wfn_t<N>> asci_search(
   keep_only_largest_copy_asci_pairs(asci_pairs);
 
   asci_pairs.erase(std::partition(asci_pairs.begin(), asci_pairs.end(),
-                                  [](const auto& p) { return p.rv < 0.0; }),
+                                  [](const auto& p) { return p.rv() < 0.0; }),
                    asci_pairs.end());
 
-  // Only do top-K on (ndets_max - ncdets) b/c CDETS will be added later
-  const size_t top_k_elements = ndets_max - ncdets;
+#else
+  // Remove core dets
+  // XXX: This assumes the constraint search for now
+  {
+    auto inf_ptr = std::partition(asci_pairs.begin(), asci_pairs.end(),
+                                  [](auto& p) { return !std::isinf(p.rv()); });
+    asci_pairs.erase(inf_ptr, asci_pairs.end());
+  }
+#endif
 
   auto keep_large_en = clock_type::now();
   duration_type keep_large_dur = keep_large_en - keep_large_st;
@@ -560,6 +607,9 @@ std::vector<wfn_t<N>> asci_search(
     logger->info("  * KEEP_LARG_DUR = {:.2e} s", keep_large_dur.count());
   }
 
+  // Only do top-K on (ndets_max - ncdets) b/c CDETS will be added later
+  const size_t top_k_elements = ndets_max - ncdets;
+
   // Do Top-K to get the largest determinant contributions
   auto asci_sort_st = clock_type::now();
   if(world_size > 1 or asci_pairs.size() > top_k_elements) {
@@ -569,18 +619,19 @@ std::vector<wfn_t<N>> asci_search(
       // Strip scores
       std::vector<double> scores(asci_pairs.size());
       std::transform(asci_pairs.begin(), asci_pairs.end(), scores.begin(),
-                     [](const auto& p) { return std::abs(p.rv); });
+                     [](const auto& p) { return std::abs(p.rv()); });
 
       // Determine kth-ranked scores
       auto kth_score =
           dist_quickselect(scores.begin(), scores.end(), top_k_elements, comm,
                            std::greater<double>{}, std::equal_to<double>{});
 
+      logger->info("  * Kth Score Pivot = {:.2e}", kth_score);
       // Partition local pairs into less / eq batches
       auto [g_begin, e_begin, l_begin, _end] = leg_partition(
           asci_pairs.begin(), asci_pairs.end(), kth_score,
-          [=](const auto& p, const auto& s) { return std::abs(p.rv) > s; },
-          [=](const auto& p, const auto& s) { return std::abs(p.rv) == s; });
+          [=](const auto& p, const auto& s) { return std::abs(p.rv()) > s; },
+          [=](const auto& p, const auto& s) { return std::abs(p.rv()) == s; });
 
       // Determine local counts
       size_t n_greater = std::distance(g_begin, e_begin);
diff --git a/include/macis/asci/determinant_sort.hpp b/include/macis/asci/determinant_sort.hpp
index 829f7240..ec738f14 100644
--- a/include/macis/asci/determinant_sort.hpp
+++ b/include/macis/asci/determinant_sort.hpp
@@ -11,6 +11,7 @@
 #if __has_include(<boost/sort/pdqsort/pdqsort.hpp>)
 #define MACIS_USE_BOOST_SORT
 #include <boost/sort/pdqsort/pdqsort.hpp>
+#include <boost/sort/sort.hpp>
 #endif
 
 namespace macis {
@@ -36,6 +37,54 @@ void reorder_ci_on_coeff(std::vector<WfnT>& dets, std::vector<double>& C) {
   dets = std::move(reorder_dets);
 }
 
+template <typename WfnIterator>
+void reorder_ci_on_alpha(WfnIterator begin, WfnIterator end, double* C) {
+  using wfn_type = typename WfnIterator::value_type;
+  using wfn_traits = wavefunction_traits<wfn_type>;
+  using cmp_type = typename wfn_traits::spin_comparator;
+  const size_t ndets = std::distance(begin, end);
+
+  cmp_type comparator{};
+  std::vector<size_t> idx(ndets);
+  std::iota(idx.begin(), idx.end(), 0);
+  std::sort(idx.begin(), idx.end(), [&](auto i, auto j) {
+    return comparator(*(begin + i), *(begin + j));
+  });
+
+  std::vector<double> reorder_C(ndets);
+  std::vector<wfn_type> reorder_dets(ndets);
+  for(auto i = 0ul; i < ndets; ++i) {
+    reorder_C[i] = C[idx[i]];
+    reorder_dets[i] = *(begin + idx[i]);
+  }
+
+  std::copy(reorder_dets.begin(), reorder_dets.end(), begin);
+  std::copy(reorder_C.begin(), reorder_C.end(), C);
+}
+
+template <typename PairIterator>
+PairIterator accumulate_asci_pairs(PairIterator pairs_begin,
+                                   PairIterator pairs_end) {
+  // Accumulate the ASCI scores into first instance of unique bitstrings
+  auto cur_it = pairs_begin;
+  for(auto it = cur_it + 1; it != pairs_end; ++it) {
+    // If iterate is not the one being tracked, update the iterator
+    if(it->state != cur_it->state) {
+      cur_it = it;
+    }
+
+    // Accumulate
+    else {
+      cur_it->c_times_matel += it->c_times_matel;
+      it->c_times_matel = NAN;  // Zero out to expose potential bugs
+    }
+  }
+
+  // Remote duplicate bitstrings
+  return std::unique(pairs_begin, pairs_end,
+                     [](auto x, auto y) { return x.state == y.state; });
+}
+
 template <typename PairIterator>
 PairIterator sort_and_accumulate_asci_pairs(PairIterator pairs_begin,
                                             PairIterator pairs_end) {
@@ -55,24 +104,29 @@ PairIterator sort_and_accumulate_asci_pairs(PairIterator pairs_begin,
 #endif
       (pairs_begin, pairs_end, comparator);
 
-  // Accumulate the ASCI scores into first instance of unique bitstrings
-  auto cur_it = pairs_begin;
-  for(auto it = cur_it + 1; it != pairs_end; ++it) {
-    // If iterate is not the one being tracked, update the iterator
-    if(it->state != cur_it->state) {
-      cur_it = it;
-    }
+  return accumulate_asci_pairs(pairs_begin, pairs_end);
+}
 
-    // Accumulate
-    else {
-      cur_it->rv += it->rv;
-      it->rv = 0;  // Zero out to expose potential bugs
-    }
-  }
+template <typename PairIterator>
+PairIterator stable_sort_and_accumulate_asci_pairs(PairIterator pairs_begin,
+                                                   PairIterator pairs_end) {
+  const size_t npairs = std::distance(pairs_begin, pairs_end);
 
-  // Remote duplicate bitstrings
-  return std::unique(pairs_begin, pairs_end,
-                     [](auto x, auto y) { return x.state == y.state; });
+  if(!npairs) return pairs_end;
+
+  auto comparator = [](const auto& x, const auto& y) {
+    return bitset_less(x.state, y.state);
+  };
+
+// Sort by bitstring
+#ifdef MACIS_USE_BOOST_SORT
+  boost::sort::flat_stable_sort
+#else
+  std::stable_sort
+#endif
+      (pairs_begin, pairs_end, comparator);
+
+  return accumulate_asci_pairs(pairs_begin, pairs_end);
 }
 
 template <typename WfnT>
@@ -108,7 +162,8 @@ void keep_only_largest_copy_asci_pairs(
 
     // Keep only max value
     else {
-      cur_it->rv = std::max(cur_it->rv, it->rv);
+      cur_it->c_times_matel =
+          std::max(cur_it->c_times_matel, it->c_times_matel);
     }
   }
 
diff --git a/include/macis/asci/grow.hpp b/include/macis/asci/grow.hpp
index f3f4260e..f72226f3 100644
--- a/include/macis/asci/grow.hpp
+++ b/include/macis/asci/grow.hpp
@@ -16,7 +16,7 @@ namespace macis {
 template <size_t N, typename index_t = int32_t>
 auto asci_grow(ASCISettings asci_settings, MCSCFSettings mcscf_settings,
                double E0, std::vector<wfn_t<N>> wfn, std::vector<double> X,
-               HamiltonianGenerator<N>& ham_gen,
+               HamiltonianGenerator<wfn_t<N>>& ham_gen,
                size_t norb MACIS_MPI_CODE(, MPI_Comm comm)) {
 #ifdef MACIS_ENABLE_MPI
   auto world_rank = comm_rank(comm);
@@ -119,12 +119,13 @@ auto asci_grow(ASCISettings asci_settings, MCSCFSettings mcscf_settings,
 #endif
 
       // Regenerate intermediates
-      ham_gen.generate_integral_intermediates(ham_gen.V_pqrs_);
+      ham_gen.generate_integral_intermediates();
 
       logger->trace("  * Rediagonalizing");
       auto rdg_st = hrt_t::now();
       std::vector<double> X_local;
-      selected_ci_diag(
+      throw "DIE DIE DIE";
+      selected_ci_diag<index_t>(
           wfn.begin(), wfn.end(), ham_gen, mcscf_settings.ci_matel_tol,
           mcscf_settings.ci_max_subspace, mcscf_settings.ci_res_tol,
           X_local MACIS_MPI_CODE(, comm));
diff --git a/include/macis/asci/iteration.hpp b/include/macis/asci/iteration.hpp
index aa6e271c..604ffde6 100644
--- a/include/macis/asci/iteration.hpp
+++ b/include/macis/asci/iteration.hpp
@@ -8,15 +8,15 @@
 
 #pragma once
 #include <macis/asci/determinant_search.hpp>
+#include <macis/mcscf/mcscf.hpp>
 #include <macis/solvers/selected_ci_diag.hpp>
-#include <macis/util/mcscf.hpp>
 
 namespace macis {
 
-template <size_t N, typename index_t = int32_t>
+template <size_t N, typename index_t>
 auto asci_iter(ASCISettings asci_settings, MCSCFSettings mcscf_settings,
                size_t ndets_max, double E0, std::vector<wfn_t<N>> wfn,
-               std::vector<double> X, HamiltonianGenerator<N>& ham_gen,
+               std::vector<double> X, HamiltonianGenerator<wfn_t<N>>& ham_gen,
                size_t norb MACIS_MPI_CODE(, MPI_Comm comm)) {
   // Sort wfn on coefficient weights
   if(wfn.size() > 1) reorder_ci_on_coeff(wfn, X);
@@ -24,14 +24,23 @@ auto asci_iter(ASCISettings asci_settings, MCSCFSettings mcscf_settings,
   // Sanity check on search determinants
   size_t nkeep = std::min(asci_settings.ncdets_max, wfn.size());
 
+  // Sort kept dets on alpha string
+  if(wfn.size() > 1)
+    reorder_ci_on_alpha(wfn.begin(), wfn.begin() + nkeep, X.data());
+
   // Perform the ASCI search
   wfn = asci_search(asci_settings, ndets_max, wfn.begin(), wfn.begin() + nkeep,
                     E0, X, norb, ham_gen.T(), ham_gen.G_red(), ham_gen.V_red(),
                     ham_gen.G(), ham_gen.V(), ham_gen MACIS_MPI_CODE(, comm));
 
+  // std::sort(wfn.begin(), wfn.end(), bitset_less_comparator<N>{});
+  using wfn_traits = wavefunction_traits<wfn_t<N>>;
+  using wfn_comp = typename wfn_traits::spin_comparator;
+  std::sort(wfn.begin(), wfn.end(), wfn_comp{});
+
   // Rediagonalize
   std::vector<double> X_local;  // Precludes guess reuse
-  auto E = selected_ci_diag<N, index_t>(
+  auto E = selected_ci_diag<index_t>(
       wfn.begin(), wfn.end(), ham_gen, mcscf_settings.ci_matel_tol,
       mcscf_settings.ci_max_subspace, mcscf_settings.ci_res_tol,
       X_local MACIS_MPI_CODE(, comm));
diff --git a/include/macis/asci/mask_constraints.hpp b/include/macis/asci/mask_constraints.hpp
index c9907e09..03881225 100644
--- a/include/macis/asci/mask_constraints.hpp
+++ b/include/macis/asci/mask_constraints.hpp
@@ -7,6 +7,7 @@
  */
 
 #pragma once
+#include <macis/asci/alpha_constraint.hpp>
 #include <macis/sd_operations.hpp>
 #include <macis/types.hpp>
 #include <macis/util/mpi.hpp>
@@ -14,96 +15,73 @@
 
 namespace macis {
 
-template <size_t N>
-struct wfn_constraint {
-  wfn_t<N> C;
-  wfn_t<N> B;
-  unsigned C_min;
-};
-
-template <size_t N>
-auto make_triplet(unsigned i, unsigned j, unsigned k) {
-  wfn_constraint<N> con;
-
-  con.C = 0;
-  con.C.flip(i).flip(j).flip(k);
-  con.B = 1;
-  con.B <<= k;
-  con.B = con.B.to_ullong() - 1;
-  con.C_min = k;
-
-  return con;
+template <typename WfnType, typename ConType>
+bool satisfies_constraint(WfnType det, ConType C) {
+  return C.satisfies_constraint(det);
 }
 
-template <size_t N>
-auto make_quad(unsigned i, unsigned j, unsigned k, unsigned l) {
-  wfn_constraint<N> con;
-
-  con.C = 0;
-  con.C.flip(i).flip(j).flip(k).flip(l);
-  con.B = 1;
-  con.B <<= l;
-  con.B = con.B.to_ullong() - 1;
-  con.C_min = l;
+template <typename WfnType, typename ConType>
+auto generate_constraint_single_excitations(WfnType det, ConType constraint) {
+  using constraint_traits = typename ConType::constraint_traits;
+  const auto C = constraint.C();
+  const auto B = constraint.B();
 
-  return con;
-}
+  // need to have at most one different from the constraint
+  if(constraint.overlap(det) < (constraint.count() - 1))
+    return std::make_pair(WfnType(0), WfnType(0));
 
-template <size_t N>
-bool satisfies_constraint(wfn_t<N> det, wfn_t<N> C, unsigned C_min) {
-  return (det & C).count() == C.count() and ((det ^ C) >> C_min).count() == 0;
-}
+  auto o = constraint.symmetric_difference(det);
+  auto v = constraint.b_mask_union(~det);
 
-template <size_t N>
-auto generate_constraint_single_excitations(wfn_t<N> det, wfn_t<N> C,
-                                            wfn_t<N> O_mask, wfn_t<N> B) {
-  if((det & C).count() <
-     (C.count() - 1))  // need to have at most one different from the constraint
-    return std::make_pair(wfn_t<N>(0), wfn_t<N>(0));
-
-  auto o = det ^ C;
-  auto v = (~det) & O_mask & B;
-
-  if((o & C).count() ==
-     1) {  // don't have to change this necessarily, but more clear without >=
+  if(constraint_traits::count(o & C) == 1) {
     v = o & C;
     o ^= v;
   }
 
-  if((o & ~B).count() > 1) return std::make_pair(wfn_t<N>(0), wfn_t<N>(0));
+  const auto o_and_not_b = o & ~B;
+  const auto o_and_not_b_count = constraint_traits::count(o_and_not_b);
+  if(o_and_not_b_count > 1) return std::make_pair(WfnType(0), WfnType(0));
 
-  if((o & ~B).count() == 1) o &= ~B;
+  if(o_and_not_b_count == 1) o = o_and_not_b;
 
   return std::make_pair(o, v);
 }
 
-template <size_t N>
-auto generate_constraint_double_excitations(wfn_t<N> det, wfn_t<N> C,
-                                            wfn_t<N> O_mask, wfn_t<N> B) {
+template <typename WfnType, typename ConType>
+auto generate_constraint_double_excitations(WfnType det, ConType constraint) {
+  using constraint_traits = typename ConType::constraint_traits;
+  using constraint_type = typename ConType::constraint_type;
+  const auto C = constraint.C();
+  const auto B = constraint.B();
   // Occ/Vir pairs to generate excitations
-  std::vector<wfn_t<N>> O, V;
+  std::vector<constraint_type> O, V;
 
-  if((det & C) == 0) return std::make_tuple(O, V);
+  if(constraint.overlap(det) == 0) return std::make_tuple(O, V);
 
-  auto o = det ^ C;
-  auto v = (~det) & O_mask & B;
+  auto o = constraint.symmetric_difference(det);
+  auto v = constraint.b_mask_union(~det);
 
-  if((o & C).count() >= 3) return std::make_tuple(O, V);
+  auto o_and_c = o & C;
+  auto o_and_c_count = constraint_traits::count(o_and_c);
+  if(o_and_c_count >= 3) return std::make_tuple(O, V);
 
   // Generate Virtual Pairs
-  if((o & C).count() == 2) {
-    v = o & C;
+  if(o_and_c_count == 2) {
+    v = o_and_c;
     o ^= v;
+    // Regenerate since o changed
+    // XXX: This apparently is not needed, but leaving because <shrug>
+    o_and_c = o & C;
+    o_and_c_count = constraint_traits::count(o_and_c);
   }
 
   const auto virt_ind = bits_to_indices(v);
-  const auto o_and_t = o & C;
-  switch((o & C).count()) {
+  switch(o_and_c_count) {
     case 1:
       for(auto a : virt_ind) {
-        V.emplace_back(o_and_t).flip(a);
+        V.emplace_back(constraint_traits::create_no_check(o_and_c, a));
       }
-      o ^= o_and_t;
+      o ^= o_and_c;
       break;
     default:
       generate_pairs(virt_ind, V);
@@ -112,16 +90,17 @@ auto generate_constraint_double_excitations(wfn_t<N> det, wfn_t<N> C,
 
   // Generate Occupied Pairs
   const auto o_and_not_b = o & ~B;
-  if(o_and_not_b.count() > 2) return std::make_tuple(O, V);
+  const auto o_and_not_b_count = constraint_traits::count(o_and_not_b);
+  if(o_and_not_b_count > 2) return std::make_tuple(O, V);
 
-  switch(o_and_not_b.count()) {
+  switch(o_and_not_b_count) {
     case 1:
       for(auto i : bits_to_indices(o & B)) {
-        O.emplace_back(o_and_not_b).flip(i);
+        O.emplace_back(constraint_traits::create_no_check(o_and_not_b, i));
       }
       break;
     default:
-      if(o_and_not_b.count() == 2) o = o_and_not_b;
+      if(o_and_not_b_count == 2) o = o_and_not_b;
       generate_pairs(bits_to_indices(o), O);
       break;
   }
@@ -129,36 +108,39 @@ auto generate_constraint_double_excitations(wfn_t<N> det, wfn_t<N> C,
   return std::make_tuple(O, V);
 }
 
-template <size_t N>
-void generate_constraint_singles(wfn_t<N> det, wfn_t<N> T, wfn_t<N> O_mask,
-                                 wfn_t<N> B, std::vector<wfn_t<N>>& t_singles) {
-  auto [o, v] = generate_constraint_single_excitations(det, T, O_mask, B);
-  const auto oc = o.count();
-  const auto vc = v.count();
+template <typename WfnType, typename ConType>
+void generate_constraint_singles(WfnType det, ConType constraint,
+                                 std::vector<WfnType>& t_singles) {
+  using constraint_traits = typename ConType::constraint_traits;
+  auto [o, v] = generate_constraint_single_excitations(det, constraint);
+  const auto oc = constraint_traits::count(o);
+  const auto vc = constraint_traits::count(v);
   if(!oc or !vc) return;
 
   t_singles.clear();
   t_singles.reserve(oc * vc);
-  const auto occ = bits_to_indices(o);
-  const auto vir = bits_to_indices(v);
+  const auto occ = constraint_traits::state_to_occ(o);
+  const auto vir = constraint_traits::state_to_occ(v);
   for(auto i : occ) {
-    auto temp = det;
-    temp.flip(i);
-    for(auto a : vir) t_singles.emplace_back(temp).flip(a);
+    auto temp = constraint_traits::create_no_check(det, i);
+    for(auto a : vir)
+      t_singles.emplace_back(constraint_traits::create_no_check(temp, a));
   }
 }
 
-template <typename... Args>
-unsigned count_constraint_singles(Args&&... args) {
-  auto [o, v] =
-      generate_constraint_single_excitations(std::forward<Args>(args)...);
-  return o.count() * v.count();
+template <typename WfnType, typename ConType>
+unsigned count_constraint_singles(WfnType det, ConType constraint) {
+  using constraint_traits = typename ConType::constraint_traits;
+  auto [o, v] = generate_constraint_single_excitations(det, constraint);
+  const auto oc = constraint_traits::count(o);
+  const auto vc = constraint_traits::count(v);
+  return oc * vc;
 }
 
-template <size_t N>
-void generate_constraint_doubles(wfn_t<N> det, wfn_t<N> T, wfn_t<N> O_mask,
-                                 wfn_t<N> B, std::vector<wfn_t<N>>& t_doubles) {
-  auto [O, V] = generate_constraint_double_excitations(det, T, O_mask, B);
+template <typename WfnType, typename ConType>
+void generate_constraint_doubles(WfnType det, ConType constraint,
+                                 std::vector<WfnType>& t_doubles) {
+  auto [O, V] = generate_constraint_double_excitations(det, constraint);
 
   t_doubles.clear();
   for(auto ij : O) {
@@ -172,30 +154,36 @@ void generate_constraint_doubles(wfn_t<N> det, wfn_t<N> T, wfn_t<N> O_mask,
 /**
  *  @param[in]  det Input root determinant
  *  @param[in]  T   Triplet constraint mask
- *  @param[in]  O   Overfill mask (full mask 0 -> norb)
  *  @param[in]  B   B mask (?)
  */
-template <size_t N>
-unsigned count_constraint_doubles(wfn_t<N> det, wfn_t<N> C, wfn_t<N> O,
-                                  wfn_t<N> B) {
-  if((det & C) == 0) return 0;
+template <typename WfnType, typename ConType>
+unsigned count_constraint_doubles(WfnType det, ConType constraint) {
+  using constraint_traits = typename ConType::constraint_traits;
+  const auto C = constraint.C();
+  const auto B = constraint.B();
+  if(constraint.overlap(det) == 0) return 0;
 
-  auto o = det ^ C;
-  auto v = (~det) & O & B;
+  auto o = constraint.symmetric_difference(det);
+  auto v = constraint.b_mask_union(~det);
 
-  if((o & C).count() >= 3) return 0;
+  auto o_and_c = o & C;
+  auto o_and_c_count = constraint_traits::count(o_and_c);
+  if(o_and_c_count >= 3) return 0;
 
   // Generate Virtual Pairs
-  if((o & C).count() == 2) {
-    v = o & C;
+  if(o_and_c_count == 2) {
+    v = o_and_c;
     o ^= v;
+    // Regenerate since o changed
+    // XXX: This apparently is not needed, but leaving because <shrug>
+    o_and_c = o & C;
+    o_and_c_count = constraint_traits::count(o_and_c);
   }
 
-  unsigned nv_pairs = v.count();
-  const auto o_and_t = o & C;
-  switch((o & C).count()) {
+  unsigned nv_pairs = constraint_traits::count(v);
+  switch(o_and_c_count) {
     case 1:
-      o ^= o_and_t;
+      o ^= o_and_c;
       break;
     default:
       nv_pairs = (nv_pairs * (nv_pairs - 1)) / 2;
@@ -204,16 +192,17 @@ unsigned count_constraint_doubles(wfn_t<N> det, wfn_t<N> C, wfn_t<N> O,
 
   // Generate Occupied Pairs
   const auto o_and_not_b = o & ~B;
-  if(o_and_not_b.count() > 2) return 0;
+  const auto o_and_not_b_count = constraint_traits::count(o_and_not_b);
+  if(o_and_not_b_count > 2) return 0;
 
   unsigned no_pairs = 0;
-  switch(o_and_not_b.count()) {
+  switch(o_and_not_b_count) {
     case 1:
-      no_pairs = (o & B).count();
+      no_pairs = constraint_traits::count(o & B);
       break;
     default:
-      if(o_and_not_b.count() == 2) o = o_and_not_b;
-      no_pairs = o.count();
+      if(o_and_not_b_count == 2) o = o_and_not_b;
+      no_pairs = constraint_traits::count(o);
       no_pairs = (no_pairs * (no_pairs - 1)) / 2;
       break;
   }
@@ -221,48 +210,49 @@ unsigned count_constraint_doubles(wfn_t<N> det, wfn_t<N> C, wfn_t<N> O,
   return no_pairs * nv_pairs;
 }
 
-template <size_t N, typename... Args>
-size_t constraint_histogram(wfn_t<N> det, size_t n_os_singles,
-                            size_t n_os_doubles, wfn_t<N> T, wfn_t<N> O_mask,
-                            wfn_t<N> B) {
-  auto ns = count_constraint_singles(det, T, O_mask, B);
-  auto nd = count_constraint_doubles(det, T, O_mask, B);
+template <typename WfnType, typename ConType>
+size_t constraint_histogram(WfnType det, size_t n_os_singles,
+                            size_t n_os_doubles, ConType constraint) {
+  auto ns = count_constraint_singles(det, constraint);
+  auto nd = count_constraint_doubles(det, constraint);
 
   size_t ndet = 0;
   ndet += ns;                 // AA
   ndet += nd;                 // AAAA
   ndet += ns * n_os_singles;  // AABB
-  auto T_min = ffs(T) - 1;
-  if(satisfies_constraint(det, T, T_min)) {
+  if(satisfies_constraint(det, constraint)) {
     ndet += n_os_singles + n_os_doubles + 1;  // BB + BBBB + No Excitations
   }
 
   return ndet;
 }
 
-template <size_t N>
+template <typename WfnType, typename ConType>
 void generate_constraint_singles_contributions_ss(
-    double coeff, wfn_t<N> det, wfn_t<N> T, wfn_t<N> O, wfn_t<N> B,
-    wfn_t<N> os_det, const std::vector<uint32_t>& occ_same,
+    double coeff, WfnType det, ConType constraint,
+    const std::vector<uint32_t>& occ_same,
     const std::vector<uint32_t>& occ_othr, const double* eps,
     const double* T_pq, const size_t LDT, const double* G_kpq, const size_t LDG,
     const double* V_kpq, const size_t LDV, double h_el_tol, double root_diag,
-    double E0, HamiltonianGenerator<N>& ham_gen,
-    asci_contrib_container<wfn_t<N>>& asci_contributions) {
-  auto [o, v] = generate_constraint_single_excitations(det, T, O, B);
-  const auto no = o.count();
-  const auto nv = v.count();
+    double E0, HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using constraint_traits = typename ConType::constraint_traits;
+  auto [o, v] = generate_constraint_single_excitations(
+      wfn_traits::alpha_string(det), constraint);
+  const auto no = constraint_traits::count(o);
+  const auto nv = constraint_traits::count(v);
   if(!no or !nv) return;
 
   const size_t LDG2 = LDG * LDG;
   const size_t LDV2 = LDV * LDV;
   for(int ii = 0; ii < no; ++ii) {
     const auto i = fls(o);
-    o.flip(i);
+    o = constraint_traits::create_no_check(o, i);  // o.flip(i)
     auto v_cpy = v;
     for(int aa = 0; aa < nv; ++aa) {
       const auto a = fls(v_cpy);
-      v_cpy.flip(a);
+      v_cpy = constraint_traits::create_no_check(v_cpy, a);  // v_cpy.flip(a)
 
       double h_el = T_pq[a + i * LDT];
       const double* G_ov = G_kpq + a * LDG + i * LDG2;
@@ -274,8 +264,9 @@ void generate_constraint_singles_contributions_ss(
       if(std::abs(coeff * h_el) < h_el_tol) continue;
 
       // Calculate Excited Determinant
-      auto ex_det = det | os_det;
-      ex_det.flip(i).flip(a);
+      auto ex_det =
+          wfn_traits::template single_excitation_no_check<Spin::Alpha>(det, i,
+                                                                       a);
 
       // Compute Sign in a Canonical Way
       auto sign = single_excitation_sign(det, a, i);
@@ -283,22 +274,25 @@ void generate_constraint_singles_contributions_ss(
 
       // Compute Fast Diagonal Matrix Element
       auto h_diag = ham_gen.fast_diag_single(eps[i], eps[a], i, a, root_diag);
-      h_el /= (E0 - h_diag);
+      // h_el /= (E0 - h_diag);
 
-      asci_contributions.push_back({ex_det, coeff * h_el});
+      asci_contributions.push_back({ex_det, coeff * h_el, E0 - h_diag});
     }
   }
 }
 
-template <size_t N>
+template <typename WfnType, typename ConType>
 void generate_constraint_doubles_contributions_ss(
-    double coeff, wfn_t<N> det, wfn_t<N> T, wfn_t<N> O_mask, wfn_t<N> B,
-    wfn_t<N> os_det, const std::vector<uint32_t>& occ_same,
+    double coeff, WfnType det, ConType constraint,
+    const std::vector<uint32_t>& occ_same,
     const std::vector<uint32_t>& occ_othr, const double* eps, const double* G,
     const size_t LDG, double h_el_tol, double root_diag, double E0,
-    HamiltonianGenerator<N>& ham_gen,
-    asci_contrib_container<wfn_t<N>>& asci_contributions) {
-  auto [O, V] = generate_constraint_double_excitations(det, T, O_mask, B);
+    HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_t<WfnType>>;
+  auto [O, V] = generate_constraint_double_excitations(
+      wfn_traits::alpha_string(det), constraint);
   const auto no_pairs = O.size();
   const auto nv_pairs = V.size();
   if(!no_pairs or !nv_pairs) return;
@@ -309,7 +303,9 @@ void generate_constraint_doubles_contributions_ss(
     const auto i = ffs(ij) - 1;
     const auto j = fls(ij);
     const auto G_ij = G + (j + i * LDG2) * LDG;
-    const auto ex_ij = det ^ ij;
+    const auto ex_ij =
+        wfn_traits::template single_excitation_no_check<Spin::Alpha>(
+            det, i, j);  // det ^ ij;
     for(int _ab = 0; _ab < nv_pairs; ++_ab) {
       const auto ab = V[_ab];
       const auto a = ffs(ab) - 1;
@@ -321,14 +317,20 @@ void generate_constraint_doubles_contributions_ss(
       if(std::abs(coeff * G_aibj) < h_el_tol) continue;
 
       // Calculate Excited Determinant (spin)
-      const auto full_ex_spin = ij | ab;
-      const auto ex_det_spin = ex_ij | ab;
+      const auto full_ex_spin =
+          spin_wfn_traits::template single_excitation_no_check<Spin::Alpha>(
+              ij, a, b);  // ij | ab;
+      const auto ex_det_spin =
+          wfn_traits::template single_excitation_no_check<Spin::Alpha>(
+              ex_ij, a, b);  // ex_ij | ab;
 
       // Compute Sign in a Canonical Way
-      auto sign = doubles_sign(det, ex_det_spin, full_ex_spin);
+      auto sign =
+          doubles_sign(wfn_traits::alpha_string(det),
+                       wfn_traits::alpha_string(ex_det_spin), full_ex_spin);
 
       // Calculate Full Excited Determinant
-      const auto full_ex = ex_det_spin | os_det;
+      const auto full_ex = ex_det_spin;  // | os_det;
 
       // Update Sign of Matrix Element
       auto h_el = sign * G_aibj;
@@ -336,36 +338,39 @@ void generate_constraint_doubles_contributions_ss(
       // Evaluate fast diagonal matrix element
       auto h_diag = ham_gen.fast_diag_ss_double(eps[i], eps[j], eps[a], eps[b],
                                                 i, j, a, b, root_diag);
-      h_el /= (E0 - h_diag);
+      // h_el /= (E0 - h_diag);
 
-      asci_contributions.push_back({full_ex, coeff * h_el});
+      asci_contributions.push_back({full_ex, coeff * h_el, E0 - h_diag});
     }
   }
 }
 
-template <size_t N>
+template <typename WfnType, typename ConType>
 void generate_constraint_doubles_contributions_os(
-    double coeff, wfn_t<N> det, wfn_t<N> T, wfn_t<N> O, wfn_t<N> B,
-    wfn_t<N> os_det, const std::vector<uint32_t>& occ_same,
+    double coeff, WfnType det, ConType constraint,
+    const std::vector<uint32_t>& occ_same,
     const std::vector<uint32_t>& occ_othr,
     const std::vector<uint32_t>& vir_othr, const double* eps_same,
     const double* eps_othr, const double* V, const size_t LDV, double h_el_tol,
-    double root_diag, double E0, HamiltonianGenerator<N>& ham_gen,
-    asci_contrib_container<wfn_t<N>>& asci_contributions) {
+    double root_diag, double E0, HamiltonianGeneratorBase<double>& ham_gen,
+    asci_contrib_container<WfnType>& asci_contributions) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using constraint_traits = typename ConType::constraint_traits;
   // Generate Single Excitations that Satisfy the Constraint
-  auto [o, v] = generate_constraint_single_excitations(det, T, O, B);
-  const auto no = o.count();
-  const auto nv = v.count();
+  auto [o, v] = generate_constraint_single_excitations(
+      wfn_traits::alpha_string(det), constraint);
+  const auto no = constraint_traits::count(o);
+  const auto nv = constraint_traits::count(v);
   if(!no or !nv) return;
 
   const size_t LDV2 = LDV * LDV;
   for(int ii = 0; ii < no; ++ii) {
     const auto i = fls(o);
-    o.flip(i);
+    o = constraint_traits::create_no_check(o, i);  // o.flip(i)
     auto v_cpy = v;
     for(int aa = 0; aa < nv; ++aa) {
       const auto a = fls(v_cpy);
-      v_cpy.flip(a);
+      v_cpy = constraint_traits::create_no_check(v_cpy, a);  // v_cpy.flip(a)
 
       const auto* V_ai = V + a + i * LDV;
       double sign_same = single_excitation_sign(det, a, i);
@@ -381,12 +386,17 @@ void generate_constraint_doubles_contributions_os(
           // double sign_othr = single_excitation_sign( os_det >> (N/2),  b, j
           // );
           double sign_othr =
-              single_excitation_sign(bitset_hi_word(os_det), b, j);
+              single_excitation_sign(wfn_traits::beta_string(det), b, j);
           double sign = sign_same * sign_othr;
 
           // Compute Excited Determinant
-          auto ex_det = det | os_det;
-          ex_det.flip(i).flip(a).flip(j + N / 2).flip(b + N / 2);
+          // auto ex_det = det | os_det;
+          // ex_det.flip(i).flip(a).flip(j + N / 2).flip(b + N / 2);
+          auto ex_det =
+              wfn_traits::template single_excitation_no_check<Spin::Alpha>(
+                  det, i, a);
+          ex_det = wfn_traits::template single_excitation_no_check<Spin::Beta>(
+              ex_det, j, b);
 
           // Finalize Matrix Element
           auto h_el = sign * V_aibj;
@@ -394,9 +404,9 @@ void generate_constraint_doubles_contributions_os(
           auto h_diag =
               ham_gen.fast_diag_os_double(eps_same[i], eps_othr[j], eps_same[a],
                                           eps_othr[b], i, j, a, b, root_diag);
-          h_el /= (E0 - h_diag);
+          // h_el /= (E0 - h_diag);
 
-          asci_contributions.push_back({ex_det, coeff * h_el});
+          asci_contributions.push_back({ex_det, coeff * h_el, E0 - h_diag});
         }  // BJ
 
     }  // A
@@ -482,43 +492,370 @@ auto dist_triplets_histogram(size_t norb, size_t ns_othr, size_t nd_othr,
 }
 #endif
 
-#ifdef MACIS_ENABLE_MPI
 template <size_t N>
+auto make_triplet(unsigned i, unsigned j, unsigned k) {
+  using wfn_type = wfn_t<N>;
+  using wfn_traits = wavefunction_traits<wfn_type>;
+  using constraint_type = alpha_constraint<wfn_traits>;
+  using string_type = typename constraint_type::constraint_type;
+
+  string_type C = 0;
+  C.flip(i).flip(j).flip(k);
+  string_type B = 1;
+  static_assert(B.size() <= 64, "ULLONG NOT POSSIBLE HERE");
+  B <<= k;
+  B = B.to_ullong() - 1;
+
+  return constraint_type(C, B, k);
+}
+
+#ifdef MACIS_ENABLE_MPI
+#if 0
+template <typename WfnType, typename ContainerType>
 auto dist_constraint_general(size_t nlevels, size_t norb, size_t ns_othr,
                              size_t nd_othr,
-                             const std::vector<wfn_t<N>>& unique_alpha,
+                             const ContainerType& unique_alpha,
                              MPI_Comm comm) {
+
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using constraint_type = alpha_constraint<wfn_traits>;
+  using string_type = typename constraint_type::constraint_type;
   auto world_rank = comm_rank(comm);
   auto world_size = comm_size(comm);
 
-  wfn_t<N> O = full_mask<N>(norb);
+  constexpr bool flat_container = std::is_same_v<
+      std::decay_t<WfnType>, 
+      std::decay_t<typename ContainerType::value_type>
+  >;
+
+  // Generate triplets + heuristic
+  std::vector<std::pair<constraint_type, size_t>> constraint_sizes;
+  constraint_sizes.reserve(norb * norb * norb);
+  size_t total_work = 0;
+  for(int t_i = 0; t_i < norb; ++t_i)
+    for(int t_j = 0; t_j < t_i; ++t_j)
+      for(int t_k = 0; t_k < t_j; ++t_k) {
+        auto constraint = constraint_type::make_triplet(t_i, t_j, t_k);
+
+        size_t nw = 0;
+        for(const auto& alpha : unique_alpha) {
+          if constexpr (flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, constraint);
+          else
+            nw += alpha.second * 
+              constraint_histogram(alpha.first, ns_othr, nd_othr, constraint);
+        }
+        if(nw) constraint_sizes.emplace_back(constraint, nw);
+        total_work += nw;
+      }
+
+  size_t local_average = (0.8 * total_work) / world_size;
+
+  for(size_t ilevel = 0; ilevel < nlevels; ++ilevel) {
+    // Select constraints larger than average to be broken apart
+    std::vector<std::pair<constraint_type, size_t>> tps_to_next;
+    {
+      auto it = std::partition(
+          constraint_sizes.begin(), constraint_sizes.end(),
+          [=](const auto& a) { return a.second <= local_average; });
+
+      // Remove constraints from full list
+      tps_to_next = decltype(tps_to_next)(it, constraint_sizes.end());
+      constraint_sizes.erase(it, constraint_sizes.end());
+      for(auto [t, s] : tps_to_next) total_work -= s;
+    }
+
+    if(!tps_to_next.size()) break;
+
+    // Break apart constraints
+    for(auto [c, nw_trip] : tps_to_next) {
+      const auto C_min = c.C_min();
+
+      // Loop over possible constraints with one more element
+      for(auto q_l = 0; q_l < C_min; ++q_l) {
+        // Generate masks / counts
+        string_type cn_C = c.C();
+        cn_C.flip(q_l);
+        string_type cn_B = c.B() >> (C_min - q_l);
+        constraint_type c_next(cn_C, cn_B, q_l);
+
+        size_t nw = 0;
+
+        for(const auto& alpha : unique_alpha) {
+          if constexpr (flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, c_next);
+          else
+            nw += alpha.second * 
+              constraint_histogram(alpha.first, ns_othr, nd_othr, c_next);
+        }
+        if(nw) constraint_sizes.emplace_back(c_next, nw);
+        total_work += nw;
+      }
+    }
+  }  // Recurse into constraints
+
+  // Sort to get optimal bucket partitioning
+  std::sort(constraint_sizes.begin(), constraint_sizes.end(),
+            [](const auto& a, const auto& b) { return a.second > b.second; });
+
 
   // Global workloads
   std::vector<size_t> workloads(world_size, 0);
 
+  // Assign work
+  std::vector<constraint_type> constraints;
+  constraints.reserve(constraint_sizes.size() / world_size);
+
+  for(auto [c, nw] : constraint_sizes) {
+    // Get rank with least amount of work
+    auto min_rank_it = std::min_element(workloads.begin(), workloads.end());
+    int min_rank = std::distance(workloads.begin(), min_rank_it);
+
+    // Assign constraint
+    *min_rank_it += nw;
+    if(world_rank == min_rank) {
+      constraints.emplace_back(c);
+    }
+  }
+
+  // if(world_rank == 0)
+  // printf("[rank %2d] AFTER LOCAL WORK = %lu TOTAL WORK = %lu\n", world_rank,
+  //   workloads[world_rank], total_work);
+
+  return constraints;
+}
+#else
+template <typename WfnType, typename ContainerType>
+auto gen_constraints_general(size_t nlevels, size_t norb, size_t ns_othr,
+                             size_t nd_othr, const ContainerType& unique_alpha,
+                             int world_size, size_t nlevel_min = 0,
+                             int64_t nrec_min = -1) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using constraint_type = alpha_constraint<wfn_traits>;
+  using string_type = typename constraint_type::constraint_type;
+
+  constexpr bool flat_container =
+      std::is_same_v<std::decay_t<WfnType>,
+                     std::decay_t<typename ContainerType::value_type>>;
+
   // Generate triplets + heuristic
-  std::vector<std::pair<wfn_constraint<N>, size_t>> constraint_sizes;
+  std::vector<std::pair<constraint_type, size_t>> constraint_sizes;
   constraint_sizes.reserve(norb * norb * norb);
+#if 0
   size_t total_work = 0;
   for(int t_i = 0; t_i < norb; ++t_i)
     for(int t_j = 0; t_j < t_i; ++t_j)
       for(int t_k = 0; t_k < t_j; ++t_k) {
-        auto constraint = make_triplet<N>(t_i, t_j, t_k);
-        const auto& [T, B, _] = constraint;
+        auto constraint = constraint_type::make_triplet(t_i, t_j, t_k);
 
         size_t nw = 0;
         for(const auto& alpha : unique_alpha) {
-          nw += constraint_histogram(alpha, ns_othr, nd_othr, T, O, B);
+          if constexpr(flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, constraint);
+          else
+            nw += alpha.second * constraint_histogram(alpha.first, ns_othr,
+                                                      nd_othr, constraint);
         }
         if(nw) constraint_sizes.emplace_back(constraint, nw);
         total_work += nw;
       }
 
-  size_t local_average = (0.6 * total_work) / world_size;
+  size_t local_average = (0.8 * total_work) / world_size;
+#else
+  // Generate all the triplets
+  for(int t_i = 0; t_i < norb; ++t_i)
+    for(int t_j = 0; t_j < t_i; ++t_j)
+      for(int t_k = 0; t_k < t_j; ++t_k) {
+        auto constraint = constraint_type::make_triplet(t_i, t_j, t_k);
+        constraint_sizes.emplace_back(constraint, 0ul);
+      }
+
+  // Build up higher-order constraints as base if requested
+  if(nrec_min < 0 or
+     nrec_min >= constraint_sizes.size())  // nrec_min < 0 implies that you want
+                                           // all the constraints upfront
+    for(size_t ilevel = 0; ilevel < nlevel_min; ++ilevel) {
+      decltype(constraint_sizes) cur_constraints;
+      cur_constraints.reserve(constraint_sizes.size() * norb);
+      for(auto [c, nw] : constraint_sizes) {
+        const auto C_min = c.C_min();
+        for(auto q_l = 0; q_l < C_min; ++q_l) {
+          // Generate masks / counts
+          string_type cn_C = c.C();
+          cn_C.flip(q_l);
+          string_type cn_B = c.B() >> (C_min - q_l);
+          constraint_type c_next(cn_C, cn_B, q_l);
+          cur_constraints.emplace_back(c_next, 0ul);
+        }
+      }
+      constraint_sizes = std::move(cur_constraints);
+    }
+
+  struct atomic_wrapper {
+    std::atomic<size_t> value;
+    atomic_wrapper(size_t i = 0) : value(i){};
+    atomic_wrapper(const atomic_wrapper& other)
+        : atomic_wrapper(other.value.load()){};
+    atomic_wrapper& operator=(const atomic_wrapper& other) {
+      value.store(other.value.load());
+      return *this;
+    }
+  };
+
+  // Compute histogram
+  const auto ntrip_full = constraint_sizes.size();
+  std::vector<atomic_wrapper> constraint_work(ntrip_full, 0ul);
+  {
+    global_atomic<size_t> nxtval(MPI_COMM_WORLD);
+#pragma omp parallel
+    {
+      size_t i_trip = 0;
+      while(i_trip < ntrip_full) {
+        i_trip = nxtval.fetch_and_add(1);
+        if(i_trip >= ntrip_full) break;
+        // if(!(i_trip%1000)) printf("cgen %lu / %lu\n", i_trip, ntrip_full);
+        auto& [constraint, __nw] = constraint_sizes[i_trip];
+        auto& c_nw = constraint_work[i_trip];
+        size_t nw = 0;
+        for(const auto& alpha : unique_alpha) {
+          if constexpr(flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, constraint);
+          else
+            nw += alpha.second * constraint_histogram(alpha.first, ns_othr,
+                                                      nd_othr, constraint);
+        }
+        if(nw) c_nw.value.fetch_add(nw);
+      }
+    }
+  }  // Scope nxtval
+
+  std::vector<size_t> constraint_work_bare(ntrip_full);
+  for(auto i_trip = 0; i_trip < ntrip_full; ++i_trip) {
+    constraint_work_bare[i_trip] = constraint_work[i_trip].value.load();
+  }
+  allreduce(constraint_work_bare.data(), ntrip_full, MPI_SUM, MPI_COMM_WORLD);
+
+  // Copy over constraint work
+  for(auto i_trip = 0; i_trip < ntrip_full; ++i_trip) {
+    constraint_sizes[i_trip].second = constraint_work_bare[i_trip];
+  }
+
+  // Remove zeros
+  {
+    auto it = std::partition(constraint_sizes.begin(), constraint_sizes.end(),
+                             [](const auto& p) { return p.second > 0; });
+    constraint_sizes.erase(it, constraint_sizes.end());
+  }
+
+  // Compute average
+  size_t total_work =
+      std::accumulate(constraint_sizes.begin(), constraint_sizes.end(), 0ul,
+                      [](auto s, const auto& p) { return s + p.second; });
+  size_t local_average = total_work / world_size;
+
+  // Manual refinement of top configurations
+  if(nrec_min > 0 and nrec_min < constraint_sizes.size()) {
+    const size_t nleave = constraint_sizes.size() - nrec_min;
+    std::vector<std::pair<constraint_type, size_t>> constraint_to_refine,
+        constraint_to_leave;
+    constraint_to_refine.reserve(nrec_min);
+    constraint_to_refine.reserve(nleave);
+
+    std::copy_n(constraint_sizes.begin(), nrec_min,
+                std::back_inserter(constraint_to_refine));
+    std::copy_n(constraint_sizes.begin() + nrec_min, nleave,
+                std::back_inserter(constraint_to_leave));
+
+    // Deallocate original array
+    decltype(constraint_sizes)().swap(constraint_sizes);
+
+    // Generate refined constraints
+    for(size_t ilevel = 0; ilevel < nlevel_min; ++ilevel) {
+      decltype(constraint_sizes) cur_constraints;
+      cur_constraints.reserve(constraint_to_refine.size() * norb);
+      for(auto [c, nw] : constraint_to_refine) {
+        const auto C_min = c.C_min();
+        for(auto q_l = 0; q_l < C_min; ++q_l) {
+          // Generate masks / counts
+          string_type cn_C = c.C();
+          cn_C.flip(q_l);
+          string_type cn_B = c.B() >> (C_min - q_l);
+          constraint_type c_next(cn_C, cn_B, q_l);
+          cur_constraints.emplace_back(c_next, 0ul);
+        }
+      }
+      constraint_to_refine = std::move(cur_constraints);
+    }
+
+    const size_t nrefine = constraint_to_refine.size();
+
+    global_atomic<size_t> nxtval(MPI_COMM_WORLD);
+    std::vector<atomic_wrapper>().swap(constraint_work);
+    std::vector<size_t>().swap(constraint_work_bare);
+    constraint_work.resize(nrefine, 0ul);
+#pragma omp parallel
+    {
+      size_t i_ref = 0;
+      while(i_ref < nrefine) {
+        i_ref = nxtval.fetch_and_add(1);
+        if(i_ref >= nrefine) break;
+        // if(!(i_ref%1000)) printf("cgen %lu / %lu\n", i_ref, nrefine);
+        auto& [constraint, __nw] = constraint_to_refine[i_ref];
+        auto& c_nw = constraint_work[i_ref];
+        size_t nw = 0;
+        for(const auto& alpha : unique_alpha) {
+          if constexpr(flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, constraint);
+          else
+            nw += alpha.second * constraint_histogram(alpha.first, ns_othr,
+                                                      nd_othr, constraint);
+        }
+        if(nw) c_nw.value.fetch_add(nw);
+      }  // constraint "loop"
+    }    // OpenMP Context
+
+    constraint_work_bare.resize(nrefine);
+    for(auto i_ref = 0; i_ref < nrefine; ++i_ref) {
+      constraint_work_bare[i_ref] = constraint_work[i_ref].value.load();
+    }
+    allreduce(constraint_work_bare.data(), nrefine, MPI_SUM, MPI_COMM_WORLD);
+
+    // Copy over constraint work
+    for(auto i_ref = 0; i_ref < nrefine; ++i_ref) {
+      constraint_to_refine[i_ref].second = constraint_work_bare[i_ref];
+    }
+
+    // Remove zeros
+    {
+      auto it = std::partition(constraint_to_refine.begin(),
+                               constraint_to_refine.end(),
+                               [](const auto& p) { return p.second > 0; });
+      constraint_to_refine.erase(it, constraint_to_refine.end());
+    }
+
+    // Concatenate the arrays
+    constraint_sizes.reserve(nrefine + nleave);
+    std::copy_n(constraint_to_refine.begin(), nrefine,
+                std::back_inserter(constraint_sizes));
+    std::copy_n(constraint_to_leave.begin(), nleave,
+                std::back_inserter(constraint_sizes));
+
+    size_t tmp =
+        std::accumulate(constraint_sizes.begin(), constraint_sizes.end(), 0ul,
+                        [](auto s, const auto& p) { return s + p.second; });
+    if(tmp != total_work) throw std::runtime_error("Incorrect Refinement");
+  }  // Selective refinement logic
+
+#endif
 
   for(size_t ilevel = 0; ilevel < nlevels; ++ilevel) {
     // Select constraints larger than average to be broken apart
-    std::vector<std::pair<wfn_constraint<N>, size_t>> tps_to_next;
+    std::vector<std::pair<constraint_type, size_t>> tps_to_next;
     {
       auto it = std::partition(
           constraint_sizes.begin(), constraint_sizes.end(),
@@ -534,21 +871,25 @@ auto dist_constraint_general(size_t nlevels, size_t norb, size_t ns_othr,
 
     // Break apart constraints
     for(auto [c, nw_trip] : tps_to_next) {
-      const auto C_min = c.C_min;
+      const auto C_min = c.C_min();
 
       // Loop over possible constraints with one more element
       for(auto q_l = 0; q_l < C_min; ++q_l) {
         // Generate masks / counts
-        wfn_constraint<N> c_next = c;
-        c_next.C.flip(q_l);
-        c_next.B >>= (C_min - q_l);
-        c_next.C_min = q_l;
+        string_type cn_C = c.C();
+        cn_C.flip(q_l);
+        string_type cn_B = c.B() >> (C_min - q_l);
+        constraint_type c_next(cn_C, cn_B, q_l);
 
         size_t nw = 0;
 
         for(const auto& alpha : unique_alpha) {
-          nw += constraint_histogram(alpha, ns_othr, nd_othr, c_next.C, O,
-                                     c_next.B);
+          if constexpr(flat_container)
+            nw += constraint_histogram(wfn_traits::alpha_string(alpha), ns_othr,
+                                       nd_othr, c_next);
+          else
+            nw += alpha.second *
+                  constraint_histogram(alpha.first, ns_othr, nd_othr, c_next);
         }
         if(nw) constraint_sizes.emplace_back(c_next, nw);
         total_work += nw;
@@ -556,30 +897,32 @@ auto dist_constraint_general(size_t nlevels, size_t norb, size_t ns_othr,
     }
   }  // Recurse into constraints
 
-  // if(!world_rank) {
-  //   const auto ntrip = std::count_if(constraint_sizes.begin(),
-  //     constraint_sizes.end(), [](auto &c){ return c.first.C.count() == 3; });
-  //   printf("[rank 0] NTRIP = %lu\n", ntrip);
-  //   if(nlevels > 0) {
-  //     const auto nquad = std::count_if(constraint_sizes.begin(),
-  //       constraint_sizes.end(), [](auto &c){ return c.first.C.count() == 4;
-  //       });
-  //     printf("[rank 0] NQUAD = %lu\n", nquad);
-  //   }
-  //   if(nlevels > 1) {
-  //     const auto nquint = std::count_if(constraint_sizes.begin(),
-  //       constraint_sizes.end(), [](auto &c){ return c.first.C.count() == 5;
-  //       });
-  //     printf("[rank 0] NQINT = %lu\n", nquint);
-  //   }
-  // }
-
   // Sort to get optimal bucket partitioning
   std::sort(constraint_sizes.begin(), constraint_sizes.end(),
             [](const auto& a, const auto& b) { return a.second > b.second; });
 
+  return constraint_sizes;
+}
+
+template <typename WfnType, typename ContainerType>
+auto dist_constraint_general(size_t nlevels, size_t norb, size_t ns_othr,
+                             size_t nd_othr, const ContainerType& unique_alpha,
+                             MPI_Comm comm) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using constraint_type = alpha_constraint<wfn_traits>;
+
+  auto world_rank = comm_rank(comm);
+  auto world_size = comm_size(comm);
+
+  // Generate constraints subject to expected workload
+  auto constraint_sizes = gen_constraints_general<WfnType>(
+      nlevels, norb, ns_othr, nd_othr, unique_alpha, world_size);
+
+  // Global workloads
+  std::vector<size_t> workloads(world_size, 0);
+
   // Assign work
-  std::vector<wfn_constraint<N>> constraints;
+  std::vector<constraint_type> constraints;
   constraints.reserve(constraint_sizes.size() / world_size);
 
   for(auto [c, nw] : constraint_sizes) {
@@ -601,6 +944,7 @@ auto dist_constraint_general(size_t nlevels, size_t norb, size_t ns_othr,
   return constraints;
 }
 #endif
+#endif
 
 #if 0
 template <typename Integral, size_t N>
diff --git a/include/macis/asci/pt2.hpp b/include/macis/asci/pt2.hpp
new file mode 100644
index 00000000..ab7ba4c3
--- /dev/null
+++ b/include/macis/asci/pt2.hpp
@@ -0,0 +1,529 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#pragma once
+#include <deque>
+#include <fstream>
+#include <macis/asci/determinant_contributions.hpp>
+#include <macis/asci/determinant_sort.hpp>
+#include <sstream>
+
+namespace macis {
+
+template <size_t N>
+double asci_pt2_constraint(ASCISettings asci_settings,
+                           wavefunction_iterator_t<N> cdets_begin,
+                           wavefunction_iterator_t<N> cdets_end,
+                           const double E_ASCI, const std::vector<double>& C,
+                           size_t norb, const double* T_pq, const double* G_red,
+                           const double* V_red, const double* G_pqrs,
+                           const double* V_pqrs,
+                           HamiltonianGenerator<wfn_t<N>>& ham_gen,
+                           MPI_Comm comm) {
+  using clock_type = std::chrono::high_resolution_clock;
+  using duration_type = std::chrono::duration<double, std::milli>;
+  using wfn_traits = wavefunction_traits<wfn_t<N>>;
+  using spin_wfn_type = spin_wfn_t<wfn_t<N>>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
+  using wfn_comp = typename wfn_traits::spin_comparator;
+  if(!std::is_sorted(cdets_begin, cdets_end, wfn_comp{}))
+    throw std::runtime_error("PT2 Only Works with Sorted Wfns");
+
+  auto world_rank = comm_rank(comm);
+  auto world_size = comm_size(comm);
+  auto logger = spdlog::get("asci_pt2");
+  if(!logger)
+    logger = world_rank ? spdlog::null_logger_mt("asci_pt2")
+                        : spdlog::stdout_color_mt("asci_pt2");
+
+  const size_t ncdets = std::distance(cdets_begin, cdets_end);
+  logger->info("[ASCI PT2 Settings]");
+  logger->info("  * NDETS                  = {}", ncdets);
+  logger->info("  * PT2_TOL                = {}", asci_settings.pt2_tol);
+  logger->info("  * PT2_RESERVE_COUNT      = {}",
+               asci_settings.pt2_reserve_count);
+  logger->info("  * PT2_CONSTRAINT_LVL_MAX = {}",
+               asci_settings.pt2_max_constraint_level);
+  logger->info("  * PT2_CONSTRAINT_LVL_MIN = {}",
+               asci_settings.pt2_min_constraint_level);
+  logger->info("  * PT2_CNSTRNT_RFNE_FORCE = {}",
+               asci_settings.pt2_constraint_refine_force);
+  logger->info("  * PT2_PRUNE              = {}", asci_settings.pt2_prune);
+  logger->info("  * PT2_PRECOMP_EPS        = {}",
+               asci_settings.pt2_precompute_eps);
+  logger->info("  * PT2_BIGCON_THRESH      = {}",
+               asci_settings.pt2_bigcon_thresh);
+  logger->info("  * NXTVAL_BCOUNT_THRESH   = {}",
+               asci_settings.nxtval_bcount_thresh);
+  logger->info("  * NXTVAL_BCOUNT_INC      = {}",
+               asci_settings.nxtval_bcount_inc);
+  logger->info("");
+
+  // For each unique alpha, create a list of beta string and store metadata
+  struct beta_coeff_data {
+    spin_wfn_type beta_string;
+    std::vector<uint8_t> occ_beta;
+    std::vector<uint8_t> vir_beta;
+    std::vector<double> orb_ens_alpha;
+    std::vector<double> orb_ens_beta;
+    double coeff;
+    double h_diag;
+
+    size_t mem() const {
+      return sizeof(spin_wfn_type) +
+             (occ_beta.capacity() + vir_beta.capacity()) * sizeof(uint8_t) +
+             (2 + orb_ens_alpha.capacity() + orb_ens_beta.capacity()) *
+                 sizeof(double);
+    }
+
+    beta_coeff_data(double c, size_t norb,
+                    const std::vector<uint32_t>& occ_alpha, wfn_t<N> w,
+                    const HamiltonianGenerator<wfn_t<N>>& ham_gen, bool pce,
+                    bool pci) {
+      coeff = c;
+
+      beta_string = wfn_traits::beta_string(w);
+
+      // Compute diagonal matrix element
+      h_diag = ham_gen.matrix_element(w, w);
+
+      // Compute occ/vir for beta string
+      std::vector<uint32_t> o_32, v_32;
+      if(pce or pci) {
+        spin_wfn_traits::state_to_occ_vir(norb, beta_string, o_32, v_32);
+        occ_beta.resize(o_32.size());
+        std::copy(o_32.begin(), o_32.end(), occ_beta.begin());
+        vir_beta.resize(v_32.size());
+        std::copy(v_32.begin(), v_32.end(), vir_beta.begin());
+      }
+
+      // Precompute orbital energies
+      if(pce) {
+        orb_ens_alpha = ham_gen.single_orbital_ens(norb, occ_alpha, o_32);
+        orb_ens_beta = ham_gen.single_orbital_ens(norb, o_32, occ_alpha);
+      }
+    }
+  };
+
+  auto uniq_alpha = get_unique_alpha(cdets_begin, cdets_end);
+  const size_t nuniq_alpha = uniq_alpha.size();
+  logger->info("  * NUNIQ_ALPHA = {}", nuniq_alpha);
+  std::vector<size_t> uniq_alpha_ioff(nuniq_alpha);
+  std::transform_exclusive_scan(
+      uniq_alpha.begin(), uniq_alpha.end(), uniq_alpha_ioff.begin(), 0ul,
+      std::plus<size_t>(), [](const auto& p) { return p.second; });
+
+  using unique_alpha_data = std::vector<beta_coeff_data>;
+  std::vector<unique_alpha_data> uad(nuniq_alpha);
+  for(auto i = 0, iw = 0; i < nuniq_alpha; ++i) {
+    std::vector<uint32_t> occ_alpha, vir_alpha;
+    spin_wfn_traits::state_to_occ_vir(norb, uniq_alpha[i].first, occ_alpha,
+                                      vir_alpha);
+
+    const auto nbeta = uniq_alpha[i].second;
+    uad[i].reserve(nbeta);
+    for(auto j = 0; j < nbeta; ++j, ++iw) {
+      const auto& w = *(cdets_begin + iw);
+      uad[i].emplace_back(C[iw], norb, occ_alpha, w, ham_gen,
+                          asci_settings.pt2_precompute_eps,
+                          asci_settings.pt2_precompute_idx);
+    }
+  }
+
+  if(world_rank == 0) {
+    constexpr double gib = 1024 * 1024 * 1024;
+    logger->info("MEM REQ DETS = {:.2e}", ncdets * sizeof(wfn_t<N>) / gib);
+    logger->info("MEM REQ C    = {:.2e}", ncdets * sizeof(double) / gib);
+    size_t mem_alpha = 0;
+    for(auto i = 0ul; i < nuniq_alpha; ++i) {
+      mem_alpha += sizeof(spin_wfn_type);
+      for(auto j = 0ul; j < uad[i].size(); ++j) {
+        mem_alpha += uad[i][j].mem();
+      }
+    }
+    logger->info("MEM REQ ALPH = {:.2e}", mem_alpha / gib);
+    logger->info(
+        "MEM REQ CONT = {:.2e}",
+        asci_settings.pt2_reserve_count * sizeof(asci_contrib<wfn_t<N>>) / gib);
+  }
+  MPI_Barrier(comm);
+
+  const auto n_occ_alpha = spin_wfn_traits::count(uniq_alpha[0].first);
+  const auto n_vir_alpha = norb - n_occ_alpha;
+  const auto n_sing_alpha = n_occ_alpha * n_vir_alpha;
+  const auto n_doub_alpha = (n_sing_alpha * (n_sing_alpha - norb + 1)) / 4;
+
+  const auto n_occ_beta = cdets_begin->count() - n_occ_alpha;
+  const auto n_vir_beta = norb - n_occ_beta;
+  const auto n_sing_beta = n_occ_beta * n_vir_beta;
+  const auto n_doub_beta = (n_sing_beta * (n_sing_beta - norb + 1)) / 4;
+
+  logger->info("  * NS = {} ND = {}", n_sing_alpha, n_doub_alpha);
+
+  auto gen_c_st = clock_type::now();
+  // auto constraints = dist_constraint_general<wfn_t<N>>(
+  //     5, norb, n_sing_beta, n_doub_beta, uniq_alpha, comm);
+  auto constraints = gen_constraints_general<wfn_t<N>>(
+      asci_settings.pt2_max_constraint_level, norb, n_sing_beta, n_doub_beta,
+      uniq_alpha, world_size * omp_get_max_threads(),
+      asci_settings.pt2_min_constraint_level,
+      asci_settings.pt2_constraint_refine_force);
+  auto gen_c_en = clock_type::now();
+  duration_type gen_c_dur = gen_c_en - gen_c_st;
+  logger->info("  * GEN_DUR = {:.2e} ms", gen_c_dur.count());
+  // if(!world_rank) {
+  //   std::ofstream c_file("constraint_work.txt");
+  //   std::stringstream ss;
+  //   for(auto [c,s] : constraints) {
+  //     ss << c.C() << " " << s << std::endl;
+  //   }
+  //   auto str = ss.str();
+  //   c_file.write(str.c_str(), str.size());
+  // }
+  // if(!world_rank) {
+  //   std::ofstream c_file("unique_alpha.txt");
+  //   std::stringstream ss;
+  //   for(size_t i = 0; i < nuniq_alpha; ++i) {
+  //     ss << uniq_alpha[i].first << " " << uniq_alpha[i].second << std::endl;
+  //   }
+  //   auto str = ss.str();
+  //   c_file.write(str.c_str(), str.size());
+  // }
+
+  double EPT2 = 0.0;
+  size_t NPT2 = 0;
+
+  const size_t ncon_total = constraints.size();
+  const size_t ncon_big = asci_settings.pt2_bigcon_thresh;
+  const size_t ncon_small = ncon_total - ncon_big;
+
+  // Global atomic task-id counter
+  global_atomic<size_t> nxtval_big(comm, 0);
+  global_atomic<size_t> nxtval_small(comm, ncon_big);
+  const double h_el_tol = asci_settings.pt2_tol;
+
+  auto pt2_st = clock_type::now();
+  // Assign each "big" constraint to an MPI rank, thread over contributions
+  {
+    size_t ic = 0;
+    while(ic < ncon_big) {
+      // Atomically get the next task ID and increment for other
+      // MPI ranks
+      ic = nxtval_big.fetch_and_add(1);
+      if(ic >= ncon_big) continue;
+      if(asci_settings.pt2_print_progress)
+        printf("[pt2_big rank %4d] %10lu / %10lu\n", world_rank, ic,
+               ncon_total);
+      const auto& con = constraints[ic].first;
+
+      asci_contrib_container<wfn_t<N>> asci_pairs_con;
+#pragma omp parallel
+      {
+        asci_contrib_container<wfn_t<N>> asci_pairs;
+#pragma omp for schedule(dynamic)
+        for(size_t i_alpha = 0; i_alpha < nuniq_alpha; ++i_alpha) {
+          const size_t old_pair_size = asci_pairs.size();
+          const auto& alpha_det = uniq_alpha[i_alpha].first;
+          const auto ncon_alpha = constraint_histogram(alpha_det, 1, 1, con);
+          if(!ncon_alpha) continue;
+          const auto occ_alpha = bits_to_indices(alpha_det);
+          const bool alpha_satisfies_con = satisfies_constraint(alpha_det, con);
+
+          const auto& bcd = uad[i_alpha];
+          const size_t nbeta = bcd.size();
+          for(size_t j_beta = 0; j_beta < nbeta; ++j_beta) {
+            const size_t iw = uniq_alpha_ioff[i_alpha] + j_beta;
+            const auto w = *(cdets_begin + iw);
+            const auto c = C[iw];
+            const auto& beta_det = bcd[j_beta].beta_string;
+            const auto h_diag = bcd[j_beta].h_diag;
+
+// TODO: These copies are slow
+#if 0
+          const auto& occ_beta_8 = bcd[j_beta].occ_beta;
+          const auto& vir_beta_8 = bcd[j_beta].vir_beta;
+          std::vector<uint32_t> occ_beta(occ_beta_8.size()), vir_beta(vir_beta_8.size());
+          std::copy(occ_beta_8.begin(), occ_beta_8.end(), occ_beta.begin());
+          std::copy(vir_beta_8.begin(), vir_beta_8.end(), vir_beta.begin());
+#else
+            std::vector<uint32_t> occ_beta, vir_beta;
+            spin_wfn_traits::state_to_occ_vir(norb, beta_det, occ_beta,
+                                              vir_beta);
+#endif
+
+            std::vector<double> orb_ens_alpha, orb_ens_beta;
+            if(asci_settings.pt2_precompute_eps) {
+              orb_ens_alpha = bcd[j_beta].orb_ens_alpha;
+              orb_ens_beta = bcd[j_beta].orb_ens_beta;
+            } else {
+              orb_ens_alpha =
+                  ham_gen.single_orbital_ens(norb, occ_alpha, occ_beta);
+              orb_ens_beta =
+                  ham_gen.single_orbital_ens(norb, occ_beta, occ_alpha);
+            }
+
+            // AA excitations
+            generate_constraint_singles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), T_pq,
+                norb, G_red, norb, V_red, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            // AAAA excitations
+            generate_constraint_doubles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), G_pqrs,
+                norb, h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+            // AABB excitations
+            generate_constraint_doubles_contributions_os(
+                c, w, con, occ_alpha, occ_beta, vir_beta, orb_ens_alpha.data(),
+                orb_ens_beta.data(), V_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            if(alpha_satisfies_con) {
+              // BB excitations
+              append_singles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), T_pq, norb, G_red, norb, V_red, norb,
+                  h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+              // BBBB excitations
+              append_ss_doubles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, alpha_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                  ham_gen, asci_pairs);
+
+              // No excitation (push inf to remove from list)
+              asci_pairs.push_back(
+                  {w, std::numeric_limits<double>::infinity(), 1.0});
+            }
+          }
+#if 0
+        if(asci_settings.pt2_prune and asci_pairs.size() > asci_settings.pt2_reserve_count and asci_pairs.size() != old_pair_size) {
+        // Cleanup
+        auto uit = stable_sort_and_accumulate_asci_pairs(asci_pairs.begin(),
+                                                  asci_pairs.end());
+        asci_pairs.erase(uit, asci_pairs.end());
+        //uit = std::stable_partition(asci_pairs.begin(), asci_pairs.end(), [&](const auto& p){ return std::abs(p.pt2()) > h_el_tol; });
+        //asci_pairs.erase(uit, asci_pairs.end());
+          printf("[pt2_prune rank %4d tid:%4d] IC = %lu / %lu IA = %lu / %lu SZ = %lu\n", world_rank,
+                 omp_get_thread_num(), ic, ncon_total, i_alpha,
+                 nuniq_alpha, asci_pairs.size());
+        }
+#endif
+
+        }  // Unique Alpha Loop
+
+        // S&A Thread local pairs
+        sort_and_accumulate_asci_pairs(asci_pairs);
+
+// Insert
+#pragma omp critical
+        {
+          if(asci_pairs_con.size()) {
+            asci_pairs_con.reserve(asci_pairs.size() + asci_pairs_con.size());
+            asci_pairs_con.insert(asci_pairs_con.end(), asci_pairs.begin(),
+                                  asci_pairs.end());
+          } else {
+            asci_pairs_con = std::move(asci_pairs);
+          }
+        }
+
+      }  // OpenMP
+
+      double EPT2_local = 0.0;
+      size_t NPT2_local = 0;
+      size_t pair_size = 0;
+      // Local S&A for each quad + update EPT2
+      {
+        auto uit = sort_and_accumulate_asci_pairs(asci_pairs_con.begin(),
+                                                  asci_pairs_con.end());
+        pair_size = std::distance(asci_pairs_con.begin(), uit);
+        for(auto it = asci_pairs_con.begin(); it != uit; ++it) {
+          if(!std::isinf(it->c_times_matel)) {
+            EPT2_local += it->pt2();
+            NPT2_local++;
+          }
+        }
+        asci_pairs_con.clear();
+        if(asci_settings.pt2_print_progress)
+          printf("[pt2_big rank %4d] CAPACITY %lu SZ %lu\n", world_rank,
+                 asci_pairs_con.capacity(), pair_size);
+      }
+
+      EPT2 += EPT2_local;
+      NPT2 += NPT2_local;
+    }  // Constraint "loop"
+  }    // "Big constraints"
+
+  // Parallelize over both MPI + threads for "small" constraints
+#pragma omp parallel reduction(+ : EPT2) reduction(+ : NPT2)
+  {
+    // Process ASCI pair contributions for each constraint
+    asci_contrib_container<wfn_t<N>> asci_pairs;
+    // asci_pairs.reserve(asci_settings.pt2_reserve_count);
+    size_t ic = 0;
+    while(ic < ncon_total) {
+      // Atomically get the next task ID and increment for other
+      // MPI ranks and threads
+      size_t ntake = ic < asci_settings.nxtval_bcount_thresh
+                         ? 1
+                         : asci_settings.nxtval_bcount_inc;
+      ic = nxtval_small.fetch_and_add(ntake);
+
+      // Loop over assigned tasks
+      const size_t c_end = std::min(ncon_total, ic + ntake);
+      for(; ic < c_end; ++ic) {
+        const auto& con = constraints[ic].first;
+        if(asci_settings.pt2_print_progress)
+          printf("[pt2_small rank %4d tid:%4d] %10lu / %10lu\n", world_rank,
+                 omp_get_thread_num(), ic, ncon_total);
+
+        for(size_t i_alpha = 0; i_alpha < nuniq_alpha; ++i_alpha) {
+          const size_t old_pair_size = asci_pairs.size();
+          const auto& alpha_det = uniq_alpha[i_alpha].first;
+          const auto ncon_alpha = constraint_histogram(alpha_det, 1, 1, con);
+          if(!ncon_alpha) continue;
+          const auto occ_alpha = bits_to_indices(alpha_det);
+          const bool alpha_satisfies_con = satisfies_constraint(alpha_det, con);
+
+          const auto& bcd = uad[i_alpha];
+          const size_t nbeta = bcd.size();
+          for(size_t j_beta = 0; j_beta < nbeta; ++j_beta) {
+            const size_t iw = uniq_alpha_ioff[i_alpha] + j_beta;
+            const auto w = *(cdets_begin + iw);
+            const auto c = C[iw];
+            const auto& beta_det = bcd[j_beta].beta_string;
+            const auto h_diag = bcd[j_beta].h_diag;
+
+// TODO: These copies are slow
+#if 0
+            const auto& occ_beta_8 = bcd[j_beta].occ_beta;
+            const auto& vir_beta_8 = bcd[j_beta].vir_beta;
+            std::vector<uint32_t> occ_beta(occ_beta_8.size()), vir_beta(vir_beta_8.size());
+            std::copy(occ_beta_8.begin(), occ_beta_8.end(), occ_beta.begin());
+            std::copy(vir_beta_8.begin(), vir_beta_8.end(), vir_beta.begin());
+#else
+            std::vector<uint32_t> occ_beta, vir_beta;
+            spin_wfn_traits::state_to_occ_vir(norb, beta_det, occ_beta,
+                                              vir_beta);
+#endif
+
+            std::vector<double> orb_ens_alpha, orb_ens_beta;
+            if(asci_settings.pt2_precompute_eps) {
+              orb_ens_alpha = bcd[j_beta].orb_ens_alpha;
+              orb_ens_beta = bcd[j_beta].orb_ens_beta;
+            } else {
+              orb_ens_alpha =
+                  ham_gen.single_orbital_ens(norb, occ_alpha, occ_beta);
+              orb_ens_beta =
+                  ham_gen.single_orbital_ens(norb, occ_beta, occ_alpha);
+            }
+
+            // AA excitations
+            generate_constraint_singles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), T_pq,
+                norb, G_red, norb, V_red, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            // AAAA excitations
+            generate_constraint_doubles_contributions_ss(
+                c, w, con, occ_alpha, occ_beta, orb_ens_alpha.data(), G_pqrs,
+                norb, h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+            // AABB excitations
+            generate_constraint_doubles_contributions_os(
+                c, w, con, occ_alpha, occ_beta, vir_beta, orb_ens_alpha.data(),
+                orb_ens_beta.data(), V_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                ham_gen, asci_pairs);
+
+            if(alpha_satisfies_con) {
+              // BB excitations
+              append_singles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), T_pq, norb, G_red, norb, V_red, norb,
+                  h_el_tol, h_diag, E_ASCI, ham_gen, asci_pairs);
+
+              // BBBB excitations
+              append_ss_doubles_asci_contributions<Spin::Beta>(
+                  c, w, beta_det, alpha_det, occ_beta, vir_beta, occ_alpha,
+                  orb_ens_beta.data(), G_pqrs, norb, h_el_tol, h_diag, E_ASCI,
+                  ham_gen, asci_pairs);
+
+              // No excitation (push inf to remove from list)
+              asci_pairs.push_back(
+                  {w, std::numeric_limits<double>::infinity(), 1.0});
+            }
+          }
+          if(asci_settings.pt2_prune and
+             asci_pairs.size() > asci_settings.pt2_reserve_count and
+             asci_pairs.size() != old_pair_size) {
+            // Cleanup
+            auto uit = stable_sort_and_accumulate_asci_pairs(asci_pairs.begin(),
+                                                             asci_pairs.end());
+            asci_pairs.erase(uit, asci_pairs.end());
+            // uit = std::stable_partition(asci_pairs.begin(), asci_pairs.end(),
+            // [&](const auto& p){ return std::abs(p.pt2()) > h_el_tol; });
+            // asci_pairs.erase(uit, asci_pairs.end());
+            if(asci_settings.pt2_print_progress)
+              printf(
+                  "[pt2_prune rank %4d tid:%4d] IC = %lu / %lu IA = %lu / %lu "
+                  "SZ = %lu\n",
+                  world_rank, omp_get_thread_num(), ic, ncon_total, i_alpha,
+                  nuniq_alpha, asci_pairs.size());
+
+            if(asci_pairs.size() > asci_settings.pt2_reserve_count) {
+              printf("* WARNING: PRUNED SIZE LARGER THAN RESERVE COUNT\n");
+            }
+          }
+
+        }  // Unique Alpha Loop
+
+        double EPT2_local = 0.0;
+        size_t NPT2_local = 0;
+        // Local S&A for each quad + update EPT2
+        {
+          auto uit = sort_and_accumulate_asci_pairs(asci_pairs.begin(),
+                                                    asci_pairs.end());
+          for(auto it = asci_pairs.begin(); it != uit; ++it) {
+            if(!std::isinf(it->c_times_matel)) {
+              EPT2_local += it->pt2();
+              NPT2_local++;
+            }
+          }
+          asci_pairs.clear();
+          // Deallocate
+          if(asci_pairs.capacity() > asci_settings.pt2_reserve_count)
+            asci_contrib_container<wfn_t<N>>().swap(asci_pairs);
+        }
+
+        EPT2 += EPT2_local;
+        NPT2 += NPT2_local;
+      }  // Loc constraint loop
+    }    // Constraint Loop
+  }      // OpenMP
+  auto pt2_en = clock_type::now();
+
+  EPT2 = allreduce(EPT2, MPI_SUM, comm);
+
+  double local_pt2_dur = duration_type(pt2_en - pt2_st).count();
+  if(world_size > 1) {
+    double total_dur = allreduce(local_pt2_dur, MPI_SUM, comm);
+    double min_dur = allreduce(local_pt2_dur, MPI_MIN, comm);
+    double max_dur = allreduce(local_pt2_dur, MPI_MAX, comm);
+    logger->info("* PT2_DUR MIN = {:.2e}, MAX = {:.2e}, AVG = {:.2e} ms",
+                 min_dur, max_dur, total_dur / world_size);
+  } else {
+    logger->info("* PT2_DUR = ${:.2e} ms", local_pt2_dur);
+  }
+
+  NPT2 = allreduce(NPT2, MPI_SUM, comm);
+  logger->info("* NPT2 = {}", NPT2);
+
+  return EPT2;
+}
+}  // namespace macis
diff --git a/include/macis/asci/refine.hpp b/include/macis/asci/refine.hpp
index 4a1f53e6..34701c5b 100644
--- a/include/macis/asci/refine.hpp
+++ b/include/macis/asci/refine.hpp
@@ -14,7 +14,7 @@ namespace macis {
 template <size_t N, typename index_t = int32_t>
 auto asci_refine(ASCISettings asci_settings, MCSCFSettings mcscf_settings,
                  double E0, std::vector<wfn_t<N>> wfn, std::vector<double> X,
-                 HamiltonianGenerator<N>& ham_gen,
+                 HamiltonianGenerator<wfn_t<N>>& ham_gen,
                  size_t norb MACIS_MPI_CODE(, MPI_Comm comm)) {
   auto logger = spdlog::get("asci_refine");
 #ifdef MACIS_ENABLE_MPI
diff --git a/include/macis/bitset_operations.hpp b/include/macis/bitset_operations.hpp
index e0f5ec27..66a56a59 100644
--- a/include/macis/bitset_operations.hpp
+++ b/include/macis/bitset_operations.hpp
@@ -86,7 +86,8 @@ template <size_t N>
 uint128_t to_uint128(std::bitset<N> bits) {
   static_assert(N <= 128, "N > 128");
   if constexpr(N == 128) {
-    auto _x = reinterpret_cast<uint128_t*>(&bits);
+    alignas(alignof(uint128_t)) std::bitset<N> cpy = bits;
+    auto _x = reinterpret_cast<uint128_t*>(&cpy);
     return *_x;
   } else {
     return fast_to_ullong(bits);
@@ -156,12 +157,15 @@ uint32_t ffs(std::bitset<N> bits) {
     return ffsl(fast_to_ulong(bits));
   else if constexpr(N <= 64)
     return ffsll(fast_to_ullong(bits));
-  else if constexpr(N <= 128) {
+  else if constexpr(N % 64 == 0) {
     auto as_words = reinterpret_cast<uint64_t*>(&bits);
-    if(as_words[0])
-      return ffsll(as_words[0]);
-    else
-      return ffsll(as_words[1]) + 64;
+    constexpr int n_words = N / 64;
+    int off = 0;
+    for(int i = 0; i < n_words; ++i) {
+      if(as_words[i]) return ffsll(as_words[i]) + off;
+      off += 64;
+    }
+    return 0;
   } else {
     uint32_t ind = 0;
     for(ind = 0; ind < N; ++ind)
@@ -187,17 +191,20 @@ uint32_t fls(std::bitset<N> bits) {
     return fls(fast_to_ulong(bits));
   else if constexpr(N <= 64)
     return fls(fast_to_ullong(bits));
-  else if constexpr(N <= 128) {
+  else if constexpr(N % 64 == 0) {
     auto as_words = reinterpret_cast<uint64_t*>(&bits);
-    if(as_words[1])
-      return fls(as_words[1]) + 64;
-    else
-      return fls(as_words[0]);
+    constexpr int n_words = N / 64;
+    int off = N - 64;
+    for(int i = n_words - 1; i >= 0; --i) {
+      if(as_words[i]) return fls(as_words[i]) + off;
+      off -= 64;
+    }
+    return UINT32_MAX;
   } else {
     uint32_t ind = 0;
     for(ind = N - 1; ind >= 0; ind--)
       if(bits[ind]) return ind;
-    return ind;
+    return UINT32_MAX;
   }
   abort();
 }
@@ -224,6 +231,7 @@ std::vector<uint32_t> bits_to_indices(std::bitset<N> bits) {
   return indices;
 }
 
+#if 0
 /// Truncate a bitset to one of smaller width
 template <size_t N, size_t M>
 inline std::bitset<N> truncate_bitset(std::bitset<M> bits) {
@@ -242,6 +250,7 @@ inline std::bitset<N> truncate_bitset(std::bitset<M> bits) {
     return trunc_bits;
   }
 }
+#endif
 
 /// Expand a bitset to one of larger width
 template <size_t N, size_t M>
@@ -264,24 +273,68 @@ inline std::bitset<N> expand_bitset(std::bitset<M> bits) {
 /// Extract to lo word of a bitset of even width
 template <size_t N>
 inline std::bitset<N / 2> bitset_lo_word(std::bitset<N> bits) {
-  static_assert(N == 128 or N == 64, "Not Supported");
-  if constexpr(N == 128) {
-    return std::bitset<64>(reinterpret_cast<uint64_t*>(&bits)[0]);
-  }
+  static_assert((N % 64 == 0) and (N == 64 or N / 2 % 64 == 0),
+                "Not Supported");
   if constexpr(N == 64) {
     return std::bitset<32>(reinterpret_cast<uint32_t*>(&bits)[0]);
+  } else {
+    std::bitset<N / 2> lo;
+    constexpr int nword = (N / 2) / 64;
+    auto lo_as_words = reinterpret_cast<uint64_t*>(&lo);
+    auto bi_as_words = reinterpret_cast<uint64_t*>(&bits);
+    for(int i = 0; i < nword; ++i) lo_as_words[i] = bi_as_words[i];
+    return lo;
+  }
+}
+
+template <size_t N>
+inline void set_bitset_lo_word(std::bitset<N>& bits, std::bitset<N / 2> word) {
+  static_assert((N % 64 == 0) and (N == 64 or N / 2 % 64 == 0),
+                "Not Supported");
+
+  if constexpr(N == 64) {
+    auto bi_as_words = reinterpret_cast<uint32_t*>(&bits);
+    auto wo_as_words = reinterpret_cast<uint32_t*>(&word);
+    bi_as_words[0] = wo_as_words[0];
+  } else {
+    constexpr int nword = (N / 2) / 64;
+    auto bi_as_words = reinterpret_cast<uint64_t*>(&bits);
+    auto wo_as_words = reinterpret_cast<uint64_t*>(&word);
+    for(int i = 0; i < nword; ++i) bi_as_words[i] = wo_as_words[i];
   }
 }
 
 /// Extract to hi word of a bitset of even width
 template <size_t N>
 inline std::bitset<N / 2> bitset_hi_word(std::bitset<N> bits) {
-  static_assert(N == 128 or N == 64, "Not Supported");
-  if constexpr(N == 128) {
-    return std::bitset<64>(reinterpret_cast<uint64_t*>(&bits)[1]);
-  }
+  static_assert((N % 64 == 0) and (N == 64 or N / 2 % 64 == 0),
+                "Not Supported");
   if constexpr(N == 64) {
     return std::bitset<32>(reinterpret_cast<uint32_t*>(&bits)[1]);
+  } else {
+    std::bitset<N / 2> hi;
+    constexpr int nword = (N / 2) / 64;
+    auto hi_as_words = reinterpret_cast<uint64_t*>(&hi);
+    auto bi_as_words = reinterpret_cast<uint64_t*>(&bits);
+    for(int i = 0; i < nword; ++i) hi_as_words[i] = bi_as_words[i + nword];
+    return hi;
+  }
+}
+
+template <size_t N>
+inline void set_bitset_hi_word(std::bitset<N>& bits, std::bitset<N / 2> word) {
+  static_assert((N % 64 == 0) and (N == 64 or N / 2 % 64 == 0),
+                "Not Supported");
+
+  if constexpr(N == 64) {
+    auto bi_as_words = reinterpret_cast<uint32_t*>(&bits);
+    auto wo_as_words = reinterpret_cast<uint32_t*>(&word);
+    bi_as_words[1] = wo_as_words[0];
+  } else {
+    constexpr int nword = (N / 2) / 64;
+    auto bi_as_words = reinterpret_cast<uint64_t*>(&bits);
+    auto wo_as_words = reinterpret_cast<uint64_t*>(&word);
+    for(int i = 0; i < nword; ++i) bi_as_words[i + nword] = wo_as_words[i];
   }
 }
 
@@ -293,9 +346,19 @@ bool bitset_less(std::bitset<N> x, std::bitset<N> y) {
   else if constexpr(N <= 64)
     return fast_to_ullong(x) < fast_to_ullong(y);
   else if constexpr(N == 128) {
-    auto _x = reinterpret_cast<uint128_t*>(&x);
-    auto _y = reinterpret_cast<uint128_t*>(&y);
-    return *_x < *_y;
+    auto _x = to_uint128(x);
+    auto _y = to_uint128(y);
+    return _x < _y;
+  } else if constexpr(N % 64 == 0) {
+    auto x_as_words = reinterpret_cast<uint64_t*>(&x);
+    auto y_as_words = reinterpret_cast<uint64_t*>(&y);
+    constexpr auto nwords = N / 64;
+    for(int i = nwords - 1; i >= 0; --i) {
+      const auto x_i = x_as_words[i];
+      const auto y_i = y_as_words[i];
+      if(x_i != y_i) return x_i < y_i;
+    }
+    return false;
   } else {
     for(int i = N - 1; i >= 0; i--) {
       if(x[i] ^ y[i]) return y[i];
diff --git a/include/macis/csr_hamiltonian.hpp b/include/macis/csr_hamiltonian.hpp
index 2a9dc24b..cea646cd 100644
--- a/include/macis/csr_hamiltonian.hpp
+++ b/include/macis/csr_hamiltonian.hpp
@@ -18,12 +18,12 @@
 #include <sparsexx/matrix_types/csr_matrix.hpp>
 namespace macis {
 
-// Base implementation of bitset CSR generation
-template <typename index_t, size_t N>
+// Base implementation of CSR hamiltonian generation
+template <typename index_t, typename WfnType, typename WfnIterator>
 sparsexx::csr_matrix<double, index_t> make_csr_hamiltonian_block(
-    wavefunction_iterator_t<N> bra_begin, wavefunction_iterator_t<N> bra_end,
-    wavefunction_iterator_t<N> ket_begin, wavefunction_iterator_t<N> ket_end,
-    HamiltonianGenerator<N>& ham_gen, double H_thresh) {
+    WfnIterator bra_begin, WfnIterator bra_end, WfnIterator ket_begin,
+    WfnIterator ket_end, HamiltonianGenerator<WfnType>& ham_gen,
+    double H_thresh) {
   size_t nbra = std::distance(bra_begin, bra_end);
   size_t nket = std::distance(ket_begin, ket_end);
 
@@ -35,21 +35,21 @@ sparsexx::csr_matrix<double, index_t> make_csr_hamiltonian_block(
   }
 }
 
-template <typename index_t, size_t N>
+template <typename index_t, typename WfnType, typename WfnIterator>
 sparsexx::csr_matrix<double, index_t> make_csr_hamiltonian(
-    wavefunction_iterator_t<N> sd_begin, wavefunction_iterator_t<N> sd_end,
-    HamiltonianGenerator<N>& ham_gen, double H_thresh) {
+    WfnIterator sd_begin, WfnIterator sd_end,
+    HamiltonianGenerator<WfnType>& ham_gen, double H_thresh) {
   return make_csr_hamiltonian_block<index_t>(sd_begin, sd_end, sd_begin, sd_end,
                                              ham_gen, H_thresh);
 }
 
 #ifdef MACIS_ENABLE_MPI
 // Base implementation of dist-CSR H construction for bitsets
-template <typename index_t, size_t N>
+template <typename index_t, typename WfnType, typename WfnIterator>
 sparsexx::dist_sparse_matrix<sparsexx::csr_matrix<double, index_t>>
-make_dist_csr_hamiltonian(MPI_Comm comm, wavefunction_iterator_t<N> sd_begin,
-                          wavefunction_iterator_t<N> sd_end,
-                          HamiltonianGenerator<N>& ham_gen,
+make_dist_csr_hamiltonian(MPI_Comm comm, WfnIterator sd_begin,
+                          WfnIterator sd_end,
+                          HamiltonianGenerator<WfnType>& ham_gen,
                           const double H_thresh) {
   using namespace sparsexx;
   using namespace sparsexx::detail;
@@ -69,7 +69,7 @@ make_dist_csr_hamiltonian(MPI_Comm comm, wavefunction_iterator_t<N> sd_begin,
 
   if(world_size > 1) {
     // Create a copy of SD's with local bra dets zero'd out
-    std::vector<std::bitset<N>> sds_offdiag(sd_begin, sd_end);
+    std::vector<WfnType> sds_offdiag(sd_begin, sd_end);
     for(auto i = bra_st; i < bra_en; ++i) sds_offdiag[i] = 0ul;
 
     // Build off-diagonal part
diff --git a/include/macis/hamiltonian_generator.hpp b/include/macis/hamiltonian_generator.hpp
index 9ef9583b..4c828ded 100644
--- a/include/macis/hamiltonian_generator.hpp
+++ b/include/macis/hamiltonian_generator.hpp
@@ -7,61 +7,19 @@
  */
 
 #pragma once
-#include <macis/bitset_operations.hpp>
+#include <macis/hamiltonian_generator/base.hpp>
 #include <macis/sd_operations.hpp>
-#include <macis/types.hpp>
-#include <sparsexx/matrix_types/csr_matrix.hpp>
 
 namespace macis {
 
-template <size_t N = 64>
-class HamiltonianGenerator {
-  static_assert(N % 2 == 0, "N Must Be Even");
-
+template <typename WfnType>
+class HamiltonianGenerator : public HamiltonianGeneratorBase<double> {
  public:
-  constexpr static size_t nbits = N;
-
-  using full_det_t = std::bitset<N>;
-  using spin_det_t = std::bitset<N / 2>;
-
-  template <typename index_t>
-  using sparse_matrix_type = sparsexx::csr_matrix<double, index_t>;
-
-  using full_det_iterator = wavefunction_iterator_t<N>;
-
-  using matrix_span_t = matrix_span<double>;
-  using rank3_span_t = rank3_span<double>;
-  using rank4_span_t = rank4_span<double>;
-
- public:
-  inline spin_det_t alpha_string(full_det_t str) { return bitset_lo_word(str); }
-  inline spin_det_t beta_string(full_det_t str) { return bitset_hi_word(str); }
-
-  size_t norb_;
-  size_t norb2_;
-  size_t norb3_;
-  matrix_span_t T_pq_;
-  rank4_span_t V_pqrs_;
-
-  // G(i,j,k,l) = (ij|kl) - (il|kj)
-  std::vector<double> G_pqrs_data_;
-  rank4_span_t G_pqrs_;
-
-  // G_red(i,j,k) = G(i,j,k,k)
-  std::vector<double> G_red_data_;
-  rank3_span_t G_red_;
-
-  // V_red(i,j,k) = (ij|kk)
-  std::vector<double> V_red_data_;
-  rank3_span_t V_red_;
-
-  // G2_red(i,j)  = 0.5 * G(i,i,j,j)
-  std::vector<double> G2_red_data_;
-  matrix_span_t G2_red_;
+  using full_det_t = WfnType;
+  using spin_det_t = spin_wfn_t<WfnType>;
 
-  // V2_red(i,j)  = (ii|jj)
-  std::vector<double> V2_red_data_;
-  matrix_span_t V2_red_;
+  using full_det_container = std::vector<WfnType>;
+  using full_det_iterator = typename full_det_container::iterator;
 
   virtual sparse_matrix_type<int32_t> make_csr_hamiltonian_block_32bit_(
       full_det_iterator, full_det_iterator, full_det_iterator,
@@ -72,16 +30,10 @@ class HamiltonianGenerator {
       full_det_iterator, double) = 0;
 
  public:
-  HamiltonianGenerator(matrix_span_t T, rank4_span_t V);
-  virtual ~HamiltonianGenerator() noexcept = default;
-
-  void generate_integral_intermediates(rank4_span_t V);
+  HamiltonianGenerator(matrix_span_t T, rank4_span_t V)
+      : HamiltonianGeneratorBase<double>(T, V){};
 
-  inline auto* T() const { return T_pq_.data_handle(); }
-  inline auto* G_red() const { return G_red_data_.data(); }
-  inline auto* V_red() const { return V_red_data_.data(); }
-  inline auto* G() const { return G_pqrs_data_.data(); }
-  inline auto* V() const { return V_pqrs_.data_handle(); }
+  virtual ~HamiltonianGenerator() noexcept = default;
 
   double matrix_element_4(spin_det_t bra, spin_det_t ket, spin_det_t ex) const;
   double matrix_element_22(spin_det_t bra_alpha, spin_det_t ket_alpha,
@@ -101,42 +53,6 @@ class HamiltonianGenerator {
                         const std::vector<uint32_t>& bra_occ_alpha,
                         const std::vector<uint32_t>& bra_occ_beta) const;
 
-  double single_orbital_en(uint32_t orb, const std::vector<uint32_t>& ss_occ,
-                           const std::vector<uint32_t>& os_occ) const;
-
-  std::vector<double> single_orbital_ens(
-      size_t norb, const std::vector<uint32_t>& ss_occ,
-      const std::vector<uint32_t>& os_occ) const;
-
-  double fast_diag_single(const std::vector<uint32_t>& ss_occ,
-                          const std::vector<uint32_t>& os_occ, uint32_t orb_hol,
-                          uint32_t orb_par, double orig_det_Hii) const;
-
-  double fast_diag_single(double hol_en, double par_en, uint32_t orb_hol,
-                          uint32_t orb_par, double orig_det_Hii) const;
-
-  double fast_diag_ss_double(double en_hol1, double en_hol2, double en_par1,
-                             double en_par2, uint32_t orb_hol1,
-                             uint32_t orb_hol2, uint32_t orb_par1,
-                             uint32_t orb_par2, double orig_det_Hii) const;
-
-  double fast_diag_ss_double(const std::vector<uint32_t>& ss_occ,
-                             const std::vector<uint32_t>& os_occ,
-                             uint32_t orb_hol1, uint32_t orb_hol2,
-                             uint32_t orb_par1, uint32_t orb_par2,
-                             double orig_det_Hii) const;
-
-  double fast_diag_os_double(double en_holu, double en_hold, double en_paru,
-                             double en_pard, uint32_t orb_holu,
-                             uint32_t orb_hold, uint32_t orb_paru,
-                             uint32_t orb_pard, double orig_det_Hii) const;
-
-  double fast_diag_os_double(const std::vector<uint32_t>& up_occ,
-                             const std::vector<uint32_t>& do_occ,
-                             uint32_t orb_holu, uint32_t orb_hold,
-                             uint32_t orb_paru, uint32_t orb_pard,
-                             double orig_det_Hii) const;
-
   double matrix_element(full_det_t bra, full_det_t ket) const;
 
   template <typename index_t>
@@ -180,11 +96,9 @@ class HamiltonianGenerator {
                          full_det_iterator, full_det_iterator, double* C,
                          matrix_span_t ordm, rank4_span_t trdm) = 0;
 
-  void rotate_hamiltonian_ordm(const double* ordm);
-
   virtual void SetJustSingles(bool /*_js*/) {}
   virtual bool GetJustSingles() { return false; }
-  virtual size_t GetNimp() const { return N / 2; }
+  // virtual size_t GetNimp() const { return N / 2; }
 };
 
 }  // namespace macis
diff --git a/include/macis/hamiltonian_generator/base.hpp b/include/macis/hamiltonian_generator/base.hpp
new file mode 100644
index 00000000..64dcb67f
--- /dev/null
+++ b/include/macis/hamiltonian_generator/base.hpp
@@ -0,0 +1,106 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#pragma once
+#include <macis/types.hpp>
+#include <sparsexx/matrix_types/csr_matrix.hpp>
+
+namespace macis {
+
+template <typename Scalar>
+class HamiltonianGeneratorBase {
+ protected:
+  template <typename index_t>
+  using sparse_matrix_type = sparsexx::csr_matrix<Scalar, index_t>;
+
+  using matrix_span_t = matrix_span<Scalar>;
+  using rank3_span_t = rank3_span<Scalar>;
+  using rank4_span_t = rank4_span<Scalar>;
+
+  size_t norb_;
+  size_t norb2_;
+  size_t norb3_;
+  matrix_span_t T_pq_;
+  rank4_span_t V_pqrs_;
+
+  // G(i,j,k,l) = (ij|kl) - (il|kj)
+  std::vector<Scalar> G_pqrs_data_;
+  rank4_span_t G_pqrs_;
+
+  // G_red(i,j,k) = G(i,j,k,k)
+  std::vector<Scalar> G_red_data_;
+  rank3_span_t G_red_;
+
+  // V_red(i,j,k) = (ij|kk)
+  std::vector<Scalar> V_red_data_;
+  rank3_span_t V_red_;
+
+  // G2_red(i,j)  = 0.5 * G(i,i,j,j)
+  std::vector<Scalar> G2_red_data_;
+  matrix_span_t G2_red_;
+
+  // V2_red(i,j)  = (ii|jj)
+  std::vector<Scalar> V2_red_data_;
+  matrix_span_t V2_red_;
+
+  void generate_integral_intermediates_(rank4_span_t V);
+
+ public:
+  HamiltonianGeneratorBase(matrix_span_t T, rank4_span_t V);
+  virtual ~HamiltonianGeneratorBase() noexcept = default;
+
+  inline auto* T() const { return T_pq_.data_handle(); }
+  inline auto* G_red() const { return G_red_data_.data(); }
+  inline auto* V_red() const { return V_red_data_.data(); }
+  inline auto* G() const { return G_pqrs_data_.data(); }
+  inline auto* V() const { return V_pqrs_.data_handle(); }
+
+  inline void generate_integral_intermediates() {
+    generate_integral_intermediates_(V_pqrs_);
+  }
+
+  double single_orbital_en(uint32_t orb, const std::vector<uint32_t>& ss_occ,
+                           const std::vector<uint32_t>& os_occ) const;
+
+  std::vector<double> single_orbital_ens(
+      size_t norb, const std::vector<uint32_t>& ss_occ,
+      const std::vector<uint32_t>& os_occ) const;
+
+  double fast_diag_single(const std::vector<uint32_t>& ss_occ,
+                          const std::vector<uint32_t>& os_occ, uint32_t orb_hol,
+                          uint32_t orb_par, double orig_det_Hii) const;
+
+  double fast_diag_single(double hol_en, double par_en, uint32_t orb_hol,
+                          uint32_t orb_par, double orig_det_Hii) const;
+
+  double fast_diag_ss_double(double en_hol1, double en_hol2, double en_par1,
+                             double en_par2, uint32_t orb_hol1,
+                             uint32_t orb_hol2, uint32_t orb_par1,
+                             uint32_t orb_par2, double orig_det_Hii) const;
+
+  double fast_diag_ss_double(const std::vector<uint32_t>& ss_occ,
+                             const std::vector<uint32_t>& os_occ,
+                             uint32_t orb_hol1, uint32_t orb_hol2,
+                             uint32_t orb_par1, uint32_t orb_par2,
+                             double orig_det_Hii) const;
+
+  double fast_diag_os_double(double en_holu, double en_hold, double en_paru,
+                             double en_pard, uint32_t orb_holu,
+                             uint32_t orb_hold, uint32_t orb_paru,
+                             uint32_t orb_pard, double orig_det_Hii) const;
+
+  double fast_diag_os_double(const std::vector<uint32_t>& up_occ,
+                             const std::vector<uint32_t>& do_occ,
+                             uint32_t orb_holu, uint32_t orb_hold,
+                             uint32_t orb_paru, uint32_t orb_pard,
+                             double orig_det_Hii) const;
+
+  void rotate_hamiltonian_ordm(const Scalar* ordm);
+};
+
+}  // namespace macis
diff --git a/include/macis/hamiltonian_generator/double_loop.hpp b/include/macis/hamiltonian_generator/double_loop.hpp
index 5628a5e6..fe2f85d0 100644
--- a/include/macis/hamiltonian_generator/double_loop.hpp
+++ b/include/macis/hamiltonian_generator/double_loop.hpp
@@ -13,10 +13,10 @@
 
 namespace macis {
 
-template <size_t N>
-class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
+template <typename WfnType>
+class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<WfnType> {
  public:
-  using base_type = HamiltonianGenerator<N>;
+  using base_type = HamiltonianGenerator<WfnType>;
   using full_det_t = typename base_type::full_det_t;
   using spin_det_t = typename base_type::spin_det_t;
   using full_det_iterator = typename base_type::full_det_iterator;
@@ -31,6 +31,7 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
   sparse_matrix_type<index_t> make_csr_hamiltonian_block_(
       full_det_iterator bra_begin, full_det_iterator bra_end,
       full_det_iterator ket_begin, full_det_iterator ket_end, double H_thresh) {
+    using wfn_traits = wavefunction_traits<WfnType>;
     const size_t nbra_dets = std::distance(bra_begin, bra_end);
     const size_t nket_dets = std::distance(ket_begin, ket_end);
 
@@ -47,10 +48,10 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
       const auto bra = *(bra_begin + i);
 
       size_t nrow = 0;
-      if(bra.count()) {
+      if(wfn_traits::count(bra)) {
         // Separate out into alpha/beta components
-        spin_det_t bra_alpha = bitset_lo_word(bra);
-        spin_det_t bra_beta = bitset_hi_word(bra);
+        spin_det_t bra_alpha = wfn_traits::alpha_string(bra);
+        spin_det_t bra_beta = wfn_traits::beta_string(bra);
 
         // Get occupied indices
         bits_to_indices(bra_alpha, bra_occ_alpha);
@@ -59,14 +60,14 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
         // Loop over ket determinants
         for(size_t j = 0; j < nket_dets; ++j) {
           const auto ket = *(ket_begin + j);
-          if(ket.count()) {
-            spin_det_t ket_alpha = bitset_lo_word(ket);
-            spin_det_t ket_beta = bitset_hi_word(ket);
+          if(wfn_traits::count(ket)) {
+            spin_det_t ket_alpha = wfn_traits::alpha_string(ket);
+            spin_det_t ket_beta = wfn_traits::beta_string(ket);
 
             full_det_t ex_total = bra ^ ket;
-            if(ex_total.count() <= 4) {
-              spin_det_t ex_alpha = bitset_lo_word(ex_total);
-              spin_det_t ex_beta = bitset_hi_word(ex_total);
+            if(wfn_traits::count(ex_total) <= 4) {
+              spin_det_t ex_alpha = wfn_traits::alpha_string(ex_total);
+              spin_det_t ex_beta = wfn_traits::beta_string(ex_total);
 
               // Compute Matrix Element
               const auto h_el = this->matrix_element(
@@ -117,6 +118,7 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
   void form_rdms(full_det_iterator bra_begin, full_det_iterator bra_end,
                  full_det_iterator ket_begin, full_det_iterator ket_end,
                  double *C, matrix_span_t ordm, rank4_span_t trdm) override {
+    using wfn_traits = wavefunction_traits<WfnType>;
     const size_t nbra_dets = std::distance(bra_begin, bra_end);
     const size_t nket_dets = std::distance(ket_begin, ket_end);
 
@@ -126,10 +128,10 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
     for(size_t i = 0; i < nbra_dets; ++i) {
       const auto bra = *(bra_begin + i);
       // if( (i%1000) == 0 ) std::cout << i  << std::endl;
-      if(bra.count()) {
+      if(wfn_traits::count(bra)) {
         // Separate out into alpha/beta components
-        spin_det_t bra_alpha = bitset_lo_word(bra);
-        spin_det_t bra_beta = bitset_hi_word(bra);
+        spin_det_t bra_alpha = wfn_traits::alpha_string(bra);
+        spin_det_t bra_beta = wfn_traits::beta_string(bra);
 
         // Get occupied indices
         bits_to_indices(bra_alpha, bra_occ_alpha);
@@ -138,14 +140,14 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
         // Loop over ket determinants
         for(size_t j = 0; j < nket_dets; ++j) {
           const auto ket = *(ket_begin + j);
-          if(ket.count()) {
-            spin_det_t ket_alpha = bitset_lo_word(ket);
-            spin_det_t ket_beta = bitset_hi_word(ket);
+          if(wfn_traits::count(ket)) {
+            spin_det_t ket_alpha = wfn_traits::alpha_string(ket);
+            spin_det_t ket_beta = wfn_traits::beta_string(ket);
 
             full_det_t ex_total = bra ^ ket;
-            if(ex_total.count() <= 4) {
-              spin_det_t ex_alpha = bitset_lo_word(ex_total);
-              spin_det_t ex_beta = bitset_hi_word(ex_total);
+            if(wfn_traits::count(ex_total) <= 4) {
+              spin_det_t ex_alpha = wfn_traits::alpha_string(ex_total);
+              spin_det_t ex_beta = wfn_traits::beta_string(ex_total);
 
               const double val = C[i] * C[j];
 
@@ -167,7 +169,7 @@ class DoubleLoopHamiltonianGenerator : public HamiltonianGenerator<N> {
  public:
   template <typename... Args>
   DoubleLoopHamiltonianGenerator(Args &&...args)
-      : HamiltonianGenerator<N>(std::forward<Args>(args)...) {}
+      : HamiltonianGenerator<WfnType>(std::forward<Args>(args)...) {}
 };
 
 }  // namespace macis
diff --git a/include/macis/hamiltonian_generator/impl.hpp b/include/macis/hamiltonian_generator/impl.hpp
index 2ead987b..d4d8e964 100644
--- a/include/macis/hamiltonian_generator/impl.hpp
+++ b/include/macis/hamiltonian_generator/impl.hpp
@@ -7,70 +7,5 @@
  */
 
 #pragma once
-#include <macis/hamiltonian_generator.hpp>
 
-namespace macis {
-
-template <size_t N>
-HamiltonianGenerator<N>::HamiltonianGenerator(matrix_span<double> T,
-                                              rank4_span_t V)
-    : norb_(T.extent(0)),
-      norb2_(norb_ * norb_),
-      norb3_(norb2_ * norb_),
-      T_pq_(T),
-      V_pqrs_(V) {
-  generate_integral_intermediates(V_pqrs_);
-}
-
-template <size_t N>
-void HamiltonianGenerator<N>::generate_integral_intermediates(rank4_span_t V) {
-  if(V.extent(0) != norb_ or V.extent(1) != norb_ or V.extent(2) != norb_ or
-     V.extent(3) != norb_)
-    throw std::runtime_error("V has incorrect dimensions");
-
-  size_t no = norb_;
-  size_t no2 = no * no;
-  size_t no3 = no2 * no;
-  size_t no4 = no3 * no;
-
-  // G(i,j,k,l) = V(i,j,k,l) - V(i,l,k,j)
-  G_pqrs_data_ = std::vector<double>(begin(V), end(V));
-  G_pqrs_ = rank4_span_t(G_pqrs_data_.data(), no, no, no, no);
-  for(auto i = 0ul; i < no; ++i)
-    for(auto j = 0ul; j < no; ++j)
-      for(auto k = 0ul; k < no; ++k)
-        for(auto l = 0ul; l < no; ++l) {
-          G_pqrs_(i, j, k, l) -= V(i, l, k, j);
-        }
-
-  // G_red(i,j,k) = G(i,j,k,k) = G(k,k,i,j)
-  // V_red(i,j,k) = V(i,j,k,k) = V(k,k,i,j)
-  G_red_data_.resize(no3);
-  V_red_data_.resize(no3);
-  G_red_ = rank3_span_t(G_red_data_.data(), no, no, no);
-  V_red_ = rank3_span_t(V_red_data_.data(), no, no, no);
-  for(auto j = 0ul; j < no; ++j)
-    for(auto i = 0ul; i < no; ++i)
-      for(auto k = 0ul; k < no; ++k) {
-        G_red_(k, i, j) = G_pqrs_(k, k, i, j);
-        V_red_(k, i, j) = V(k, k, i, j);
-      }
-
-  // G2_red(i,j) = 0.5 * G(i,i,j,j)
-  // V2_red(i,j) = V(i,i,j,j)
-  G2_red_data_.resize(no2);
-  V2_red_data_.resize(no2);
-  G2_red_ = matrix_span<double>(G2_red_data_.data(), no, no);
-  V2_red_ = matrix_span<double>(V2_red_data_.data(), no, no);
-  for(auto j = 0ul; j < no; ++j)
-    for(auto i = 0ul; i < no; ++i) {
-      G2_red_(i, j) = 0.5 * G_pqrs_(i, i, j, j);
-      V2_red_(i, j) = V(i, i, j, j);
-    }
-}
-
-}  // namespace macis
-
-#include <macis/hamiltonian_generator/fast_diagonals.hpp>
 #include <macis/hamiltonian_generator/matrix_elements.hpp>
-#include <macis/hamiltonian_generator/rdms.hpp>
diff --git a/include/macis/hamiltonian_generator/matrix_elements.hpp b/include/macis/hamiltonian_generator/matrix_elements.hpp
index 7c232a3f..36c80b9d 100644
--- a/include/macis/hamiltonian_generator/matrix_elements.hpp
+++ b/include/macis/hamiltonian_generator/matrix_elements.hpp
@@ -11,13 +11,14 @@
 
 namespace macis {
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element(full_det_t bra,
-                                               full_det_t ket) const {
-  auto bra_alpha = bitset_lo_word(bra);
-  auto ket_alpha = bitset_lo_word(ket);
-  auto bra_beta = bitset_hi_word(bra);
-  auto ket_beta = bitset_hi_word(ket);
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element(full_det_t bra,
+                                                     full_det_t ket) const {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  auto bra_alpha = wfn_traits::alpha_string(bra);
+  auto ket_alpha = wfn_traits::alpha_string(ket);
+  auto bra_beta = wfn_traits::beta_string(bra);
+  auto ket_beta = wfn_traits::beta_string(ket);
 
   auto ex_alpha = bra_alpha ^ ket_alpha;
   auto ex_beta = bra_beta ^ ket_beta;
@@ -29,14 +30,15 @@ double HamiltonianGenerator<N>::matrix_element(full_det_t bra,
                         ex_beta, bra_occ_alpha, bra_occ_beta);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element(
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element(
     spin_det_t bra_alpha, spin_det_t ket_alpha, spin_det_t ex_alpha,
     spin_det_t bra_beta, spin_det_t ket_beta, spin_det_t ex_beta,
     const std::vector<uint32_t>& bra_occ_alpha,
     const std::vector<uint32_t>& bra_occ_beta) const {
-  const uint32_t ex_alpha_count = ex_alpha.count();
-  const uint32_t ex_beta_count = ex_beta.count();
+  using spin_wfn_traits = wavefunction_traits<spin_det_t>;
+  const uint32_t ex_alpha_count = spin_wfn_traits::count(ex_alpha);
+  const uint32_t ex_beta_count = spin_wfn_traits::count(ex_beta);
 
   if((ex_alpha_count + ex_beta_count) > 4) return 0.;
 
@@ -62,16 +64,17 @@ double HamiltonianGenerator<N>::matrix_element(
     return matrix_element_diag(bra_occ_alpha, bra_occ_beta);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element_4(spin_det_t bra, spin_det_t ket,
-                                                 spin_det_t ex) const {
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element_4(spin_det_t bra,
+                                                       spin_det_t ket,
+                                                       spin_det_t ex) const {
   auto [o1, v1, o2, v2, sign] = doubles_sign_indices(bra, ket, ex);
 
   return sign * G_pqrs_(v1, o1, v2, o2);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element_22(
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element_22(
     spin_det_t bra_alpha, spin_det_t ket_alpha, spin_det_t ex_alpha,
     spin_det_t bra_beta, spin_det_t ket_beta, spin_det_t ex_beta) const {
   auto [o1, v1, sign_a] =
@@ -83,8 +86,8 @@ double HamiltonianGenerator<N>::matrix_element_22(
   return sign * V_pqrs_(v1, o1, v2, o2);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element_2(
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element_2(
     spin_det_t bra, spin_det_t ket, spin_det_t ex,
     const std::vector<uint32_t>& bra_occ_alpha,
     const std::vector<uint32_t>& bra_occ_beta) const {
@@ -105,8 +108,8 @@ double HamiltonianGenerator<N>::matrix_element_2(
   return sign * h_el;
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::matrix_element_diag(
+template <typename WfnType>
+double HamiltonianGenerator<WfnType>::matrix_element_diag(
     const std::vector<uint32_t>& occ_alpha,
     const std::vector<uint32_t>& occ_beta) const {
   double h_el = 0;
diff --git a/include/macis/hamiltonian_generator/sorted_double_loop.hpp b/include/macis/hamiltonian_generator/sorted_double_loop.hpp
new file mode 100644
index 00000000..af8760dd
--- /dev/null
+++ b/include/macis/hamiltonian_generator/sorted_double_loop.hpp
@@ -0,0 +1,326 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#pragma once
+#include <chrono>
+#include <macis/hamiltonian_generator.hpp>
+#include <macis/sd_operations.hpp>
+#include <macis/util/rdms.hpp>
+
+namespace macis {
+
+template <typename WfnType>
+class SortedDoubleLoopHamiltonianGenerator
+    : public HamiltonianGenerator<WfnType> {
+ public:
+  using base_type = HamiltonianGenerator<WfnType>;
+  using full_det_t = typename base_type::full_det_t;
+  using spin_det_t = typename base_type::spin_det_t;
+  using full_det_iterator = typename base_type::full_det_iterator;
+  using matrix_span_t = typename base_type::matrix_span_t;
+  using rank4_span_t = typename base_type::rank4_span_t;
+
+  template <typename index_t>
+  using sparse_matrix_type = sparsexx::csr_matrix<double, index_t>;
+
+ protected:
+  template <typename index_t>
+  sparse_matrix_type<index_t> make_csr_hamiltonian_block_(
+      full_det_iterator bra_begin, full_det_iterator bra_end,
+      full_det_iterator ket_begin, full_det_iterator ket_end, double H_thresh) {
+    using wfn_traits = wavefunction_traits<WfnType>;
+    using spin_wfn_type = typename wfn_traits::spin_wfn_type;
+    using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
+    const size_t nbra_dets = std::distance(bra_begin, bra_end);
+    const size_t nket_dets = std::distance(ket_begin, ket_end);
+
+    const bool is_symm = bra_begin == ket_begin and bra_end == ket_end;
+    auto world_rank = comm_rank(MPI_COMM_WORLD);
+
+    // Get unique alpha strings
+    auto setup_st = std::chrono::high_resolution_clock::now();
+    auto unique_alpha_bra = get_unique_alpha(bra_begin, bra_end);
+    auto unique_alpha_ket =
+        is_symm ? unique_alpha_bra : get_unique_alpha(ket_begin, ket_end);
+
+    const size_t nuniq_bra = unique_alpha_bra.size();
+    const size_t nuniq_ket = unique_alpha_ket.size();
+
+    // Compute offsets
+    std::vector<size_t> unique_alpha_bra_idx(nuniq_bra + 1);
+    std::transform_exclusive_scan(
+        unique_alpha_bra.begin(), unique_alpha_bra.end(),
+        unique_alpha_bra_idx.begin(), 0ul, std::plus<size_t>{},
+        [](auto& x) { return x.second; });
+    std::vector<size_t> unique_alpha_ket_idx(nuniq_ket + 1);
+    if(is_symm) {
+      unique_alpha_ket_idx = unique_alpha_bra_idx;
+    } else {
+      std::transform_exclusive_scan(
+          unique_alpha_ket.begin(), unique_alpha_ket.end(),
+          unique_alpha_ket_idx.begin(), 0ul, std::plus<size_t>{},
+          [](auto& x) { return x.second; });
+    }
+
+    unique_alpha_bra_idx.back() = nbra_dets;
+    unique_alpha_ket_idx.back() = nket_dets;
+    auto setup_en = std::chrono::high_resolution_clock::now();
+
+    // std::cout << "AVERAGE NBETA = " <<
+    //   std::accumulate(unique_alpha_bra.begin(), unique_alpha_bra.end(),
+    //     0ul, [](auto a, auto b){ return a + b.second; }) / double(nuniq_bra)
+    //     << std::endl;
+
+    // Populate COO matrix locally
+    // sparsexx::coo_matrix<double, index_t> coo_mat(nbra_dets, nket_dets, 0,
+    // 0);
+    std::vector<index_t> row_ind, col_ind;
+    std::vector<double> nz_val;
+
+    // size_t skip1 = 0;
+    // size_t skip2 = 0;
+
+    auto fast_insert = [](auto& old_vec, auto&& new_vec) {
+      if(old_vec.size() == 0)
+        old_vec = std::move(new_vec);
+      else {
+        old_vec.reserve(old_vec.size() + new_vec.size());
+        old_vec.insert(old_vec.end(), new_vec.begin(), new_vec.end());
+      }
+    };
+    // Loop over uniq alphas in bra/ket
+    auto pop_st = std::chrono::high_resolution_clock::now();
+#pragma omp parallel
+    {
+      std::vector<index_t> row_ind_loc, col_ind_loc;
+      std::vector<double> nz_val_loc;
+      std::vector<uint32_t> bra_occ_alpha, bra_occ_beta;
+#pragma omp for schedule(dynamic)
+      for(size_t ia_bra = 0; ia_bra < nuniq_bra; ++ia_bra) {
+        if(unique_alpha_bra[ia_bra].first.any()) {
+          if(!(ia_bra % 100))
+            printf("[ham_gen rank %d] IA_BRA = %lu / %lu\n", world_rank, ia_bra,
+                   nuniq_bra);
+          // Extract alpha bra
+          const auto bra_alpha = unique_alpha_bra[ia_bra].first;
+          const size_t beta_st_bra = unique_alpha_bra_idx[ia_bra];
+          const size_t beta_en_bra = unique_alpha_bra_idx[ia_bra + 1];
+          spin_wfn_traits::state_to_occ(bra_alpha, bra_occ_alpha);
+
+          const auto ket_lower = is_symm ? ia_bra : 0;
+          for(size_t ia_ket = ket_lower; ia_ket < nuniq_ket; ++ia_ket) {
+            if(unique_alpha_ket[ia_ket].first.any()) {
+              // Extract alpha ket
+              const auto ket_alpha = unique_alpha_ket[ia_ket].first;
+
+              // Compute alpha excitation
+              const auto ex_alpha = bra_alpha ^ ket_alpha;
+              const auto ex_alpha_count = spin_wfn_traits::count(ex_alpha);
+
+              // Early exit
+              if(ex_alpha_count > 4) {
+                // skip1++;
+                continue;
+              }
+
+              // Precompute all-alpha excitation if it will be used
+              const double mat_el_4_alpha =
+                  (ex_alpha_count == 4)
+                      ? this->matrix_element_4(bra_alpha, ket_alpha, ex_alpha)
+                      : 0.0;
+              if(ex_alpha_count == 4 and std::abs(mat_el_4_alpha) < H_thresh) {
+                // The only possible matrix element is too-small, skip everyhing
+                // skip2++;
+                continue;
+              }
+
+              const size_t beta_st_ket = unique_alpha_ket_idx[ia_ket];
+              const size_t beta_en_ket = unique_alpha_ket_idx[ia_ket + 1];
+
+              // Loop over local betas according to their global indices
+              for(size_t ibra = beta_st_bra; ibra < beta_en_bra; ++ibra) {
+                const auto bra_beta =
+                    wfn_traits::beta_string(*(bra_begin + ibra));
+                spin_wfn_traits::state_to_occ(bra_beta, bra_occ_beta);
+                for(size_t iket = beta_st_ket; iket < beta_en_ket; ++iket) {
+                  if(is_symm and (iket < ibra)) continue;
+                  const auto ket_beta =
+                      wfn_traits::beta_string(*(ket_begin + iket));
+
+                  // Compute beta excitation
+                  const auto ex_beta = bra_beta ^ ket_beta;
+                  const auto ex_beta_count = spin_wfn_traits::count(ex_beta);
+
+                  if((ex_alpha_count + ex_beta_count) > 4) continue;
+
+                  double h_el = 0.0;
+                  if(ex_alpha_count == 4) {
+                    // Use precomputed value
+                    h_el = mat_el_4_alpha;
+                  } else if(ex_beta_count == 4) {
+                    h_el = this->matrix_element_4(bra_beta, ket_beta, ex_beta);
+                  } else if(ex_alpha_count == 2) {
+                    if(ex_beta_count == 2) {
+                      h_el = this->matrix_element_22(bra_alpha, ket_alpha,
+                                                     ex_alpha, bra_beta,
+                                                     ket_beta, ex_beta);
+                    } else {
+                      h_el =
+                          this->matrix_element_2(bra_alpha, ket_alpha, ex_alpha,
+                                                 bra_occ_alpha, bra_occ_beta);
+                    }
+                  } else if(ex_beta_count == 2) {
+                    h_el = this->matrix_element_2(bra_beta, ket_beta, ex_beta,
+                                                  bra_occ_beta, bra_occ_alpha);
+                  } else {
+                    // Diagonal matrix element
+                    h_el =
+                        this->matrix_element_diag(bra_occ_alpha, bra_occ_beta);
+                  }
+
+                  // Insert matrix element
+                  if(std::abs(h_el) > H_thresh) {
+                    // coo_mat.template insert<false>(ibra, iket, h_el);
+                    row_ind_loc.emplace_back(ibra);
+                    col_ind_loc.emplace_back(iket);
+                    nz_val_loc.emplace_back(h_el);
+                    if(is_symm and ibra != iket) {
+                      // coo_mat.template insert<false>(iket, ibra, h_el);
+                      row_ind_loc.emplace_back(iket);
+                      col_ind_loc.emplace_back(ibra);
+                      nz_val_loc.emplace_back(h_el);
+                    }
+                  }
+
+                }  // ket beta
+              }    // bra beta
+            }
+          }  // Loop over ket alphas
+        }
+      }  // Loop over bra alphas
+
+// Atomically insert into larger matrix arrays
+#pragma omp critical
+      {
+#if 0
+        row_ind.insert(row_ind.end(), row_ind_loc.begin(), row_ind_loc.end());
+        // row_ind_loc.clear(); row_ind_loc.shrink_to_fit();
+        col_ind.insert(col_ind.end(), col_ind_loc.begin(), col_ind_loc.end());
+        // col_ind_loc.clear(); col_ind_loc.shrink_to_fit();
+        nz_val.insert(nz_val.end(), nz_val_loc.begin(), nz_val_loc.end());
+        // nz_val_loc.clear(); nz_val_loc.shrink_to_fit();
+#else
+        fast_insert(row_ind, row_ind_loc);
+        fast_insert(col_ind, col_ind_loc);
+        fast_insert(nz_val, nz_val_loc);
+#endif
+      }
+
+    }  // OpenMP
+    auto pop_en = std::chrono::high_resolution_clock::now();
+
+    // Generate Sparse Matrix
+    sparsexx::coo_matrix<double, index_t> coo_mat(
+        nbra_dets, nket_dets, std::move(col_ind), std::move(row_ind),
+        std::move(nz_val), 0);
+
+    // Sort for CSR Conversion
+    auto sort_st = std::chrono::high_resolution_clock::now();
+    printf("[ham_gen rank %d] BEFORE SORT\n", world_rank);
+    coo_mat.sort_by_row_index();
+    auto sort_en = std::chrono::high_resolution_clock::now();
+
+    auto conv_st = std::chrono::high_resolution_clock::now();
+    printf("[ham_gen rank %d] BEFORE CONV\n", world_rank);
+    sparse_matrix_type<index_t> csr_mat(coo_mat);  // Convert to CSR Matrix
+    auto conv_en = std::chrono::high_resolution_clock::now();
+
+    printf("Setup %.2e Pop %.2e Sort %.2e Conv %.2e\n",
+           std::chrono::duration<double>(setup_en - setup_st).count(),
+           std::chrono::duration<double>(pop_en - pop_st).count(),
+           std::chrono::duration<double>(sort_en - sort_st).count(),
+           std::chrono::duration<double>(conv_en - conv_st).count());
+
+    return csr_mat;
+  }
+
+  sparse_matrix_type<int32_t> make_csr_hamiltonian_block_32bit_(
+      full_det_iterator bra_begin, full_det_iterator bra_end,
+      full_det_iterator ket_begin, full_det_iterator ket_end,
+      double H_thresh) override {
+    return make_csr_hamiltonian_block_<int32_t>(bra_begin, bra_end, ket_begin,
+                                                ket_end, H_thresh);
+  }
+
+  sparse_matrix_type<int64_t> make_csr_hamiltonian_block_64bit_(
+      full_det_iterator bra_begin, full_det_iterator bra_end,
+      full_det_iterator ket_begin, full_det_iterator ket_end,
+      double H_thresh) override {
+    return make_csr_hamiltonian_block_<int64_t>(bra_begin, bra_end, ket_begin,
+                                                ket_end, H_thresh);
+  }
+
+ public:
+  void form_rdms(full_det_iterator bra_begin, full_det_iterator bra_end,
+                 full_det_iterator ket_begin, full_det_iterator ket_end,
+                 double* C, matrix_span_t ordm, rank4_span_t trdm) override {
+    using wfn_traits = wavefunction_traits<WfnType>;
+    const size_t nbra_dets = std::distance(bra_begin, bra_end);
+    const size_t nket_dets = std::distance(ket_begin, ket_end);
+
+    std::vector<uint32_t> bra_occ_alpha, bra_occ_beta;
+
+    // Loop over bra determinants
+    for(size_t i = 0; i < nbra_dets; ++i) {
+      const auto bra = *(bra_begin + i);
+      // if( (i%1000) == 0 ) std::cout << i  << std::endl;
+      if(wfn_traits::count(bra)) {
+        // Separate out into alpha/beta components
+        spin_det_t bra_alpha = wfn_traits::alpha_string(bra);
+        spin_det_t bra_beta = wfn_traits::beta_string(bra);
+
+        // Get occupied indices
+        bits_to_indices(bra_alpha, bra_occ_alpha);
+        bits_to_indices(bra_beta, bra_occ_beta);
+
+        // Loop over ket determinants
+        for(size_t j = 0; j < nket_dets; ++j) {
+          const auto ket = *(ket_begin + j);
+          if(wfn_traits::count(ket)) {
+            spin_det_t ket_alpha = wfn_traits::alpha_string(ket);
+            spin_det_t ket_beta = wfn_traits::beta_string(ket);
+
+            full_det_t ex_total = bra ^ ket;
+            if(wfn_traits::count(ex_total) <= 4) {
+              spin_det_t ex_alpha = wfn_traits::alpha_string(ex_total);
+              spin_det_t ex_beta = wfn_traits::beta_string(ex_total);
+
+              const double val = C[i] * C[j];
+
+              // Compute Matrix Element
+              if(std::abs(val) > 1e-16) {
+                rdm_contributions(bra_alpha, ket_alpha, ex_alpha, bra_beta,
+                                  ket_beta, ex_beta, bra_occ_alpha,
+                                  bra_occ_beta, val, ordm, trdm);
+              }
+            }  // Possible non-zero connection (Hamming distance)
+
+          }  // Non-zero ket determinant
+        }    // Loop over ket determinants
+
+      }  // Non-zero bra determinant
+    }    // Loop over bra determinants
+  }
+
+ public:
+  template <typename... Args>
+  SortedDoubleLoopHamiltonianGenerator(Args&&... args)
+      : HamiltonianGenerator<WfnType>(std::forward<Args>(args)...) {}
+};
+
+}  // namespace macis
diff --git a/include/macis/util/cas.hpp b/include/macis/mcscf/cas.hpp
similarity index 84%
rename from include/macis/util/cas.hpp
rename to include/macis/mcscf/cas.hpp
index 63c8f4a2..82a12ec6 100644
--- a/include/macis/util/cas.hpp
+++ b/include/macis/mcscf/cas.hpp
@@ -7,9 +7,9 @@
  */
 
 #pragma once
+#include <macis/mcscf/mcscf.hpp>
 #include <macis/solvers/selected_ci_diag.hpp>
 #include <macis/types.hpp>
-#include <macis/util/mcscf.hpp>
 
 namespace macis {
 
@@ -34,8 +34,7 @@ double compute_casci_rdms(
     MCSCFSettings settings, NumOrbital norb, size_t nalpha, size_t nbeta,
     double* T, double* V, double* ORDM, double* TRDM,
     std::vector<double>& C MACIS_MPI_CODE(, MPI_Comm comm)) {
-  constexpr auto nbits = HamGen::nbits;
-
+  using wfn_type = typename HamGen::full_det_t;
 #ifdef MACIS_ENABLE_MPI
   int rank;
   MPI_Comm_rank(comm, &rank);
@@ -49,11 +48,11 @@ double compute_casci_rdms(
                  rank4_span<double>(V, no, no, no, no));
 
   // Compute Lowest Energy Eigenvalue (ED)
-  auto dets = generate_hilbert_space<nbits>(norb.get(), nalpha, nbeta);
-  double E0 =
-      selected_ci_diag(dets.begin(), dets.end(), ham_gen, settings.ci_matel_tol,
-                       settings.ci_max_subspace, settings.ci_res_tol, C,
-                       MACIS_MPI_CODE(comm, ) true);
+  auto dets = generate_hilbert_space<wfn_type>(norb.get(), nalpha, nbeta);
+  double E0 = selected_ci_diag<int32_t>(
+      dets.begin(), dets.end(), ham_gen, settings.ci_matel_tol,
+      settings.ci_max_subspace, settings.ci_res_tol, C,
+      MACIS_MPI_CODE(comm, ) true);
 
   // Compute RDMs
   ham_gen.form_rdms(dets.begin(), dets.end(), dets.begin(), dets.end(),
diff --git a/include/macis/util/diis.hpp b/include/macis/mcscf/diis.hpp
similarity index 100%
rename from include/macis/util/diis.hpp
rename to include/macis/mcscf/diis.hpp
diff --git a/include/macis/util/fock_matrices.hpp b/include/macis/mcscf/fock_matrices.hpp
similarity index 100%
rename from include/macis/util/fock_matrices.hpp
rename to include/macis/mcscf/fock_matrices.hpp
diff --git a/include/macis/util/mcscf.hpp b/include/macis/mcscf/mcscf.hpp
similarity index 100%
rename from include/macis/util/mcscf.hpp
rename to include/macis/mcscf/mcscf.hpp
diff --git a/include/macis/util/mcscf_impl.hpp b/include/macis/mcscf/mcscf_impl.hpp
similarity index 98%
rename from include/macis/util/mcscf_impl.hpp
rename to include/macis/mcscf/mcscf_impl.hpp
index 48d1f850..294cae59 100644
--- a/include/macis/util/mcscf_impl.hpp
+++ b/include/macis/mcscf/mcscf_impl.hpp
@@ -7,14 +7,14 @@
  */
 
 #pragma once
-#include <macis/util/diis.hpp>
+#include <macis/mcscf/diis.hpp>
+#include <macis/mcscf/fock_matrices.hpp>
+#include <macis/mcscf/mcscf.hpp>
+#include <macis/mcscf/orbital_gradient.hpp>
+#include <macis/mcscf/orbital_hessian.hpp>
+#include <macis/mcscf/orbital_rotation_utilities.hpp>
+#include <macis/mcscf/orbital_steps.hpp>
 #include <macis/util/fcidump.hpp>
-#include <macis/util/fock_matrices.hpp>
-#include <macis/util/mcscf.hpp>
-#include <macis/util/orbital_gradient.hpp>
-#include <macis/util/orbital_hessian.hpp>
-#include <macis/util/orbital_rotation_utilities.hpp>
-#include <macis/util/orbital_steps.hpp>
 #include <macis/util/transform.hpp>
 
 // #include <ceres/ceres.h>
diff --git a/include/macis/util/orbital_energies.hpp b/include/macis/mcscf/orbital_energies.hpp
similarity index 100%
rename from include/macis/util/orbital_energies.hpp
rename to include/macis/mcscf/orbital_energies.hpp
diff --git a/include/macis/util/orbital_gradient.hpp b/include/macis/mcscf/orbital_gradient.hpp
similarity index 100%
rename from include/macis/util/orbital_gradient.hpp
rename to include/macis/mcscf/orbital_gradient.hpp
diff --git a/include/macis/util/orbital_hessian.hpp b/include/macis/mcscf/orbital_hessian.hpp
similarity index 96%
rename from include/macis/util/orbital_hessian.hpp
rename to include/macis/mcscf/orbital_hessian.hpp
index 53f59c18..952ae35a 100644
--- a/include/macis/util/orbital_hessian.hpp
+++ b/include/macis/mcscf/orbital_hessian.hpp
@@ -7,9 +7,9 @@
  */
 
 #pragma once
+#include <macis/mcscf/fock_matrices.hpp>
+#include <macis/mcscf/orbital_rotation_utilities.hpp>
 #include <macis/types.hpp>
-#include <macis/util/fock_matrices.hpp>
-#include <macis/util/orbital_rotation_utilities.hpp>
 
 namespace macis {
 
diff --git a/include/macis/util/orbital_rotation_utilities.hpp b/include/macis/mcscf/orbital_rotation_utilities.hpp
similarity index 100%
rename from include/macis/util/orbital_rotation_utilities.hpp
rename to include/macis/mcscf/orbital_rotation_utilities.hpp
diff --git a/include/macis/util/orbital_steps.hpp b/include/macis/mcscf/orbital_steps.hpp
similarity index 100%
rename from include/macis/util/orbital_steps.hpp
rename to include/macis/mcscf/orbital_steps.hpp
diff --git a/include/macis/sd_operations.hpp b/include/macis/sd_operations.hpp
index c04267f8..4f4e9781 100644
--- a/include/macis/sd_operations.hpp
+++ b/include/macis/sd_operations.hpp
@@ -10,6 +10,7 @@
 #include <algorithm>
 #include <cassert>
 #include <macis/bitset_operations.hpp>
+#include <macis/wfn/raw_bitset.hpp>
 #include <numeric>
 
 namespace macis {
@@ -27,47 +28,13 @@ namespace macis {
  *  @returns The bitstring HF state consisting of the specified number of
  *    occupied orbitals.
  */
-template <size_t N>
-std::bitset<N> canonical_hf_determinant(uint32_t nalpha, uint32_t nbeta) {
-  static_assert((N % 2) == 0, "N Must Be Even");
-  std::bitset<N> alpha = full_mask<N>(nalpha);
-  std::bitset<N> beta = full_mask<N>(nbeta) << (N / 2);
-  return alpha | beta;
-}
-
-/**
- *  @brief Generate canonical HF determinant.
- *
- *  Generates a string representation of the canonical HF determinant
- *  consisting of a specifed number of alpha and beta orbitals. This variant
- *  does not assume energetic ordering of the HF orbitals.
- *
- *  TODO: This assumes restricted orbitals
- *
- *  @tparam N Number of bits for the total bit string of the state
- *  @param[in] nalpha  Number of occupied alpha orbitals in the HF state
- *  @param[in] nbeta   Number of occupied beta orbitals in the HF state
- *  @param[in] orb_ens Orbital eigenenergies.
- *
- *  @returns The bitstring HF state consisting of the specified number of
- *    occupied orbitals populated according to the ordering of `orb_ens`.
- */
-template <size_t N>
-std::bitset<N> canonical_hf_determinant(uint32_t nalpha, uint32_t nbeta,
-                                        const std::vector<double>& orb_ens) {
-  static_assert((N % 2) == 0, "N Must Be Even");
-  // First, find the sorted indices for the orbital energies
-  std::vector<size_t> idx(orb_ens.size());
-  std::iota(idx.begin(), idx.end(), 0);
-  std::stable_sort(idx.begin(), idx.end(), [&orb_ens](size_t i1, size_t i2) {
-    return orb_ens[i1] < orb_ens[i2];
-  });
-  // Next, fill the electrons by energy
-  std::bitset<N> alpha(0), beta(0);
-  for(uint32_t i = 0; i < nalpha; i++) alpha.flip(idx[i]);
-  for(uint32_t i = 0; i < nbeta; i++) beta.flip(idx[i] + N / 2);
-  return alpha | beta;
-}
+// template <size_t N>
+// std::bitset<N> canonical_hf_determinant(uint32_t nalpha, uint32_t nbeta) {
+//   static_assert((N % 2) == 0, "N Must Be Even");
+//   std::bitset<N> alpha = full_mask<N>(nalpha);
+//   std::bitset<N> beta = full_mask<N>(nbeta) << (N / 2);
+//   return alpha | beta;
+// }
 
 /**
  *  @brief Generate the list of (un)occupied orbitals for a paricular state.
@@ -78,57 +45,77 @@ std::bitset<N> canonical_hf_determinant(uint32_t nalpha, uint32_t nbeta,
  *  @param[out] occ    List of occupied orbitals in `state`
  *  @param[out] vir    List of unoccupied orbitals in `state`
  */
+// template <size_t N>
+// void bitset_to_occ_vir(size_t norb, std::bitset<N> state,
+//                        std::vector<uint32_t>& occ, std::vector<uint32_t>&
+//                        vir) {
+//   occ = bits_to_indices(state);
+//   const auto nocc = occ.size();
+//   assert(nocc < norb);
+//
+//   const auto nvir = norb - nocc;
+//   vir.resize(nvir);
+//   state = ~state;
+//   for(int i = 0; i < nvir; ++i) {
+//     auto a = ffs(state) - 1;
+//     vir[i] = a;
+//     state.flip(a);
+//   }
+// }
+
+// template <size_t N>
+// auto single_excitation(std::bitset<N> state, unsigned p, unsigned q) {
+//   return state.flip(p).flip(q);
+// }
+
+// template <Spin Sigma, size_t N>
+// auto single_excitation_spin(std::bitset<N> state, unsigned p, unsigned q) {
+//   static_assert(N%2 == 0, "Num Bits Must Be Even");
+//   if constexpr (Sigma == Spin::Alpha)
+//     return single_excitation(state,p,q);
+//   else
+//     return single_excitation(state,p+N/2,q+N/2);
+// }
+
+// template <size_t N>
+// auto double_excitation(std::bitset<N> state, unsigned p, unsigned q, unsigned
+// r, unsigned s) {
+//   return state.flip(p).flip(q).flip(r).flip(s);
+// }
+
+// template <Spin Sigma, size_t N>
+// auto double_excitation_spin(std::bitset<N> state, unsigned p, unsigned q,
+// unsigned r, unsigned s) {
+//   static_assert(N%2 == 0, "Num Bits Must Be Even");
+//   if constexpr (Sigma == Spin::Alpha)
+//     return double_excitation(state,p,q,r,s);
+//   else
+//     return double_excitation(state,p+N/2,q+N/2,r+N/2,s+N/2);
+// }
+
+// TODO: Test this function
 template <size_t N>
-void bitset_to_occ_vir(size_t norb, std::bitset<N> state,
-                       std::vector<uint32_t>& occ, std::vector<uint32_t>& vir) {
-  occ = bits_to_indices(state);
-  const auto nocc = occ.size();
-  assert(nocc < norb);
-
-  const auto nvir = norb - nocc;
-  vir.resize(nvir);
-  state = ~state;
-  for(int i = 0; i < nvir; ++i) {
-    auto a = ffs(state) - 1;
-    vir[i] = a;
-    state.flip(a);
-  }
+uint32_t first_occupied_flipped(std::bitset<N> state, std::bitset<N> ex) {
+  return ffs(state & ex) - 1u;
 }
 
-/**
- *  @brief Generate the list of (un)occupied orbitals for a paricular state.
- *
- *  TODO: Test this function
- *
- *  @tparam N Number of bits for the total bit string of the state
- *  @param[in]  norb   Number of orbitals used to describe the state (<= `N`)
- *  @param[in]  state  The state from which to determine orbital occupations.
- *  @param[out] occ    List of occupied orbitals in `state`
- *  @param[out] vir    List of unoccupied orbitals in `state`
- *  @param[in]  as_orbs TODO:????
- */
+// TODO: Test this function
 template <size_t N>
-void bitset_to_occ_vir_as(size_t norb, std::bitset<N> state,
-                          std::vector<uint32_t>& occ,
-                          std::vector<uint32_t>& vir,
-                          const std::vector<uint32_t>& as_orbs) {
-  occ.clear();
-  for(const auto i : as_orbs)
-    if(state[i]) occ.emplace_back(i);
-  const auto nocc = occ.size();
-  assert(nocc <= norb);
-
-  const auto nvir = as_orbs.size() - nocc;
-  vir.resize(nvir);
-  auto it = vir.begin();
-  for(const auto i : as_orbs)
-    if(!state[i]) *(it++) = i;
+double single_excitation_sign(std::bitset<N> state, unsigned p, unsigned q) {
+  std::bitset<N> mask = 0ul;
+
+  if(p > q) {
+    mask = state & (full_mask<N>(p) ^ full_mask<N>(q + 1));
+  } else {
+    mask = state & (full_mask<N>(q) ^ full_mask<N>(p + 1));
+  }
+  return (mask.count() % 2) ? -1. : 1.;
 }
 
-template <size_t N>
-void append_singles(std::bitset<N> state, const std::vector<uint32_t>& occ,
-                    const std::vector<uint32_t>& vir,
-                    std::vector<std::bitset<N>>& singles) {
+template <typename WfnType, typename WfnContainer>
+void append_singles(WfnType state, const std::vector<uint32_t>& occ,
+                    const std::vector<uint32_t>& vir, WfnContainer& singles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   const size_t nocc = occ.size();
   const size_t nvir = vir.size();
 
@@ -137,15 +124,15 @@ void append_singles(std::bitset<N> state, const std::vector<uint32_t>& occ,
 
   for(size_t a = 0; a < nvir; ++a)
     for(size_t i = 0; i < nocc; ++i) {
-      auto ex = std::bitset<N>(0).flip(occ[i]).flip(vir[a]);
-      singles.emplace_back(state ^ ex);
+      singles.emplace_back(
+          wfn_traits::single_excitation_no_check(state, occ[i], vir[a]));
     }
 }
 
-template <size_t N>
-void append_doubles(std::bitset<N> state, const std::vector<uint32_t>& occ,
-                    const std::vector<uint32_t>& vir,
-                    std::vector<std::bitset<N>>& doubles) {
+template <typename WfnType, typename WfnContainer>
+void append_doubles(WfnType state, const std::vector<uint32_t>& occ,
+                    const std::vector<uint32_t>& vir, WfnContainer& doubles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   const size_t nocc = occ.size();
   const size_t nvir = vir.size();
 
@@ -158,39 +145,37 @@ void append_doubles(std::bitset<N> state, const std::vector<uint32_t>& occ,
     for(size_t i = 0; i < nocc; ++i)
       for(size_t b = a + 1; b < nvir; ++b)
         for(size_t j = i + 1; j < nocc; ++j) {
-          auto ex =
-              std::bitset<N>(0).flip(occ[i]).flip(vir[a]).flip(occ[j]).flip(
-                  vir[b]);
-          doubles.emplace_back(state ^ ex);
+          doubles.emplace_back(wfn_traits::double_excitation_no_check(
+              state, occ[i], occ[j], vir[a], vir[b]));
         }
 }
 
-template <size_t N>
-void generate_singles(size_t norb, std::bitset<N> state,
-                      std::vector<std::bitset<N>>& singles) {
+template <typename WfnType, typename WfnContainer>
+void generate_singles(size_t norb, WfnType state, WfnContainer& singles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   std::vector<uint32_t> occ_orbs, vir_orbs;
-  bitset_to_occ_vir(norb, state, occ_orbs, vir_orbs);
+  wfn_traits::state_to_occ_vir(norb, state, occ_orbs, vir_orbs);
 
   singles.clear();
   append_singles(state, occ_orbs, vir_orbs, singles);
 }
 
-template <size_t N>
-void generate_doubles(size_t norb, std::bitset<N> state,
-                      std::vector<std::bitset<N>>& doubles) {
+template <typename WfnType, typename WfnContainer>
+void generate_doubles(size_t norb, WfnType state, WfnContainer& doubles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   std::vector<uint32_t> occ_orbs, vir_orbs;
-  bitset_to_occ_vir(norb, state, occ_orbs, vir_orbs);
+  wfn_traits::state_to_occ_vir(norb, state, occ_orbs, vir_orbs);
 
   doubles.clear();
   append_doubles(state, occ_orbs, vir_orbs, doubles);
 }
 
-template <size_t N>
-void generate_singles_doubles(size_t norb, std::bitset<N> state,
-                              std::vector<std::bitset<N>>& singles,
-                              std::vector<std::bitset<N>>& doubles) {
+template <typename WfnType, typename WfnContainer>
+void generate_singles_doubles(size_t norb, WfnType state, WfnContainer& singles,
+                              WfnContainer& doubles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   std::vector<uint32_t> occ_orbs, vir_orbs;
-  bitset_to_occ_vir(norb, state, occ_orbs, vir_orbs);
+  wfn_traits::state_to_occ_vir(norb, state, occ_orbs, vir_orbs);
 
   singles.clear();
   doubles.clear();
@@ -198,197 +183,62 @@ void generate_singles_doubles(size_t norb, std::bitset<N> state,
   append_doubles(state, occ_orbs, vir_orbs, doubles);
 }
 
-// TODO: Test this function
-template <size_t N>
-void generate_singles_as(size_t norb, std::bitset<N> state,
-                         std::vector<std::bitset<N>>& singles,
-                         const std::vector<uint32_t>& as_orbs) {
-  std::vector<uint32_t> occ_orbs, vir_orbs;
-  bitset_to_occ_vir_as<N>(norb, state, occ_orbs, vir_orbs, as_orbs);
+template <typename WfnType, typename WfnContainer>
+void generate_singles_spin(size_t norb, WfnType state, WfnContainer& singles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
 
-  singles.clear();
-  append_singles(state, occ_orbs, vir_orbs, singles);
-}
+  auto state_alpha = wfn_traits::alpha_string(state);
+  auto state_beta = wfn_traits::beta_string(state);
 
-// TODO: Test this function
-template <size_t N>
-void generate_singles_doubles_as(size_t norb, std::bitset<N> state,
-                                 std::vector<std::bitset<N>>& singles,
-                                 std::vector<std::bitset<N>>& doubles,
-                                 const std::vector<uint32_t>& as_orbs) {
-  std::vector<uint32_t> occ_orbs, vir_orbs;
-  bitset_to_occ_vir_as<N>(norb, state, occ_orbs, vir_orbs, as_orbs);
-
-  singles.clear();
-  doubles.clear();
-  append_singles(state, occ_orbs, vir_orbs, singles);
-  append_doubles(state, occ_orbs, vir_orbs, doubles);
-}
-
-// TODO: Test this function
-template <size_t N>
-void generate_singles_spin_as(size_t norb, std::bitset<N> state,
-                              std::vector<std::bitset<N>>& singles,
-                              const std::vector<uint32_t> as_orbs) {
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
-
-  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
-
-  // Generate Spin-Specific singles
-  generate_singles_as(norb, state_alpha, singles_alpha, as_orbs);
-  generate_singles_as(norb, state_beta, singles_beta, as_orbs);
-
-  auto state_alpha_expand = expand_bitset<N>(state_alpha);
-  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
-
-  // Generate Singles in full space
-  singles.clear();
-
-  // Single Alpha + No Beta
-  for(auto s_alpha : singles_alpha) {
-    auto s_state = expand_bitset<N>(s_alpha);
-    s_state = s_state | state_beta_expand;
-    singles.emplace_back(s_state);
-  }
-
-  // No Alpha + Single Beta
-  for(auto s_beta : singles_beta) {
-    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
-    s_state = s_state | state_alpha_expand;
-    singles.emplace_back(s_state);
-  }
-}
-
-// TODO: Test this function
-template <size_t N>
-void generate_singles_doubles_spin_as(size_t norb, std::bitset<N> state,
-                                      std::vector<std::bitset<N>>& singles,
-                                      std::vector<std::bitset<N>>& doubles,
-                                      const std::vector<uint32_t>& as_orbs) {
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
-
-  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
-  std::vector<std::bitset<N / 2>> doubles_alpha, doubles_beta;
-
-  // Generate Spin-Specific singles / doubles
-  generate_singles_doubles_as(norb, state_alpha, singles_alpha, doubles_alpha,
-                              as_orbs);
-  generate_singles_doubles_as(norb, state_beta, singles_beta, doubles_beta,
-                              as_orbs);
-
-  auto state_alpha_expand = expand_bitset<N>(state_alpha);
-  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
-
-  // Generate Singles in full space
-  singles.clear();
-
-  // Single Alpha + No Beta
-  for(auto s_alpha : singles_alpha) {
-    auto s_state = expand_bitset<N>(s_alpha);
-    s_state = s_state | state_beta_expand;
-    singles.emplace_back(s_state);
-  }
-
-  // No Alpha + Single Beta
-  for(auto s_beta : singles_beta) {
-    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
-    s_state = s_state | state_alpha_expand;
-    singles.emplace_back(s_state);
-  }
-
-  // Generate Doubles in full space
-  doubles.clear();
-
-  // Double Alpha + No Beta
-  for(auto d_alpha : doubles_alpha) {
-    auto d_state = expand_bitset<N>(d_alpha);
-    d_state = d_state | state_beta_expand;
-    doubles.emplace_back(d_state);
-  }
-
-  // No Alpha + Double Beta
-  for(auto d_beta : doubles_beta) {
-    auto d_state = expand_bitset<N>(d_beta) << (N / 2);
-    d_state = d_state | state_alpha_expand;
-    doubles.emplace_back(d_state);
-  }
-
-  // Single Alpha + Single Beta
-  for(auto s_alpha : singles_alpha)
-    for(auto s_beta : singles_beta) {
-      auto d_state_alpha = expand_bitset<N>(s_alpha);
-      auto d_state_beta = expand_bitset<N>(s_beta) << (N / 2);
-      doubles.emplace_back(d_state_alpha | d_state_beta);
-    }
-}
-
-template <size_t N>
-void generate_singles_spin(size_t norb, std::bitset<N> state,
-                           std::vector<std::bitset<N>>& singles) {
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
-
-  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
+  using spin_wfn_type = spin_wfn_t<WfnType>;
+  std::vector<spin_wfn_type> singles_alpha, singles_beta;
 
   // Generate Spin-Specific singles / doubles
   generate_singles(norb, state_alpha, singles_alpha);
   generate_singles(norb, state_beta, singles_beta);
 
-  auto state_alpha_expand = expand_bitset<N>(state_alpha);
-  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
-
   // Generate Singles in full space
   singles.clear();
 
   // Single Alpha + No Beta
   for(auto s_alpha : singles_alpha) {
-    auto s_state = expand_bitset<N>(s_alpha);
-    s_state = s_state | state_beta_expand;
-    singles.emplace_back(s_state);
+    singles.emplace_back(wfn_traits::from_spin(s_alpha, state_beta));
   }
 
   // No Alpha + Single Beta
   for(auto s_beta : singles_beta) {
-    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
-    s_state = s_state | state_alpha_expand;
-    singles.emplace_back(s_state);
+    singles.emplace_back(wfn_traits::from_spin(state_alpha, s_beta));
   }
 }
 
-template <size_t N>
-void generate_singles_doubles_spin(size_t norb, std::bitset<N> state,
-                                   std::vector<std::bitset<N>>& singles,
-                                   std::vector<std::bitset<N>>& doubles) {
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
+template <typename WfnType, typename WfnContainer>
+void generate_singles_doubles_spin(size_t norb, WfnType state,
+                                   WfnContainer& singles,
+                                   WfnContainer& doubles) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+
+  auto state_alpha = wfn_traits::alpha_string(state);
+  auto state_beta = wfn_traits::beta_string(state);
 
-  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
-  std::vector<std::bitset<N / 2>> doubles_alpha, doubles_beta;
+  using spin_wfn_type = spin_wfn_t<WfnType>;
+  std::vector<spin_wfn_type> singles_alpha, singles_beta;
+  std::vector<spin_wfn_type> doubles_alpha, doubles_beta;
 
   // Generate Spin-Specific singles / doubles
   generate_singles_doubles(norb, state_alpha, singles_alpha, doubles_alpha);
   generate_singles_doubles(norb, state_beta, singles_beta, doubles_beta);
 
-  auto state_alpha_expand = expand_bitset<N>(state_alpha);
-  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
-
   // Generate Singles in full space
   singles.clear();
 
   // Single Alpha + No Beta
   for(auto s_alpha : singles_alpha) {
-    auto s_state = expand_bitset<N>(s_alpha);
-    s_state = s_state | state_beta_expand;
-    singles.emplace_back(s_state);
+    singles.emplace_back(wfn_traits::from_spin(s_alpha, state_beta));
   }
 
   // No Alpha + Single Beta
   for(auto s_beta : singles_beta) {
-    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
-    s_state = s_state | state_alpha_expand;
-    singles.emplace_back(s_state);
+    singles.emplace_back(wfn_traits::from_spin(state_alpha, s_beta));
   }
 
   // Generate Doubles in full space
@@ -396,58 +246,51 @@ void generate_singles_doubles_spin(size_t norb, std::bitset<N> state,
 
   // Double Alpha + No Beta
   for(auto d_alpha : doubles_alpha) {
-    auto d_state = expand_bitset<N>(d_alpha);
-    d_state = d_state | state_beta_expand;
-    doubles.emplace_back(d_state);
+    doubles.emplace_back(wfn_traits::from_spin(d_alpha, state_beta));
   }
 
   // No Alpha + Double Beta
   for(auto d_beta : doubles_beta) {
-    auto d_state = expand_bitset<N>(d_beta) << (N / 2);
-    d_state = d_state | state_alpha_expand;
-    doubles.emplace_back(d_state);
+    doubles.emplace_back(wfn_traits::from_spin(state_alpha, d_beta));
   }
 
   // Single Alpha + Single Beta
   for(auto s_alpha : singles_alpha)
     for(auto s_beta : singles_beta) {
-      auto d_state_alpha = expand_bitset<N>(s_alpha);
-      auto d_state_beta = expand_bitset<N>(s_beta) << (N / 2);
-      doubles.emplace_back(d_state_alpha | d_state_beta);
+      doubles.emplace_back(wfn_traits::from_spin(s_alpha, s_beta));
     }
 }
 
-template <size_t N>
-void generate_cisd_hilbert_space(size_t norb, std::bitset<N> state,
-                                 std::vector<std::bitset<N>>& dets) {
+template <typename WfnType, typename WfnContainer>
+void generate_cisd_hilbert_space(size_t norb, WfnType state,
+                                 WfnContainer& dets) {
   dets.clear();
   dets.emplace_back(state);
-  std::vector<std::bitset<N>> singles, doubles;
+  std::vector<WfnType> singles, doubles;
   generate_singles_doubles_spin(norb, state, singles, doubles);
   dets.insert(dets.end(), singles.begin(), singles.end());
   dets.insert(dets.end(), doubles.begin(), doubles.end());
 }
 
-template <size_t N>
-std::vector<std::bitset<N>> generate_cisd_hilbert_space(size_t norb,
-                                                        std::bitset<N> state) {
-  std::vector<std::bitset<N>> dets;
+template <typename WfnType>
+auto generate_cisd_hilbert_space(size_t norb, WfnType state) {
+  std::vector<WfnType> dets;
   generate_cisd_hilbert_space(norb, state, dets);
   return dets;
 }
 
-template <size_t N>
-std::vector<std::bitset<N>> generate_combs(uint64_t nbits, uint64_t nset) {
+template <typename WfnType>
+std::vector<WfnType> generate_combs(uint64_t nbits, uint64_t nset) {
+  using wfn_traits = wavefunction_traits<WfnType>;
   std::vector<bool> v(nbits, false);
   std::fill_n(v.begin(), nset, true);
-  std::vector<std::bitset<N>> store;
+  std::vector<WfnType> store;
 
   do {
-    std::bitset<N> temp = 0ul;
-    std::bitset<N> one = 1ul;
+    WfnType temp(0ul);
     for(uint64_t i = 0; i < nbits; ++i)
       if(v[i]) {
-        temp = temp | (one << i);
+        temp = wfn_traits::create_no_check(temp, i);
       }
     store.emplace_back(temp);
 
@@ -456,66 +299,47 @@ std::vector<std::bitset<N>> generate_combs(uint64_t nbits, uint64_t nset) {
   return store;
 }
 
-template <size_t N>
-std::vector<std::bitset<N>> generate_hilbert_space(size_t norbs, size_t nalpha,
-                                                   size_t nbeta) {
+template <typename WfnType>
+std::vector<WfnType> generate_hilbert_space(size_t norbs, size_t nalpha,
+                                            size_t nbeta) {
+  using spin_wfn_type = spin_wfn_t<WfnType>;
+  using wfn_traits = wavefunction_traits<WfnType>;
+
   // Get all alpha and beta combs
-  auto alpha_dets = generate_combs<N>(norbs, nalpha);
-  auto beta_dets = generate_combs<N>(norbs, nbeta);
+  auto alpha_dets = generate_combs<spin_wfn_type>(norbs, nalpha);
+  auto beta_dets = generate_combs<spin_wfn_type>(norbs, nbeta);
 
-  std::vector<std::bitset<N>> states;
+  std::vector<WfnType> states;
   states.reserve(alpha_dets.size() * beta_dets.size());
   for(auto alpha_det : alpha_dets)
     for(auto beta_det : beta_dets) {
-      std::bitset<N> state = alpha_det | (beta_det << (N / 2));
-      states.emplace_back(state);
+      states.emplace_back(wfn_traits::from_spin(alpha_det, beta_det));
     }
 
   return states;
 }
 
-template <size_t N>
-void generate_cis_hilbert_space(size_t norb, std::bitset<N> state,
-                                std::vector<std::bitset<N>>& dets) {
+template <typename WfnType, typename WfnContainer>
+void generate_cis_hilbert_space(size_t norb, WfnType state,
+                                WfnContainer& dets) {
   dets.clear();
   dets.emplace_back(state);
-  std::vector<std::bitset<N>> singles;
+  std::vector<WfnType> singles;
   generate_singles_spin(norb, state, singles);
   dets.insert(dets.end(), singles.begin(), singles.end());
 }
 
-template <size_t N>
-std::vector<std::bitset<N>> generate_cis_hilbert_space(size_t norb,
-                                                       std::bitset<N> state) {
-  std::vector<std::bitset<N>> dets;
+template <typename WfnType>
+std::vector<WfnType> generate_cis_hilbert_space(size_t norb, WfnType state) {
+  std::vector<WfnType> dets;
   generate_cis_hilbert_space(norb, state, dets);
   return dets;
 }
 
 // TODO: Test this function
-template <size_t N>
-uint32_t first_occupied_flipped(std::bitset<N> state, std::bitset<N> ex) {
-  return ffs(state & ex) - 1u;
-}
-
-// TODO: Test this function
-template <size_t N>
-double single_excitation_sign(std::bitset<N> state, unsigned p, unsigned q) {
-  std::bitset<N> mask = 0ul;
-
-  if(p > q) {
-    mask = state & (full_mask<N>(p) ^ full_mask<N>(q + 1));
-  } else {
-    mask = state & (full_mask<N>(q) ^ full_mask<N>(p + 1));
-  }
-  return (mask.count() % 2) ? -1. : 1.;
-}
-
-// TODO: Test this function
-template <size_t N>
-inline auto single_excitation_sign_indices(std::bitset<N> bra,
-                                           std::bitset<N> ket,
-                                           std::bitset<N> ex) {
+template <typename WfnType>
+inline auto single_excitation_sign_indices(WfnType bra, WfnType ket,
+                                           WfnType ex) {
   auto o1 = first_occupied_flipped(ket, ex);
   auto v1 = first_occupied_flipped(bra, ex);
   auto sign = single_excitation_sign(ket, v1, o1);
@@ -524,81 +348,59 @@ inline auto single_excitation_sign_indices(std::bitset<N> bra,
 }
 
 // TODO: Test this function
-template <size_t N>
-inline auto doubles_sign_indices(std::bitset<N> bra, std::bitset<N> ket,
-                                 std::bitset<N> ex) {
-  const auto o1 = first_occupied_flipped(ket, ex);
-  const auto v1 = first_occupied_flipped(bra, ex);
-  auto sign = single_excitation_sign(ket, v1, o1);
+template <typename WfnType>
+inline auto doubles_sign_indices(WfnType bra, WfnType ket, WfnType ex) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  auto [o1, v1, sign1] = single_excitation_sign_indices(bra, ket, ex);
 
-  ket.flip(o1).flip(v1);
-  ex.flip(o1).flip(v1);
+  ket = wfn_traits::single_excitation_no_check(ket, o1, v1);
+  ex = wfn_traits::single_excitation_no_check(ex, o1, v1);
 
-  const auto o2 = first_occupied_flipped(ket, ex);
-  const auto v2 = first_occupied_flipped(bra, ex);
-  sign *= single_excitation_sign(ket, v2, o2);
+  auto [o2, v2, sign2] = single_excitation_sign_indices(bra, ket, ex);
+  auto sign = sign1 * sign2;
 
   return std::make_tuple(o1, v1, o2, v2, sign);
 }
 
 // TODO: Test this function
-template <size_t N>
-inline auto doubles_sign(std::bitset<N> bra, std::bitset<N> ket,
-                         std::bitset<N> ex) {
+template <typename WfnType>
+inline auto doubles_sign(WfnType bra, WfnType ket, WfnType ex) {
   auto [p, q, r, s, sign] = doubles_sign_indices(bra, ket, ex);
   return sign;
 }
 
-// TODO: Test this function
-template <size_t N>
-void generate_residues(std::bitset<N> state, std::vector<std::bitset<N>>& res) {
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
-
-  auto occ_alpha = bits_to_indices(state_alpha);
-  const int nalpha = occ_alpha.size();
-
-  auto occ_beta = bits_to_indices(state_beta);
-  const int nbeta = occ_beta.size();
-
-  std::bitset<N> state_alpha_full = expand_bitset<N>(state_alpha);
-  std::bitset<N> state_beta_full = expand_bitset<N>(state_beta);
-  state_beta_full = state_beta_full << (N / 2);
-
-  std::bitset<N / 2> one = 1ul;
-
-  // Double alpha
-  for(auto i = 0; i < nalpha; ++i)
-    for(auto j = i + 1; j < nalpha; ++j) {
-      auto mask = (one << occ_alpha[i]) | (one << occ_alpha[j]);
-      std::bitset<N> _r = expand_bitset<N>(state_alpha & ~mask);
-      res.emplace_back(_r | state_beta_full);
-    }
-
-  // Double beta
-  for(auto i = 0; i < nbeta; ++i)
-    for(auto j = i + 1; j < nbeta; ++j) {
-      auto mask = (one << occ_beta[i]) | (one << occ_beta[j]);
-      std::bitset<N> _r = expand_bitset<N>(state_beta & ~mask) << (N / 2);
-      res.emplace_back(_r | state_alpha_full);
+template <typename WfnIterator>
+auto get_unique_alpha(WfnIterator begin, WfnIterator end) {
+  using wfn_type = typename std::iterator_traits<WfnIterator>::value_type;
+  using wfn_traits = wavefunction_traits<wfn_type>;
+  using spin_wfn_type = typename wfn_traits::spin_wfn_type;
+
+  std::vector<std::pair<spin_wfn_type, size_t>> unique_alpha;
+  unique_alpha.push_back({wfn_traits::alpha_string(*begin), 1});
+  for(auto it = begin + 1; it != end; ++it) {
+    auto& [cur_alpha, cur_count] = unique_alpha.back();
+    auto alpha_i = wfn_traits::alpha_string(*it);
+    if(alpha_i == cur_alpha) {
+      cur_count++;
+    } else {
+      unique_alpha.push_back({alpha_i, 1});
     }
+  }
 
-  // Mixed
-  for(auto i = 0; i < nalpha; ++i)
-    for(auto j = 0; j < nbeta; ++j) {
-      std::bitset<N> mask = expand_bitset<N>(one << occ_alpha[i]);
-      mask = mask | (expand_bitset<N>(one << occ_beta[j]) << (N / 2));
-      res.emplace_back(state & ~mask);
-    }
+  return unique_alpha;
 }
 
-template <size_t N>
-std::string to_canonical_string(std::bitset<N> state) {
-  static_assert((N % 2) == 0, "N Odd");
-  auto state_alpha = bitset_lo_word(state);
-  auto state_beta = bitset_hi_word(state);
+template <typename WfnType>
+std::string to_canonical_string(WfnType state) {
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using spin_wfn_type = spin_wfn_t<WfnType>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
+
+  auto state_alpha = wfn_traits::alpha_string(state);
+  auto state_beta = wfn_traits::beta_string(state);
   std::string str;
-  for(size_t i = 0; i < N / 2; ++i) {
+
+  for(size_t i = 0; i < spin_wfn_traits::size(); ++i) {
     if(state_alpha[i] and state_beta[i])
       str.push_back('2');
     else if(state_alpha[i])
@@ -611,20 +413,23 @@ std::string to_canonical_string(std::bitset<N> state) {
   return str;
 }
 
-template <size_t N>
-std::bitset<N> from_canonical_string(std::string str) {
-  std::bitset<N> state_alpha(0), state_beta(0);
-  for(auto i = 0ul; i < str.length(); ++i) {
+template <typename WfnType>
+WfnType from_canonical_string(std::string str) {
+  using spin_wfn_type = spin_wfn_t<WfnType>;
+  using wfn_traits = wavefunction_traits<WfnType>;
+  using spin_wfn_traits = wavefunction_traits<spin_wfn_type>;
+  spin_wfn_type state_alpha(0), state_beta(0);
+  for(auto i = 0ul; i < std::min(str.length(), spin_wfn_traits::size()); ++i) {
     if(str[i] == '2') {
-      state_alpha.set(i);
-      state_beta.set(i);
+      state_alpha = spin_wfn_traits::create_no_check(state_alpha, i);
+      state_beta = spin_wfn_traits::create_no_check(state_beta, i);
     } else if(str[i] == 'u') {
-      state_alpha.set(i);
+      state_alpha = spin_wfn_traits::create_no_check(state_alpha, i);
     } else if(str[i] == 'd') {
-      state_beta.set(i);
+      state_beta = spin_wfn_traits::create_no_check(state_beta, i);
     }
   }
-  auto state = state_alpha | (state_beta << (N / 2));
+  auto state = wfn_traits::from_spin(state_alpha, state_beta);
   return state;
 }
 
diff --git a/include/macis/solvers/davidson.hpp b/include/macis/solvers/davidson.hpp
index 63756772..bde3ef17 100644
--- a/include/macis/solvers/davidson.hpp
+++ b/include/macis/solvers/davidson.hpp
@@ -11,6 +11,7 @@
 #include <spdlog/sinks/stdout_color_sinks.h>
 #include <spdlog/spdlog.h>
 
+#include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <lobpcgxx/lobpcg.hpp>
@@ -96,6 +97,7 @@ void p_diagonal_guess(size_t N_local, const SpMatType& A, double* X) {
   // Determine min index
   auto D_min = std::min_element(D.begin(), D.end());
   auto min_idx = std::distance(D.begin(), D_min);
+  // printf("[rank %d] DMIN %lu %.6e\n", world_rank, min_idx, *D_min);
 
   // Zero out guess
   for(size_t i = 0; i < N_local; ++i) X[i] = 0.;
@@ -252,6 +254,8 @@ inline void p_gram_schmidt(int64_t N_local, int64_t K, const double* V_old,
   double dot = blas::dot(N_local, V_new, 1, V_new, 1);
   dot = allreduce(dot, MPI_SUM, comm);
   double nrm = std::sqrt(dot);
+  // printf("[rank %d] GS DOT %.6e NRM %.6e\n", comm_rank(comm),
+  //   dot, nrm);
   blas::scal(N_local, 1. / nrm, V_new, 1);
 }
 
@@ -267,11 +271,16 @@ inline void p_rayleigh_ritz(int64_t N_local, int64_t K, const double* X,
 
   // Reduce result
   if(LDC != K) throw std::runtime_error("DIE DIE DIE RR");
-  allreduce(C, K * K, MPI_SUM, comm);
+  // allreduce(C, K * K, MPI_SUM, comm);
+  std::allocator<double> alloc;
+  double* tmp_c = world_rank ? nullptr : alloc.allocate(K * K);
+  reduce(C, tmp_c, K * K, MPI_SUM, 0, comm);
 
   // Do local diagonalization on rank-0
   if(!world_rank) {
+    memcpy(C, tmp_c, K * K * sizeof(double));
     lapack::syev(lapack::Job::Vec, lapack::Uplo::Lower, K, C, LDC, W);
+    alloc.deallocate(tmp_c, K * K);
   }
 
   // Broadcast results
@@ -378,8 +387,18 @@ auto p_davidson(int64_t N_local, int64_t max_m, const Functor& op,
 
     // Compute new vector
     // (D - LAM(0)*I) * W = -R ==> W = -(D - LAM(0)*I)**-1 * R
+    double E1_denom = 0, E1_num = 0;
     for(auto j = 0; j < N_local; ++j) {
       R_local[j] = -R_local[j] / (D_local[j] - LAM[0]);
+      E1_num += X_local[j] * R_local[j];
+      E1_denom += X_local[j] * X_local[j] / (D_local[j] - LAM[0]);
+    }
+    E1_denom = allreduce(E1_denom, MPI_SUM, comm);
+    E1_num = allreduce(E1_num, MPI_SUM, comm);
+    const double E1 = E1_num / E1_denom;
+
+    for(auto j = 0; j < N_local; ++j) {
+      R_local[j] += E1 * X_local[j] / (D_local[j] - LAM[0]);
     }
 
     // Project new vector out form old vectors
diff --git a/include/macis/solvers/selected_ci_diag.hpp b/include/macis/solvers/selected_ci_diag.hpp
index 0a1e293f..9edffb41 100644
--- a/include/macis/solvers/selected_ci_diag.hpp
+++ b/include/macis/solvers/selected_ci_diag.hpp
@@ -13,6 +13,7 @@
 #include <macis/solvers/davidson.hpp>
 #include <macis/types.hpp>
 #include <macis/util/mpi.hpp>
+#include <sparsexx/io/write_dist_mm.hpp>
 #include <sparsexx/matrix_types/dense_conversions.hpp>
 #include <sparsexx/util/submatrix.hpp>
 
@@ -121,10 +122,9 @@ double serial_selected_ci_diag(const SpMatType& H, size_t davidson_max_m,
   return E;
 }
 
-template <size_t N, typename index_t = int32_t>
-double selected_ci_diag(wavefunction_iterator_t<N> dets_begin,
-                        wavefunction_iterator_t<N> dets_end,
-                        HamiltonianGenerator<N>& ham_gen, double h_el_tol,
+template <typename index_t, typename WfnType, typename WfnIterator>
+double selected_ci_diag(WfnIterator dets_begin, WfnIterator dets_end,
+                        HamiltonianGenerator<WfnType>& ham_gen, double h_el_tol,
                         size_t davidson_max_m, double davidson_res_tol,
                         std::vector<double>& C_local,
                         MACIS_MPI_CODE(MPI_Comm comm, )
@@ -146,9 +146,25 @@ double selected_ci_diag(wavefunction_iterator_t<N> dets_begin,
   MACIS_MPI_CODE(MPI_Barrier(comm);)
   auto H_st = clock_type::now();
 
+  auto world_size = comm_size(comm);
+  auto world_rank = comm_rank(comm);
+  //{
+  // std::ofstream wfn_file("wfn_" +
+  // std::to_string(std::distance(dets_begin,dets_end)) + "_" +
+  // std::to_string(world_rank) + "." + std::to_string(world_size) + ".txt");
+  // for(auto it = dets_begin; it != dets_end; ++it) {
+  //  wfn_file << *it << "\n";
+  //}
+  // wfn_file << std::flush;
+  //}
+
 #ifdef MACIS_ENABLE_MPI
   auto H = make_dist_csr_hamiltonian<index_t>(comm, dets_begin, dets_end,
                                               ham_gen, h_el_tol);
+  // sparsexx::write_dist_mm("ham_" + std::to_string(H.n()) + "." +
+  // std::to_string(world_size) + ".mtx", H, 1);
+  // MACIS_MPI_CODE(MPI_Barrier(comm);)
+  // if(H.n() >= 10000000) throw "DIE DIE DIE";
 #else
   auto H =
       make_csr_hamiltonian<index_t>(dets_begin, dets_end, ham_gen, h_el_tol);
@@ -171,7 +187,6 @@ double selected_ci_diag(wavefunction_iterator_t<N> dets_begin,
                duration_type(H_en - H_st).count());
 
 #ifdef MACIS_ENABLE_MPI
-  auto world_size = comm_size(comm);
   if(world_size > 1) {
     double local_hdur = duration_type(H_en - H_st).count();
     double max_hdur = allreduce(local_hdur, MPI_MAX, comm);
@@ -185,7 +200,7 @@ double selected_ci_diag(wavefunction_iterator_t<N> dets_begin,
 #endif
   logger->info("  {} = {:.2e} GiB", "HMEM_LOC",
                H.mem_footprint() / 1073741824.);
-  logger->info("  {} = {:.2f}%", "H_SPARSE",
+  logger->info("  {} = {:.2e}%", "H_SPARSE",
                total_nnz / double(H.n() * H.n()) * 100);
 #ifdef MACIS_ENABLE_MPI
   if(world_size > 1) {
diff --git a/include/macis/types.hpp b/include/macis/types.hpp
index 97331b52..00f7c061 100644
--- a/include/macis/types.hpp
+++ b/include/macis/types.hpp
@@ -16,8 +16,8 @@
 
 namespace macis {
 
-namespace KokkosEx =
-    MDSPAN_IMPL_STANDARD_NAMESPACE::MDSPAN_IMPL_PROPOSED_NAMESPACE;
+namespace KokkosEx = Kokkos;
+//    MDSPAN_IMPL_STANDARD_NAMESPACE::MDSPAN_IMPL_PROPOSED_NAMESPACE;
 
 template <typename T, size_t rank>
 using col_major_span =
@@ -50,7 +50,7 @@ using wavefunction_iterator_t = typename std::vector<std::bitset<N> >::iterator;
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wpedantic"
-using uint128_t = unsigned __int128;
+using uint128_t = __uint128_t;
 #pragma GCC diagnostic pop
 
 template <typename T, typename ParameterType>
diff --git a/include/macis/util/mpi.hpp b/include/macis/util/mpi.hpp
index a5ae545a..6375dbbc 100644
--- a/include/macis/util/mpi.hpp
+++ b/include/macis/util/mpi.hpp
@@ -125,6 +125,7 @@ REGISTER_MPI_TYPE(int, MPI_INT);
 REGISTER_MPI_TYPE(double, MPI_DOUBLE);
 REGISTER_MPI_TYPE(float, MPI_FLOAT);
 REGISTER_MPI_TYPE(size_t, MPI_UINT64_T);
+REGISTER_MPI_TYPE(int64_t, MPI_INT64_T);
 
 #undef REGISTER_MPI_TYPE
 
@@ -145,6 +146,18 @@ mpi_datatype make_contiguous_mpi_datatype(int n) {
   return make_managed_mpi_datatype(contig_dtype);
 }
 
+template <typename T>
+void reduce(const T* send, T* recv, size_t count, MPI_Op op, int root,
+            MPI_Comm comm) {
+  auto dtype = mpi_traits<T>::datatype();
+
+  size_t intmax = std::numeric_limits<int>::max();
+  size_t nchunk = count / intmax;
+  if(nchunk) throw std::runtime_error("Msg over INT_MAX not yet tested");
+
+  MPI_Reduce(send, recv, count, dtype, op, root, comm);
+}
+
 /**
  * @brief Type-safe wrapper for MPI_Allreduce
  *
@@ -228,5 +241,43 @@ struct mpi_traits<std::bitset<N>> {
   }
 };
 
+template <typename T>
+class global_atomic {
+  MPI_Win window_;
+  T* buffer_;
+
+ public:
+  global_atomic() = delete;
+
+  global_atomic(MPI_Comm comm, T init = 0) {
+    MPI_Win_allocate(sizeof(T), sizeof(T), MPI_INFO_NULL, comm, &buffer_,
+                     &window_);
+    if(window_ == MPI_WIN_NULL) {
+      throw std::runtime_error("Window creation failed");
+    }
+    *buffer_ = init;
+    MPI_Win_lock_all(MPI_MODE_NOCHECK, window_);
+  }
+
+  ~global_atomic() noexcept {
+    MPI_Win_unlock_all(window_);
+    MPI_Win_free(&window_);
+  }
+
+  global_atomic(const global_atomic&) = delete;
+  global_atomic(global_atomic&&) noexcept = delete;
+
+  T fetch_and_op(T val, MPI_Op op) {
+    T next_val;
+    MPI_Fetch_and_op(&val, &next_val, mpi_traits<T>::datatype(), 0, 0, op,
+                     window_);
+    MPI_Win_flush(0, window_);
+    return next_val;
+  }
+
+  T fetch_and_add(T val) { return fetch_and_op(val, MPI_SUM); }
+  T fetch_and_min(T val) { return fetch_and_op(val, MPI_MIN); }
+};
+
 }  // namespace macis
 #endif
diff --git a/include/macis/util/rdms.hpp b/include/macis/util/rdms.hpp
index 72b55bb3..875cd968 100644
--- a/include/macis/util/rdms.hpp
+++ b/include/macis/util/rdms.hpp
@@ -103,8 +103,9 @@ inline void rdm_contributions(wfn_t<N> bra_alpha, wfn_t<N> ket_alpha,
                               const IndexType& bra_occ_alpha,
                               const IndexType& bra_occ_beta, T val,
                               matrix_span<T> ordm, rank4_span<T> trdm) {
-  const uint32_t ex_alpha_count = ex_alpha.count();
-  const uint32_t ex_beta_count = ex_beta.count();
+  using wfn_traits = wavefunction_traits<wfn_t<N>>;
+  const uint32_t ex_alpha_count = wfn_traits::count(ex_alpha);
+  const uint32_t ex_beta_count = wfn_traits::count(ex_beta);
 
   if((ex_alpha_count + ex_beta_count) > 4) return;
 
diff --git a/include/macis/util/trexio.hpp b/include/macis/util/trexio.hpp
new file mode 100644
index 00000000..04a86621
--- /dev/null
+++ b/include/macis/util/trexio.hpp
@@ -0,0 +1,53 @@
+#pragma once
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#include <trexio.h>
+
+#include <stdexcept>
+
+namespace macis {
+
+class trexio_exception : public std::exception {
+  std::string msg_;
+  const char* what() const noexcept override;
+
+ public:
+  trexio_exception(std::string func_name, trexio_exit_code rc);
+};
+
+class TREXIOFile {
+  trexio_t* file_handle_ = nullptr;
+
+ public:
+  TREXIOFile() noexcept = default;
+
+  TREXIOFile(std::string name, char mode, int backend);
+  ~TREXIOFile() noexcept;
+
+  TREXIOFile(const TREXIOFile&) = delete;
+  TREXIOFile(TREXIOFile&& other) noexcept;
+
+  int64_t read_mo_num() const;
+  int64_t read_mo_2e_int_eri_size() const;
+  double read_nucleus_repulsion() const;
+  void read_mo_1e_int_core_hamiltonian(double* h) const;
+  void read_mo_2e_int_eri(double* V) const;
+  int64_t read_determinant_num() const;
+  int32_t get_determinant_int64_num() const;
+  void read_determinant_list(int64_t ndet, int64_t* dets,
+                             int64_t ioff = 0) const;
+
+  void write_mo_num(int64_t nmo);
+  void write_nucleus_repulsion(double E);
+  void write_mo_1e_int_core_hamiltonian(const double* h);
+  void write_mo_2e_int_eri(const double* V);
+  void write_determinant_list(int64_t ndet, const int64_t* dets,
+                              int64_t ioff = 0);
+};
+
+}  // namespace macis
diff --git a/include/macis/util/unused_untested.hpp b/include/macis/util/unused_untested.hpp
new file mode 100644
index 00000000..af7b6639
--- /dev/null
+++ b/include/macis/util/unused_untested.hpp
@@ -0,0 +1,245 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#if 0
+/**
+ *  @brief Generate canonical HF determinant.
+ *
+ *  Generates a string representation of the canonical HF determinant
+ *  consisting of a specifed number of alpha and beta orbitals. This variant
+ *  does not assume energetic ordering of the HF orbitals.
+ *
+ *  TODO: This assumes restricted orbitals
+ *
+ *  @tparam N Number of bits for the total bit string of the state
+ *  @param[in] nalpha  Number of occupied alpha orbitals in the HF state
+ *  @param[in] nbeta   Number of occupied beta orbitals in the HF state
+ *  @param[in] orb_ens Orbital eigenenergies.
+ *
+ *  @returns The bitstring HF state consisting of the specified number of
+ *    occupied orbitals populated according to the ordering of `orb_ens`.
+ */
+template <size_t N>
+std::bitset<N> canonical_hf_determinant(uint32_t nalpha, uint32_t nbeta,
+                                        const std::vector<double>& orb_ens) {
+  static_assert((N % 2) == 0, "N Must Be Even");
+  // First, find the sorted indices for the orbital energies
+  std::vector<size_t> idx(orb_ens.size());
+  std::iota(idx.begin(), idx.end(), 0);
+  std::stable_sort(idx.begin(), idx.end(), [&orb_ens](size_t i1, size_t i2) {
+    return orb_ens[i1] < orb_ens[i2];
+  });
+  // Next, fill the electrons by energy
+  std::bitset<N> alpha(0), beta(0);
+  for(uint32_t i = 0; i < nalpha; i++) alpha.flip(idx[i]);
+  for(uint32_t i = 0; i < nbeta; i++) beta.flip(idx[i] + N / 2);
+  return alpha | beta;
+}
+
+/**
+ *  @brief Generate the list of (un)occupied orbitals for a paricular state.
+ *
+ *  TODO: Test this function
+ *
+ *  @tparam N Number of bits for the total bit string of the state
+ *  @param[in]  norb   Number of orbitals used to describe the state (<= `N`)
+ *  @param[in]  state  The state from which to determine orbital occupations.
+ *  @param[out] occ    List of occupied orbitals in `state`
+ *  @param[out] vir    List of unoccupied orbitals in `state`
+ *  @param[in]  as_orbs TODO:????
+ */
+template <size_t N>
+void bitset_to_occ_vir_as(size_t norb, std::bitset<N> state,
+                          std::vector<uint32_t>& occ,
+                          std::vector<uint32_t>& vir,
+                          const std::vector<uint32_t>& as_orbs) {
+  occ.clear();
+  for(const auto i : as_orbs)
+    if(state[i]) occ.emplace_back(i);
+  const auto nocc = occ.size();
+  assert(nocc <= norb);
+
+  const auto nvir = as_orbs.size() - nocc;
+  vir.resize(nvir);
+  auto it = vir.begin();
+  for(const auto i : as_orbs)
+    if(!state[i]) *(it++) = i;
+}
+
+
+
+// TODO: Test this function
+template <size_t N>
+void generate_singles_as(size_t norb, std::bitset<N> state,
+                         std::vector<std::bitset<N>>& singles,
+                         const std::vector<uint32_t>& as_orbs) {
+  std::vector<uint32_t> occ_orbs, vir_orbs;
+  bitset_to_occ_vir_as<N>(norb, state, occ_orbs, vir_orbs, as_orbs);
+
+  singles.clear();
+  append_singles(state, occ_orbs, vir_orbs, singles);
+}
+
+// TODO: Test this function
+template <size_t N>
+void generate_singles_doubles_as(size_t norb, std::bitset<N> state,
+                                 std::vector<std::bitset<N>>& singles,
+                                 std::vector<std::bitset<N>>& doubles,
+                                 const std::vector<uint32_t>& as_orbs) {
+  std::vector<uint32_t> occ_orbs, vir_orbs;
+  bitset_to_occ_vir_as<N>(norb, state, occ_orbs, vir_orbs, as_orbs);
+
+  singles.clear();
+  doubles.clear();
+  append_singles(state, occ_orbs, vir_orbs, singles);
+  append_doubles(state, occ_orbs, vir_orbs, doubles);
+}
+
+// TODO: Test this function
+template <size_t N>
+void generate_singles_spin_as(size_t norb, std::bitset<N> state,
+                              std::vector<std::bitset<N>>& singles,
+                              const std::vector<uint32_t> as_orbs) {
+  auto state_alpha = bitset_lo_word(state);
+  auto state_beta = bitset_hi_word(state);
+
+  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
+
+  // Generate Spin-Specific singles
+  generate_singles_as(norb, state_alpha, singles_alpha, as_orbs);
+  generate_singles_as(norb, state_beta, singles_beta, as_orbs);
+
+  auto state_alpha_expand = expand_bitset<N>(state_alpha);
+  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
+
+  // Generate Singles in full space
+  singles.clear();
+
+  // Single Alpha + No Beta
+  for(auto s_alpha : singles_alpha) {
+    auto s_state = expand_bitset<N>(s_alpha);
+    s_state = s_state | state_beta_expand;
+    singles.emplace_back(s_state);
+  }
+
+  // No Alpha + Single Beta
+  for(auto s_beta : singles_beta) {
+    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
+    s_state = s_state | state_alpha_expand;
+    singles.emplace_back(s_state);
+  }
+}
+
+// TODO: Test this function
+template <size_t N>
+void generate_singles_doubles_spin_as(size_t norb, std::bitset<N> state,
+                                      std::vector<std::bitset<N>>& singles,
+                                      std::vector<std::bitset<N>>& doubles,
+                                      const std::vector<uint32_t>& as_orbs) {
+  auto state_alpha = bitset_lo_word(state);
+  auto state_beta = bitset_hi_word(state);
+
+  std::vector<std::bitset<N / 2>> singles_alpha, singles_beta;
+  std::vector<std::bitset<N / 2>> doubles_alpha, doubles_beta;
+
+  // Generate Spin-Specific singles / doubles
+  generate_singles_doubles_as(norb, state_alpha, singles_alpha, doubles_alpha,
+                              as_orbs);
+  generate_singles_doubles_as(norb, state_beta, singles_beta, doubles_beta,
+                              as_orbs);
+
+  auto state_alpha_expand = expand_bitset<N>(state_alpha);
+  auto state_beta_expand = expand_bitset<N>(state_beta) << (N / 2);
+
+  // Generate Singles in full space
+  singles.clear();
+
+  // Single Alpha + No Beta
+  for(auto s_alpha : singles_alpha) {
+    auto s_state = expand_bitset<N>(s_alpha);
+    s_state = s_state | state_beta_expand;
+    singles.emplace_back(s_state);
+  }
+
+  // No Alpha + Single Beta
+  for(auto s_beta : singles_beta) {
+    auto s_state = expand_bitset<N>(s_beta) << (N / 2);
+    s_state = s_state | state_alpha_expand;
+    singles.emplace_back(s_state);
+  }
+
+  // Generate Doubles in full space
+  doubles.clear();
+
+  // Double Alpha + No Beta
+  for(auto d_alpha : doubles_alpha) {
+    auto d_state = expand_bitset<N>(d_alpha);
+    d_state = d_state | state_beta_expand;
+    doubles.emplace_back(d_state);
+  }
+
+  // No Alpha + Double Beta
+  for(auto d_beta : doubles_beta) {
+    auto d_state = expand_bitset<N>(d_beta) << (N / 2);
+    d_state = d_state | state_alpha_expand;
+    doubles.emplace_back(d_state);
+  }
+
+  // Single Alpha + Single Beta
+  for(auto s_alpha : singles_alpha)
+    for(auto s_beta : singles_beta) {
+      auto d_state_alpha = expand_bitset<N>(s_alpha);
+      auto d_state_beta = expand_bitset<N>(s_beta) << (N / 2);
+      doubles.emplace_back(d_state_alpha | d_state_beta);
+    }
+}
+#if 0
+// TODO: Test this function
+template <size_t N>
+void generate_residues(std::bitset<N> state, std::vector<std::bitset<N>>& res) {
+  auto state_alpha = bitset_lo_word(state);
+  auto state_beta = bitset_hi_word(state);
+
+  auto occ_alpha = bits_to_indices(state_alpha);
+  const int nalpha = occ_alpha.size();
+
+  auto occ_beta = bits_to_indices(state_beta);
+  const int nbeta = occ_beta.size();
+
+  std::bitset<N> state_alpha_full = expand_bitset<N>(state_alpha);
+  std::bitset<N> state_beta_full = expand_bitset<N>(state_beta);
+  state_beta_full = state_beta_full << (N / 2);
+
+  std::bitset<N / 2> one = 1ul;
+
+  // Double alpha
+  for(auto i = 0; i < nalpha; ++i)
+    for(auto j = i + 1; j < nalpha; ++j) {
+      auto mask = (one << occ_alpha[i]) | (one << occ_alpha[j]);
+      std::bitset<N> _r = expand_bitset<N>(state_alpha & ~mask);
+      res.emplace_back(_r | state_beta_full);
+    }
+
+  // Double beta
+  for(auto i = 0; i < nbeta; ++i)
+    for(auto j = i + 1; j < nbeta; ++j) {
+      auto mask = (one << occ_beta[i]) | (one << occ_beta[j]);
+      std::bitset<N> _r = expand_bitset<N>(state_beta & ~mask) << (N / 2);
+      res.emplace_back(_r | state_alpha_full);
+    }
+
+  // Mixed
+  for(auto i = 0; i < nalpha; ++i)
+    for(auto j = 0; j < nbeta; ++j) {
+      std::bitset<N> mask = expand_bitset<N>(one << occ_alpha[i]);
+      mask = mask | (expand_bitset<N>(one << occ_beta[j]) << (N / 2));
+      res.emplace_back(state & ~mask);
+    }
+}
+#endif
+#endif
diff --git a/include/macis/wavefunction_io.hpp b/include/macis/wavefunction_io.hpp
index bc38f7ee..1483d864 100644
--- a/include/macis/wavefunction_io.hpp
+++ b/include/macis/wavefunction_io.hpp
@@ -67,7 +67,7 @@ void read_wavefunction(std::string fname, std::vector<std::bitset<N>>& states,
     std::stringstream ss{line};
     std::string c, d;
     ss >> c >> d;
-    states.emplace_back(from_canonical_string<N>(d));
+    states.emplace_back(from_canonical_string<wfn_t<N>>(d));
     coeffs.emplace_back(std::stod(c));
   }
 }
diff --git a/include/macis/wfn/raw_bitset.hpp b/include/macis/wfn/raw_bitset.hpp
new file mode 100644
index 00000000..3bd5f4e0
--- /dev/null
+++ b/include/macis/wfn/raw_bitset.hpp
@@ -0,0 +1,144 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#pragma once
+
+#include <macis/bitset_operations.hpp>
+#include <macis/types.hpp>
+
+namespace macis {
+
+enum class Spin { Alpha, Beta };
+
+template <typename WfnType>
+struct wavefunction_traits;
+
+template <typename WfnType>
+using spin_wfn_t = typename wavefunction_traits<WfnType>::spin_wfn_type;
+
+template <size_t N>
+struct wavefunction_traits<std::bitset<N>> {
+  using wfn_type = std::bitset<N>;
+  using spin_wfn_type = std::bitset<N / 2>;
+  using orbidx_type = uint32_t;
+  using orbidx_container = std::vector<orbidx_type>;
+
+  inline static constexpr size_t bit_size = N;
+
+  static constexpr auto size() { return bit_size; }
+
+  static inline auto count(wfn_type state) { return state.count(); }
+
+  static inline spin_wfn_type alpha_string(wfn_type state) {
+    return bitset_lo_word(state);
+  }
+
+  static inline spin_wfn_type beta_string(wfn_type state) {
+    return bitset_hi_word(state);
+  }
+
+  using wfn_comparator = bitset_less_comparator<N>;
+
+  struct spin_comparator {
+    using spin_wfn_comparator = bitset_less_comparator<N / 2>;
+    bool operator()(wfn_type x, wfn_type y) const {
+      auto s_comp = spin_wfn_comparator{};
+      const auto x_a = alpha_string(x);
+      const auto y_a = alpha_string(y);
+      if(x_a == y_a) {
+        const auto x_b = beta_string(x);
+        const auto y_b = beta_string(y);
+        return s_comp(x_b, y_b);
+      } else
+        return s_comp(x_a, y_a);
+    }
+  };
+
+  template <Spin Sigma = Spin::Alpha>
+  static inline wfn_type from_spin(spin_wfn_type alpha, spin_wfn_type beta) {
+    if constexpr(Sigma == Spin::Alpha) {
+      auto alpha_expand = expand_bitset<N>(alpha);
+      auto beta_expand = expand_bitset<N>(beta) << N / 2;
+      return alpha_expand | beta_expand;
+    } else {
+      auto alpha_expand = expand_bitset<N>(alpha) << N / 2;
+      auto beta_expand = expand_bitset<N>(beta);
+      return alpha_expand | beta_expand;
+    }
+  }
+
+  static inline wfn_type canonical_hf_determinant(uint32_t nalpha,
+                                                  uint32_t nbeta) {
+    spin_wfn_type alpha = full_mask<N / 2>(nalpha);
+    spin_wfn_type beta = full_mask<N / 2>(nbeta);
+    return from_spin(alpha, beta);
+  }
+
+  template <Spin Sigma, typename... Inds>
+  static inline wfn_type& flip_bits(wfn_type& state, Inds&&...);
+
+  template <Spin Sigma>
+  static inline wfn_type& flip_bits(wfn_type& state) {
+    return state;
+  }
+
+  template <Spin Sigma, typename... Inds>
+  static inline wfn_type& flip_bits(wfn_type& state, unsigned p,
+                                    Inds&&... inds) {
+    return flip_bits<Sigma>(state.flip(p + (Sigma == Spin::Alpha ? 0 : N / 2)),
+                            std::forward<Inds>(inds)...);
+  }
+
+  template <Spin Sigma = Spin::Alpha>
+  static inline wfn_type create_no_check(wfn_type state, unsigned p) {
+    flip_bits<Sigma>(state, p);
+    return state;
+  }
+
+  template <Spin Sigma = Spin::Alpha>
+  static inline wfn_type single_excitation_no_check(wfn_type state, unsigned p,
+                                                    unsigned q) {
+    flip_bits<Sigma>(state, p, q);
+    return state;
+  }
+
+  template <Spin Sigma = Spin::Alpha>
+  static inline wfn_type double_excitation_no_check(wfn_type state, unsigned p,
+                                                    unsigned q, unsigned r,
+                                                    unsigned s) {
+    flip_bits<Sigma>(state, p, q, r, s);
+    return state;
+  }
+
+  static inline void state_to_occ(wfn_type state, orbidx_container& occ) {
+    occ = bits_to_indices(state);
+  }
+
+  static inline orbidx_container state_to_occ(wfn_type state) {
+    return bits_to_indices(state);
+  }
+
+  static inline void state_to_occ_vir(size_t norb, wfn_type state,
+                                      orbidx_container& occ,
+                                      orbidx_container& vir) {
+    state_to_occ(state, occ);
+    const auto nocc = occ.size();
+    assert(nocc < norb);
+
+    const auto nvir = norb - nocc;
+    vir.resize(nvir);
+    state = ~state;
+    for(int i = 0; i < nvir; ++i) {
+      auto a = ffs(state) - 1;
+      vir[i] = a;
+      state.flip(a);
+    }
+  }
+};
+
+}  // namespace macis
diff --git a/src/macis/CMakeLists.txt b/src/macis/CMakeLists.txt
index 26155ac6..25e9abfd 100644
--- a/src/macis/CMakeLists.txt
+++ b/src/macis/CMakeLists.txt
@@ -6,17 +6,31 @@
 
 add_library( macis
   fcidump.cxx
-  fock_matrices.cxx
   transform.cxx
-  orbital_gradient.cxx
-  casscf.cxx
   moller_plesset.cxx
-  orbital_energies.cxx
-  orbital_rotation_utilities.cxx
-  orbital_hessian.cxx
-  orbital_steps.cxx
+
+  mcscf/fock_matrices.cxx
+  mcscf/casscf.cxx
+  mcscf/orbital_gradient.cxx
+  mcscf/orbital_energies.cxx
+  mcscf/orbital_rotation_utilities.cxx
+  mcscf/orbital_hessian.cxx
+  mcscf/orbital_steps.cxx
+
+  hamiltonian_generator/base.cxx
 )
 
+if(MACIS_ENABLE_TREXIO)
+  FetchContent_Declare( trexio
+    #GIT_REPOSITORY https://github.com/TREX-CoE/trexio
+    GIT_REPOSITORY https://github.com/wavefunction91/trexio
+    GIT_TAG cmake_fixup
+  )
+  FetchContent_MakeAvailable(trexio)
+  target_link_libraries(macis PUBLIC trexio::trexio)
+  target_sources(macis PRIVATE trexio.cxx)
+endif()
+
 target_include_directories( macis PUBLIC 
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
     $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
diff --git a/src/macis/hamiltonian_generator/base.cxx b/src/macis/hamiltonian_generator/base.cxx
new file mode 100644
index 00000000..dbfd61ac
--- /dev/null
+++ b/src/macis/hamiltonian_generator/base.cxx
@@ -0,0 +1,17 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#include "base.ipp"
+
+#include "fast_diagonals.ipp"
+
+namespace macis {
+
+template class HamiltonianGeneratorBase<double>;
+
+}
diff --git a/include/macis/hamiltonian_generator/rdms.hpp b/src/macis/hamiltonian_generator/base.ipp
similarity index 57%
rename from include/macis/hamiltonian_generator/rdms.hpp
rename to src/macis/hamiltonian_generator/base.ipp
index ba4cc0f9..a39ea749 100644
--- a/include/macis/hamiltonian_generator/rdms.hpp
+++ b/src/macis/hamiltonian_generator/base.ipp
@@ -7,7 +7,8 @@
  */
 
 #pragma once
-#include <macis/hamiltonian_generator.hpp>
+#include <macis/hamiltonian_generator/base.hpp>
+
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-parameter"
 #include <blas.hh>
@@ -16,17 +17,77 @@
 
 namespace macis {
 
-template <size_t N>
-void HamiltonianGenerator<N>::rotate_hamiltonian_ordm(const double* ordm) {
+template <typename Scalar>
+HamiltonianGeneratorBase<Scalar>::HamiltonianGeneratorBase(matrix_span_t T,
+                                              rank4_span_t V)
+    : norb_(T.extent(0)),
+      norb2_(norb_ * norb_),
+      norb3_(norb2_ * norb_),
+      T_pq_(T),
+      V_pqrs_(V) {
+  generate_integral_intermediates_(V_pqrs_);
+}
+
+template <typename Scalar>
+void HamiltonianGeneratorBase<Scalar>::generate_integral_intermediates_(rank4_span_t V) {
+  if(V.extent(0) != norb_ or V.extent(1) != norb_ or V.extent(2) != norb_ or
+     V.extent(3) != norb_)
+    throw std::runtime_error("V has incorrect dimensions");
+
+  size_t no = norb_;
+  size_t no2 = no * no;
+  size_t no3 = no2 * no;
+  size_t no4 = no3 * no;
+
+  // G(i,j,k,l) = V(i,j,k,l) - V(i,l,k,j)
+  G_pqrs_data_ = std::vector<Scalar>(begin(V), end(V));
+  G_pqrs_ = rank4_span_t(G_pqrs_data_.data(), no, no, no, no);
+  for(auto i = 0ul; i < no; ++i)
+    for(auto j = 0ul; j < no; ++j)
+      for(auto k = 0ul; k < no; ++k)
+        for(auto l = 0ul; l < no; ++l) {
+          G_pqrs_(i, j, k, l) -= V(i, l, k, j);
+        }
+
+  // G_red(i,j,k) = G(i,j,k,k) = G(k,k,i,j)
+  // V_red(i,j,k) = V(i,j,k,k) = V(k,k,i,j)
+  G_red_data_.resize(no3);
+  V_red_data_.resize(no3);
+  G_red_ = rank3_span_t(G_red_data_.data(), no, no, no);
+  V_red_ = rank3_span_t(V_red_data_.data(), no, no, no);
+  for(auto j = 0ul; j < no; ++j)
+    for(auto i = 0ul; i < no; ++i)
+      for(auto k = 0ul; k < no; ++k) {
+        G_red_(k, i, j) = G_pqrs_(k, k, i, j);
+        V_red_(k, i, j) = V(k, k, i, j);
+      }
+
+  // G2_red(i,j) = 0.5 * G(i,i,j,j)
+  // V2_red(i,j) = V(i,i,j,j)
+  G2_red_data_.resize(no2);
+  V2_red_data_.resize(no2);
+  G2_red_ = matrix_span<Scalar>(G2_red_data_.data(), no, no);
+  V2_red_ = matrix_span<Scalar>(V2_red_data_.data(), no, no);
+  for(auto j = 0ul; j < no; ++j)
+    for(auto i = 0ul; i < no; ++i) {
+      G2_red_(i, j) = 0.5 * G_pqrs_(i, i, j, j);
+      V2_red_(i, j) = V(i, i, j, j);
+    }
+}
+
+
+
+template <typename Scalar>
+void HamiltonianGeneratorBase<Scalar>::rotate_hamiltonian_ordm(const Scalar* ordm) {
   // SVD on ordm to get natural orbitals
-  std::vector<double> natural_orbitals(ordm, ordm + norb2_);
-  std::vector<double> S(norb_);
+  std::vector<Scalar> natural_orbitals(ordm, ordm + norb2_);
+  std::vector<Scalar> S(norb_);
   lapack::gesvd(lapack::Job::OverwriteVec, lapack::Job::NoVec, norb_, norb_,
                 natural_orbitals.data(), norb_, S.data(), NULL, 1, NULL, 1);
 
 #if 0
   {
-    std::vector<double> tmp(norb2_);
+    std::vector<Scalar> tmp(norb2_);
     blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
       norb_, norb_, norb_, 1., natural_orbitals.data(), norb_,
       natural_orbitals.data(), norb_, 0., tmp.data(), norb_ );
@@ -34,7 +95,7 @@ void HamiltonianGenerator<N>::rotate_hamiltonian_ordm(const double* ordm) {
     std::cout << "MAX = " << *std::max_element(tmp.begin(),tmp.end(),
       []( auto x, auto y ){ return std::abs(x) < std::abs(y); } ) << std::endl;
 
-    double max_diff = 0.;
+    Scalar max_diff = 0.;
     for( auto i = 0; i < norb_; ++i )
     for( auto j = i+1; j < norb_; ++j ) {
       max_diff = std::max( max_diff, 
@@ -44,7 +105,7 @@ void HamiltonianGenerator<N>::rotate_hamiltonian_ordm(const double* ordm) {
   }
 #endif
 
-  std::vector<double> tmp(norb3_ * norb_), tmp2(norb3_ * norb_);
+  std::vector<Scalar> tmp(norb3_ * norb_), tmp2(norb3_ * norb_);
 
   // Transform T
   // T <- N**H * T * N
@@ -95,7 +156,8 @@ void HamiltonianGenerator<N>::rotate_hamiltonian_ordm(const double* ordm) {
              natural_orbitals.data(), norb_, 0., V_pqrs_.data_handle(), norb3_);
 
   // Regenerate intermediates
-  generate_integral_intermediates(V_pqrs_);
+  generate_integral_intermediates_(V_pqrs_);
 }
 
 }  // namespace macis
+
diff --git a/include/macis/hamiltonian_generator/fast_diagonals.hpp b/src/macis/hamiltonian_generator/fast_diagonals.ipp
similarity index 86%
rename from include/macis/hamiltonian_generator/fast_diagonals.hpp
rename to src/macis/hamiltonian_generator/fast_diagonals.ipp
index 52cab806..a76292b7 100644
--- a/include/macis/hamiltonian_generator/fast_diagonals.hpp
+++ b/src/macis/hamiltonian_generator/fast_diagonals.ipp
@@ -11,8 +11,8 @@
 
 namespace macis {
 
-template <size_t N>
-double HamiltonianGenerator<N>::single_orbital_en(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::single_orbital_en(
     uint32_t orb, const std::vector<uint32_t>& ss_occ,
     const std::vector<uint32_t>& os_occ) const {
   // One electron component
@@ -28,8 +28,8 @@ double HamiltonianGenerator<N>::single_orbital_en(
   return orb_en;
 }
 
-template <size_t N>
-std::vector<double> HamiltonianGenerator<N>::single_orbital_ens(
+template <typename Scalar>
+std::vector<double> HamiltonianGeneratorBase<Scalar>::single_orbital_ens(
     size_t norb, const std::vector<uint32_t>& ss_occ,
     const std::vector<uint32_t>& os_occ) const {
   std::vector<double> ens(norb);
@@ -50,8 +50,8 @@ std::vector<double> HamiltonianGenerator<N>::single_orbital_ens(
   return ens;
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_single(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_single(
     // These refer to original determinant
     double hol_en, double par_en, uint32_t orb_hol, uint32_t orb_par,
     double orig_det_Hii) const {
@@ -59,8 +59,8 @@ double HamiltonianGenerator<N>::fast_diag_single(
          G2_red_(orb_hol, orb_par);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_single(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_single(
     // These refer to original determinant
     const std::vector<uint32_t>& ss_occ, const std::vector<uint32_t>& os_occ,
     uint32_t orb_hol, uint32_t orb_par, double orig_det_Hii) const {
@@ -69,8 +69,8 @@ double HamiltonianGenerator<N>::fast_diag_single(
   return fast_diag_single(hol_en, par_en, orb_hol, orb_par, orig_det_Hii);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_ss_double(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_ss_double(
     // These refer to original determinant
     double hol1_en, double hol2_en, double par1_en, double par2_en,
     uint32_t orb_hol1, uint32_t orb_hol2, uint32_t orb_par1, uint32_t orb_par2,
@@ -84,8 +84,8 @@ double HamiltonianGenerator<N>::fast_diag_ss_double(
          G2_red_(orb_par2, orb_hol2) - G2_red_(orb_hol2, orb_par2);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_ss_double(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_ss_double(
     // These refer to original determinant
     const std::vector<uint32_t>& ss_occ, const std::vector<uint32_t>& os_occ,
     uint32_t orb_hol1, uint32_t orb_hol2, uint32_t orb_par1, uint32_t orb_par2,
@@ -98,8 +98,8 @@ double HamiltonianGenerator<N>::fast_diag_ss_double(
                              orb_hol2, orb_par1, orb_par2, orig_det_Hii);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_os_double(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_os_double(
     // These refer to original determinant
     double en_holu, double en_hold, double en_paru, double en_pard,
     uint32_t orb_holu, uint32_t orb_hold, uint32_t orb_paru, uint32_t orb_pard,
@@ -111,8 +111,8 @@ double HamiltonianGenerator<N>::fast_diag_os_double(
          V2_red_(orb_paru, orb_hold) - V2_red_(orb_holu, orb_pard);
 }
 
-template <size_t N>
-double HamiltonianGenerator<N>::fast_diag_os_double(
+template <typename Scalar>
+double HamiltonianGeneratorBase<Scalar>::fast_diag_os_double(
     // These refer to original determinant
     const std::vector<uint32_t>& ss_occ, const std::vector<uint32_t>& os_occ,
     uint32_t orb_holu, uint32_t orb_hold, uint32_t orb_paru, uint32_t orb_pard,
diff --git a/src/macis/casscf.cxx b/src/macis/mcscf/casscf.cxx
similarity index 86%
rename from src/macis/casscf.cxx
rename to src/macis/mcscf/casscf.cxx
index d58ae4df..075f3777 100644
--- a/src/macis/casscf.cxx
+++ b/src/macis/mcscf/casscf.cxx
@@ -7,9 +7,9 @@
  */
 
 #include <macis/hamiltonian_generator/double_loop.hpp>
-#include <macis/util/cas.hpp>
-#include <macis/util/mcscf.hpp>
-#include <macis/util/mcscf_impl.hpp>
+#include <macis/mcscf/cas.hpp>
+#include <macis/mcscf/mcscf.hpp>
+#include <macis/mcscf/mcscf_impl.hpp>
 
 namespace macis {
 
@@ -19,7 +19,7 @@ double casscf_diis(MCSCFSettings settings, NumElectron nalpha,
                    size_t LDT, double* V, size_t LDV, double* A1RDM,
                    size_t LDD1, double* A2RDM,
                    size_t LDD2 MACIS_MPI_CODE(, MPI_Comm comm)) {
-  using generator_t = DoubleLoopHamiltonianGenerator<64>;
+  using generator_t = DoubleLoopHamiltonianGenerator<wfn_t<64>>;
   using functor_t = CASRDMFunctor<generator_t>;
   functor_t op;
   return mcscf_impl<functor_t>(op, settings, nalpha, nbeta, norb, ninact, nact,
diff --git a/src/macis/fock_matrices.cxx b/src/macis/mcscf/fock_matrices.cxx
similarity index 99%
rename from src/macis/fock_matrices.cxx
rename to src/macis/mcscf/fock_matrices.cxx
index a2708f14..40bd82b8 100644
--- a/src/macis/fock_matrices.cxx
+++ b/src/macis/mcscf/fock_matrices.cxx
@@ -7,7 +7,7 @@
  */
 
 #include <blas.hh>
-#include <macis/util/fock_matrices.hpp>
+#include <macis/mcscf/fock_matrices.hpp>
 #include <vector>
 
 namespace macis {
diff --git a/src/macis/orbital_energies.cxx b/src/macis/mcscf/orbital_energies.cxx
similarity index 95%
rename from src/macis/orbital_energies.cxx
rename to src/macis/mcscf/orbital_energies.cxx
index 97395b59..ef34c632 100644
--- a/src/macis/orbital_energies.cxx
+++ b/src/macis/mcscf/orbital_energies.cxx
@@ -6,7 +6,7 @@
  * See LICENSE.txt for details
  */
 
-#include <macis/util/orbital_energies.hpp>
+#include <macis/mcscf/orbital_energies.hpp>
 
 namespace macis {
 
diff --git a/src/macis/orbital_gradient.cxx b/src/macis/mcscf/orbital_gradient.cxx
similarity index 99%
rename from src/macis/orbital_gradient.cxx
rename to src/macis/mcscf/orbital_gradient.cxx
index 918bfbae..3abedee9 100644
--- a/src/macis/orbital_gradient.cxx
+++ b/src/macis/mcscf/orbital_gradient.cxx
@@ -7,8 +7,8 @@
  */
 
 #include <Eigen/Core>
-#include <macis/util/fock_matrices.hpp>
-#include <macis/util/orbital_gradient.hpp>
+#include <macis/mcscf/fock_matrices.hpp>
+#include <macis/mcscf/orbital_gradient.hpp>
 #include <macis/util/transform.hpp>
 #include <unsupported/Eigen/MatrixFunctions>
 
diff --git a/src/macis/orbital_hessian.cxx b/src/macis/mcscf/orbital_hessian.cxx
similarity index 99%
rename from src/macis/orbital_hessian.cxx
rename to src/macis/mcscf/orbital_hessian.cxx
index fcddd407..e5734bfa 100644
--- a/src/macis/orbital_hessian.cxx
+++ b/src/macis/mcscf/orbital_hessian.cxx
@@ -9,7 +9,7 @@
 #include <blas.hh>
 #include <iomanip>
 #include <iostream>
-#include <macis/util/orbital_hessian.hpp>
+#include <macis/mcscf/orbital_hessian.hpp>
 #include <vector>
 
 namespace macis {
diff --git a/src/macis/orbital_rotation_utilities.cxx b/src/macis/mcscf/orbital_rotation_utilities.cxx
similarity index 98%
rename from src/macis/orbital_rotation_utilities.cxx
rename to src/macis/mcscf/orbital_rotation_utilities.cxx
index 8912902b..8ffdfe35 100644
--- a/src/macis/orbital_rotation_utilities.cxx
+++ b/src/macis/mcscf/orbital_rotation_utilities.cxx
@@ -6,7 +6,7 @@
  * See LICENSE.txt for details
  */
 
-#include <macis/util/orbital_rotation_utilities.hpp>
+#include <macis/mcscf/orbital_rotation_utilities.hpp>
 
 namespace macis {
 
diff --git a/src/macis/orbital_steps.cxx b/src/macis/mcscf/orbital_steps.cxx
similarity index 99%
rename from src/macis/orbital_steps.cxx
rename to src/macis/mcscf/orbital_steps.cxx
index 95fd4f43..d50a4a5c 100644
--- a/src/macis/orbital_steps.cxx
+++ b/src/macis/mcscf/orbital_steps.cxx
@@ -6,8 +6,8 @@
  * See LICENSE.txt for details
  */
 
-#include <macis/util/orbital_hessian.hpp>
-#include <macis/util/orbital_steps.hpp>
+#include <macis/mcscf/orbital_hessian.hpp>
+#include <macis/mcscf/orbital_steps.hpp>
 
 namespace macis {
 
diff --git a/src/macis/moller_plesset.cxx b/src/macis/moller_plesset.cxx
index 5e00d85c..49982ed1 100644
--- a/src/macis/moller_plesset.cxx
+++ b/src/macis/moller_plesset.cxx
@@ -8,8 +8,8 @@
 
 #include <iostream>
 #include <lapack.hh>
+#include <macis/mcscf/orbital_energies.hpp>
 #include <macis/util/moller_plesset.hpp>
-#include <macis/util/orbital_energies.hpp>
 
 namespace macis {
 
diff --git a/src/macis/trexio.cxx b/src/macis/trexio.cxx
new file mode 100644
index 00000000..8b260c83
--- /dev/null
+++ b/src/macis/trexio.cxx
@@ -0,0 +1,167 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#include <array>
+#include <iostream>
+#include <macis/util/trexio.hpp>
+#include <vector>
+
+namespace macis {
+
+trexio_exception::trexio_exception(std::string func_name, trexio_exit_code rc)
+    : msg_("TREXIO Error: " + func_name +
+           "\n  Error Message: " + std::string(trexio_string_of_error(rc))) {}
+
+const char* trexio_exception::what() const noexcept { return msg_.c_str(); }
+
+#define TREXIO_EXCEPTION(rc) throw trexio_exception(__PRETTY_FUNCTION__, rc)
+
+TREXIOFile::TREXIOFile(std::string name, char mode, int backend) {
+  trexio_exit_code rc;
+  file_handle_ = trexio_open(name.c_str(), mode, backend, &rc);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+TREXIOFile::~TREXIOFile() noexcept {
+  if(file_handle_) trexio_close(file_handle_);
+}
+
+TREXIOFile::TREXIOFile(TREXIOFile&& other) noexcept {
+  this->file_handle_ = other.file_handle_;
+  other.file_handle_ = nullptr;
+}
+
+int64_t TREXIOFile::read_mo_num() const {
+  int64_t nmo;
+  auto rc = trexio_read_mo_num_64(file_handle_, &nmo);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+  return nmo;
+}
+
+int64_t TREXIOFile::read_mo_2e_int_eri_size() const {
+  int64_t neri;
+  auto rc = trexio_read_mo_2e_int_eri_size(file_handle_, &neri);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+  return neri;
+}
+
+double TREXIOFile::read_nucleus_repulsion() const {
+  double E;
+  auto rc = trexio_read_nucleus_repulsion(file_handle_, &E);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+  return E;
+}
+
+void TREXIOFile::read_mo_1e_int_core_hamiltonian(double* h) const {
+  auto rc = trexio_read_mo_1e_int_core_hamiltonian(file_handle_, h);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::read_mo_2e_int_eri(double* V) const {
+  const auto norb = read_mo_num();
+  const auto neri = read_mo_2e_int_eri_size();
+  const size_t nbatch = 100000;
+  std::vector<std::array<int32_t, 4>> idx_batch(nbatch);
+  std::vector<double> V_batch(nbatch);
+
+  size_t ioff = 0;
+  int64_t icount = nbatch;
+  while(icount == nbatch) {
+    if(ioff < neri) {
+      trexio_read_mo_2e_int_eri(file_handle_, ioff, &icount,
+                                idx_batch[0].data(), V_batch.data());
+    } else {
+      icount = 0;
+    }
+
+    for(size_t ii = 0; ii < icount; ++ii) {
+      const auto [i, j, k, l] = idx_batch[ii];
+      const auto v = V_batch[ii];
+
+      V[i + j * norb + k * norb * norb + l * norb * norb * norb] = v;
+      V[i + j * norb + l * norb * norb + k * norb * norb * norb] = v;
+      V[j + i * norb + k * norb * norb + l * norb * norb * norb] = v;
+      V[j + i * norb + l * norb * norb + k * norb * norb * norb] = v;
+      V[k + l * norb + i * norb * norb + j * norb * norb * norb] = v;
+      V[k + l * norb + j * norb * norb + i * norb * norb * norb] = v;
+      V[l + k * norb + i * norb * norb + j * norb * norb * norb] = v;
+      V[l + k * norb + j * norb * norb + i * norb * norb * norb] = v;
+    }
+    ioff += icount;
+  }
+}
+
+int64_t TREXIOFile::read_determinant_num() const {
+  int64_t ndet;
+  auto rc = trexio_read_determinant_num_64(file_handle_, &ndet);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+  return ndet;
+}
+
+int32_t TREXIOFile::get_determinant_int64_num() const {
+  int32_t n64;
+  auto rc = trexio_get_int64_num(file_handle_, &n64);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+  return n64;
+}
+
+void TREXIOFile::read_determinant_list(int64_t ndet, int64_t* dets,
+                                       int64_t ioff) const {
+  int64_t icount = ndet;
+  auto rc = trexio_read_determinant_list(file_handle_, ioff, &icount, dets);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::write_mo_num(int64_t nmo) {
+  auto rc = trexio_write_mo_num_64(file_handle_, nmo);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::write_nucleus_repulsion(double E) {
+  auto rc = trexio_write_nucleus_repulsion(file_handle_, E);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::write_mo_1e_int_core_hamiltonian(const double* h) {
+  auto rc = trexio_write_mo_1e_int_core_hamiltonian(file_handle_, h);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::write_mo_2e_int_eri(const double* V) {
+  const auto norb = read_mo_num();
+  const auto npair = (norb * (norb + 1)) / 2;
+  const auto nquad = (npair * (npair + 1)) / 2;
+  std::vector<double> V_compress(nquad);
+  std::vector<std::array<int32_t, 4>> idx(nquad);
+
+  size_t ijkl = 0;
+  for(size_t i = 0, ij = 0; i < norb; ++i)
+    for(size_t j = 0; j <= i; ++j, ++ij) {
+      for(size_t k = 0, kl = 0; k < norb; ++k)
+        for(size_t l = 0; l <= k; ++l, ++kl) {
+          if(kl > ij) continue;
+          V_compress.at(ijkl) =
+              V[i + j * norb + k * norb * norb + l * norb * norb * norb];
+          idx.at(ijkl) = {i, j, k, l};
+          ijkl++;
+        }
+    }
+
+  auto rc = trexio_write_mo_2e_int_eri(file_handle_, 0, nquad, idx[0].data(),
+                                       V_compress.data());
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+void TREXIOFile::write_determinant_list(int64_t ndet, const int64_t* dets,
+                                        int64_t ioff) {
+  int64_t icount = ndet;
+  auto rc = trexio_write_determinant_list(file_handle_, ioff, icount, dets);
+  if(rc != TREXIO_SUCCESS) TREXIO_EXCEPTION(rc);
+}
+
+}  // namespace macis
diff --git a/src/sparsexx/CMakeLists.txt b/src/sparsexx/CMakeLists.txt
index 935419db..e9019082 100644
--- a/src/sparsexx/CMakeLists.txt
+++ b/src/sparsexx/CMakeLists.txt
@@ -93,7 +93,7 @@ if( SPARSEXX_ENABLE_RANGES_V3 )
     add_subdirectory(${range_v3_download_SOURCE_DIR} ${range_v3_download_BINARY_DIR})
     add_library( ranges INTERFACE IMPORTED )
     target_link_libraries( ranges INTERFACE range-v3 )
-    target_compile_definitions( ranges INTERFACE "SPARSEXX_ENABLE_RANGES_V3=1" )
+    #target_compile_definitions( ranges INTERFACE "SPARSEXX_ENABLE_RANGES_V3=1" )
   endif()
   list(APPEND SPARSEXX_EXTERNAL_LIBRARIES ranges)
 endif()
@@ -128,7 +128,7 @@ endif()
 if( TARGET OpenMP::OpenMP_CXX )
   target_link_libraries( sparsexx PUBLIC OpenMP::OpenMP_CXX )
 endif()
-target_link_libraries     ( sparsexx PRIVATE ${SPARSEXX_EXTERNAL_LIBRARIES} )
+target_link_libraries     ( sparsexx PUBLIC ${SPARSEXX_EXTERNAL_LIBRARIES} )
 target_include_directories( sparsexx PUBLIC 
     $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
     $<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
diff --git a/src/sparsexx/include/sparsexx/matrix_types/conversions.hpp b/src/sparsexx/include/sparsexx/matrix_types/conversions.hpp
index 563cdeac..073cffbf 100644
--- a/src/sparsexx/include/sparsexx/matrix_types/conversions.hpp
+++ b/src/sparsexx/include/sparsexx/matrix_types/conversions.hpp
@@ -74,15 +74,40 @@ csr_matrix<T, index_t, Alloc>::csr_matrix(
   const auto& colind_coo = other.colind();
   const auto& nzval_coo = other.nzval();
 
-  // Compute rowptr
+// Compute rowptr
+#if 0
   rowptr_.at(0) = other.indexing();
   auto cur_row = 0;
   for(size_type i = 0; i < nnz_; ++i)
-    if(rowind_coo[i] != (cur_row + indexing_)) {
+    while(rowind_coo[i] != (cur_row + indexing_)) {
       cur_row++;
       rowptr_.at(cur_row) = i + indexing_;
     }
   rowptr_.at(m_) = nnz_ + indexing_;
+#else
+  if(indexing_) throw std::runtime_error("NONZERO INDEXING");
+  for(size_type i = 0; i < nnz_; ++i) {
+    rowptr_[rowind_coo[i] - indexing_ + 1]++;
+  }
+  for(size_type i = 0; i < m_; ++i) {
+    rowptr_[i + 1] += rowptr_[i];
+  }
+  if(indexing_)
+    for(size_type i = 0; i < m_ + 1; ++i) {
+      rowptr_[i] += indexing_;
+    }
+#endif
+
+  // for(size_type i = 0; i < m_; ++i) {
+  //   auto row_st = rowptr_[i];
+  //   auto row_en = rowptr_[i+1];
+  //   for(size_type j = row_st; j < row_en; ++j) {
+  //     if(rowind_coo[j] != i) throw std::runtime_error("ROWPTR WRONG");
+  //   }
+  //   if(!std::is_sorted(colind_coo.begin() + row_st, colind_coo.begin() +
+  //   row_en))
+  //     throw std::runtime_error("COLIND WRONG");
+  // }
 
   std::copy(colind_coo.begin(), colind_coo.end(), colind_.begin());
   std::copy(nzval_coo.begin(), nzval_coo.end(), nzval_.begin());
@@ -105,7 +130,7 @@ csc_matrix<T, index_t, Alloc>::csc_matrix(
   colptr_.at(0) = other.indexing();
   auto cur_col = 0;
   for(size_t i = 0; i < nnz_; ++i)
-    if(colind_coo[i] != (cur_col + indexing_)) {
+    while(colind_coo[i] != (cur_col + indexing_)) {
       cur_col++;
       colptr_.at(cur_col) = i + indexing_;
     }
diff --git a/src/sparsexx/include/sparsexx/matrix_types/coo_matrix.hpp b/src/sparsexx/include/sparsexx/matrix_types/coo_matrix.hpp
index 8e47b9ee..e0342cf2 100644
--- a/src/sparsexx/include/sparsexx/matrix_types/coo_matrix.hpp
+++ b/src/sparsexx/include/sparsexx/matrix_types/coo_matrix.hpp
@@ -212,6 +212,15 @@ class coo_matrix {
 
   void expand_from_triangle();
 
+  template <bool Check>
+  inline void insert(index_type i, index_type j, value_type v) noexcept {
+    static_assert(not Check, "insert check NYI");
+    rowind_.emplace_back(i);
+    colind_.emplace_back(j);
+    nzval_.emplace_back(v);
+    nnz_++;
+  }
+
 #ifdef SPARSEXX_ENABLE_CEREAL
   template <class Archive>
   void serialize(Archive& ar) {
diff --git a/src/sparsexx/include/sparsexx/matrix_types/coo_matrix_ops.hpp b/src/sparsexx/include/sparsexx/matrix_types/coo_matrix_ops.hpp
index c6a2c7f0..e99bd7c1 100644
--- a/src/sparsexx/include/sparsexx/matrix_types/coo_matrix_ops.hpp
+++ b/src/sparsexx/include/sparsexx/matrix_types/coo_matrix_ops.hpp
@@ -9,10 +9,11 @@
 #pragma once
 
 #include <iostream>
+#include <sparsexx/sparsexx_config.hpp>
 
 #include "coo_matrix.hpp"
 
-#if SPARSEXX_ENABLE_RANGES_V3
+#ifdef SPARSEXX_ENABLE_RANGES_V3
 #include <range/v3/all.hpp>
 #endif
 #include <algorithm>
@@ -22,7 +23,7 @@ namespace sparsexx {
 
 template <typename T, typename index_t, typename Alloc>
 void coo_matrix<T, index_t, Alloc>::sort_by_row_index() {
-#if SPARSEXX_ENABLE_RANGES_V3
+#ifdef SPARSEXX_ENABLE_RANGES_V3
   auto coo_zip = ranges::views::zip(rowind_, colind_, nzval_);
 
   // Sort lex by row index
@@ -72,9 +73,7 @@ void coo_matrix<T, index_t, Alloc>::sort_by_row_index() {
 
 template <typename T, typename index_t, typename Alloc>
 void coo_matrix<T, index_t, Alloc>::expand_from_triangle() {
-  std::cout << "Expanding Triangle" << std::endl;
-
-#if SPARSEXX_ENABLE_RANGES_V3
+#ifdef SPARSEXX_ENABLE_RANGES_V3
 
   auto idx_zip = ranges::views::zip(rowind_, colind_);
 
@@ -110,7 +109,7 @@ void coo_matrix<T, index_t, Alloc>::expand_from_triangle() {
   std::cout << "UT " << upper_triangle << std::endl;
   if(diagonal or full_matrix) return;
 
-  std::cout << "Performing Expansion..." << std::endl;
+  // std::cout << "Performing Expansion..." << std::endl;
   size_t new_nnz = 2 * nnz_ - n_;
   rowind_.reserve(new_nnz);
   colind_.reserve(new_nnz);
@@ -132,7 +131,7 @@ void coo_matrix<T, index_t, Alloc>::expand_from_triangle() {
 
 template <typename T, typename index_t, typename Alloc>
 void coo_matrix<T, index_t, Alloc>::sort_by_col_index() {
-#if SPARSEXX_ENABLE_RANGES_V3
+#ifdef SPARSEXX_ENABLE_RANGES_V3
   auto coo_zip = ranges::views::zip(rowind_, colind_, nzval_);
 
   // Sort lex by row index
diff --git a/src/sparsexx/include/sparsexx/spblas/pspmbv.hpp b/src/sparsexx/include/sparsexx/spblas/pspmbv.hpp
index 811f4a69..e18f3ad3 100644
--- a/src/sparsexx/include/sparsexx/spblas/pspmbv.hpp
+++ b/src/sparsexx/include/sparsexx/spblas/pspmbv.hpp
@@ -50,6 +50,8 @@ struct spmv_info {
     int comm_size = recv_offsets.size();
     for(int i = 0; i < comm_size; ++i)
       if(recv_counts[i]) {
+        if(recv_counts[i] > std::numeric_limits<int32_t>::max())
+          throw "DIE IN RECV";
         reqs.emplace_back(
             detail::mpi_irecv(X + recv_offsets[i], recv_counts[i], i, 0, comm));
       }
@@ -62,6 +64,8 @@ struct spmv_info {
     int comm_size = send_offsets.size();
     for(int i = 0; i < comm_size; ++i)
       if(send_counts[i]) {
+        if(send_counts[i] > std::numeric_limits<int32_t>::max())
+          throw "DIE IN SEND";
         reqs.emplace_back(
             detail::mpi_isend(X + send_offsets[i], send_counts[i], i, 0, comm));
       }
diff --git a/src/sparsexx/tests/CMakeLists.txt b/src/sparsexx/tests/CMakeLists.txt
index 52e12cdb..17ba4589 100644
--- a/src/sparsexx/tests/CMakeLists.txt
+++ b/src/sparsexx/tests/CMakeLists.txt
@@ -24,7 +24,7 @@ if( NOT Catch2_FOUND )
 
 endif()
 
-add_executable( sparsexx_test ut_main.cxx test_io.cxx test_util.cxx test_graph.cxx test_spmbv.cxx )
+add_executable( sparsexx_test ut_main.cxx test_types.cxx test_io.cxx test_util.cxx test_graph.cxx test_spmbv.cxx )
 target_link_libraries( sparsexx_test PUBLIC sparsexx Catch2::Catch2 )
 target_compile_definitions( sparsexx_test PUBLIC
   "SPARSEXX_DATA_DIR=\"${CMAKE_CURRENT_LIST_DIR}\""
diff --git a/src/sparsexx/tests/test_types.cxx b/src/sparsexx/tests/test_types.cxx
new file mode 100644
index 00000000..09b18c03
--- /dev/null
+++ b/src/sparsexx/tests/test_types.cxx
@@ -0,0 +1,34 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#include <sparsexx/matrix_types/coo_matrix.hpp>
+#include <sparsexx/matrix_types/csr_matrix.hpp>
+
+#include "catch2/catch.hpp"
+
+TEST_CASE("COO Matrix", "[matrix_types]") {
+  using mat_type = sparsexx::coo_matrix<double, int32_t>;
+
+  SECTION("Insert") {
+    size_t n = 6;
+    mat_type A(n, n, 0, 0);  // empty matrix zero-indexing
+    for(int i = 0; i < n; ++i) {
+      A.insert<false>(i, i, 2);
+      if(i < n - 1) A.insert<false>(i, i + 1, 1);
+      if(i > 0) A.insert<false>(i, i - 1, 1);
+    }
+    A.sort_by_row_index();  // Put into canonical order
+    REQUIRE(A.nnz() == 16);
+    REQUIRE(A.rowind() == std::vector<int32_t>{0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3,
+                                               4, 4, 4, 5, 5});
+    REQUIRE(A.colind() == std::vector<int32_t>{0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4,
+                                               3, 4, 5, 4, 5});
+    REQUIRE(A.nzval() == std::vector<double>{2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1,
+                                             2, 1, 1, 2});
+  }
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 038f9dfd..120c832f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -26,6 +26,8 @@ target_include_directories( macis_test PUBLIC ${PROJECT_BINARY_DIR}/tests )
 
 add_executable( standalone_driver standalone_driver.cxx ini_input.cxx )
 target_link_libraries( standalone_driver PUBLIC macis )
+add_executable( fcidump_to_trexio fcidump_to_trexio.cxx )
+target_link_libraries( fcidump_to_trexio PUBLIC macis )
 
 set(REF_DATA_PREFIX "${PROJECT_SOURCE_DIR}/tests/ref_data")
 configure_file( ut_common.hpp.in ${PROJECT_BINARY_DIR}/tests/ut_common.hpp)
diff --git a/tests/asci.cxx b/tests/asci.cxx
index 5542db83..40e8c406 100644
--- a/tests/asci.cxx
+++ b/tests/asci.cxx
@@ -12,6 +12,7 @@
 #include <iostream>
 #include <macis/asci/determinant_contributions.hpp>
 #include <macis/asci/grow.hpp>
+#include <macis/asci/pt2.hpp>
 #include <macis/asci/refine.hpp>
 #include <macis/bitset_operations.hpp>
 #include <macis/hamiltonian_generator/double_loop.hpp>
@@ -44,11 +45,29 @@ size_t top_set_ordinal(std::bitset<NBits> word, size_t NSet) {
   return ord;
 }
 
+template <size_t N>
+auto make_quad(unsigned i, unsigned j, unsigned k, unsigned l) {
+  using wfn_type = macis::wfn_t<N>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using constraint_type = macis::alpha_constraint<wfn_traits>;
+  using string_type = typename constraint_type::constraint_type;
+
+  string_type C = 0;
+  C.flip(i).flip(j).flip(k).flip(l);
+  string_type B = 1;
+  B <<= l;
+  B = B.to_ullong() - 1;
+
+  return constraint_type(C, B, l);
+}
+
 TEST_CASE("Triplets") {
   constexpr size_t num_bits = 64;
   size_t norb = 32;
 
   using wfn_less_comparator = macis::bitset_less_comparator<num_bits>;
+  using wfn_type = macis::wfn_t<num_bits>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
 
   // Generate ficticious wfns
   std::vector<macis::wfn_t<num_bits>> wfn_a = {
@@ -158,12 +177,10 @@ TEST_CASE("Triplets") {
         triplets.emplace_back(i, j, k);
       }
 
-  const auto overfill = macis::full_mask<num_bits>(norb);
-
   std::vector<size_t> new_triplet_hist(triplet_hist.size(), 0);
   for(auto [i, j, k] : triplets) {
     const auto label = i * 32 * 32 + j * 32 + k;
-    auto [T, B, T_min] = macis::make_triplet<num_bits>(i, j, k);
+    auto constraint = macis::make_triplet<num_bits>(i, j, k);
 
     for(auto det : wfn_a_uniq) {
       const size_t nocc = det.count();
@@ -172,7 +189,7 @@ TEST_CASE("Triplets") {
       const size_t n_doubles = (n_singles * (n_singles - nocc - nvir + 1)) / 4;
 
       new_triplet_hist[label] += macis::constraint_histogram(
-          det, n_singles, n_doubles, T, overfill, B);
+          wfn_traits::alpha_string(det), n_singles, n_doubles, constraint);
     }
   }
 
@@ -190,7 +207,7 @@ TEST_CASE("Triplets") {
   for(auto [i, j, k, l] : quads) {
     const size_t label =
         i * 32ul * 32ul * 32ul + j * 32ul * 32ul + k * 32ul + l;
-    auto [Q, B, Q_min] = macis::make_quad<num_bits>(i, j, k, l);
+    auto constraint = make_quad<num_bits>(i, j, k, l);
 
     for(auto det : wfn_a_uniq) {
       const size_t nocc = det.count();
@@ -199,13 +216,149 @@ TEST_CASE("Triplets") {
       const size_t n_doubles = (n_singles * (n_singles - nocc - nvir + 1)) / 4;
 
       new_quad_hist[label] += macis::constraint_histogram(
-          det, n_singles, n_doubles, Q, overfill, B);
+          wfn_traits::alpha_string(det), n_singles, n_doubles, constraint);
     }
   }
 
   REQUIRE(quad_hist == new_quad_hist);
 }
 
+TEST_CASE("Constraints") {
+  using wfn_type = macis::wfn_t<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using spin_wfn_type = typename wfn_traits::spin_wfn_type;
+  using spin_wfn_traits = macis::wavefunction_traits<spin_wfn_type>;
+  using wfn_comparator = typename wfn_traits::spin_comparator;
+  using spin_wfn_comparator = typename spin_wfn_traits::wfn_comparator;
+
+  using constraint_type = macis::alpha_constraint<wfn_traits>;
+
+  const size_t norb = 10;
+  size_t nalpha, nbeta;
+
+  SECTION("Closed Shell") {
+    nalpha = 6;
+    nbeta = 6;
+  }
+
+  SECTION("Open Shell") {
+    nalpha = 6;
+    nbeta = 3;
+  }
+
+  // Generate Hilbert Space
+  auto dets = macis::generate_hilbert_space<wfn_type>(norb, nalpha, nbeta);
+  std::sort(dets.begin(), dets.end(), wfn_comparator{});  // lex sort
+
+  // Get Alpha Compression Data
+  std::vector<std::pair<spin_wfn_type, size_t>> unique_alpha;
+  unique_alpha.push_back({wfn_traits::alpha_string(dets[0]), 1});
+  for(size_t i = 1; i < dets.size(); ++i) {
+    auto& [cur_alpha, cur_count] = unique_alpha.back();
+    auto alpha_i = wfn_traits::alpha_string(dets[i]);
+    if(alpha_i == cur_alpha) {
+      cur_count++;
+    } else {
+      unique_alpha.push_back({alpha_i, 1});
+    }
+  }
+
+  // Get offsets
+  std::vector<size_t> unique_alpha_offsets(unique_alpha.size());
+  std::transform_exclusive_scan(
+      unique_alpha.begin(), unique_alpha.end(), unique_alpha_offsets.begin(),
+      0ul, std::plus<size_t>(), [](auto p) { return p.second; });
+
+  // size_t ntot = std::accumulate( unique_alpha.begin(), unique_alpha.end(),
+  //   0ul, [](auto s, auto a){ return s + a.second; } );
+  // REQUIRE(ntot == dets.size());
+
+  // Generate constraints
+  std::vector<constraint_type> triplets, quads;
+  for(int i = 0; i < norb; ++i)
+    for(int j = 0; j < i; ++j)
+      for(int k = 0; k < j; ++k) {
+        triplets.emplace_back(macis::make_triplet<64>(i, j, k));
+        for(int l = 0; l < k; ++l) {
+          quads.emplace_back(make_quad<64>(i, j, k, l));
+        }
+      }
+
+  // Check doubles
+  auto check_doubles = [](auto det, auto C, auto& doubles) {
+    // Sanity check
+    auto [_O, _V] = macis::generate_constraint_double_excitations(det, C);
+    REQUIRE(std::all_of(_O.begin(), _O.end(),
+                        [](auto e) { return spin_wfn_traits::count(e) == 2; }));
+    REQUIRE(std::all_of(_V.begin(), _V.end(),
+                        [](auto e) { return spin_wfn_traits::count(e) == 2; }));
+
+    // Extract all double excitations that satisfy the constraint
+    std::vector<spin_wfn_type> doubles_c;
+    std::copy_if(doubles.begin(), doubles.end(), std::back_inserter(doubles_c),
+                 [=](auto d) { return C.satisfies_constraint(d); });
+
+    // Generate all double excitations that satisfy the constraint
+    std::vector<spin_wfn_type> doubles_g;
+    macis::generate_constraint_doubles(det, C, doubles_g);
+
+    // Compare and check counting
+    std::sort(doubles_c.begin(), doubles_c.end(), spin_wfn_comparator{});
+    std::sort(doubles_g.begin(), doubles_g.end(), spin_wfn_comparator{});
+    REQUIRE(doubles_c == doubles_g);
+    REQUIRE(doubles_g.size() == macis::count_constraint_doubles(det, C));
+  };
+
+  // Check singles
+  auto check_singles = [](auto det, auto C, auto& singles) {
+    // Extract all double excitations that satisfy the constraint
+    std::vector<spin_wfn_type> singles_c;
+    std::copy_if(singles.begin(), singles.end(), std::back_inserter(singles_c),
+                 [=](auto d) { return C.satisfies_constraint(d); });
+
+    // Generate all double excitations that satisfy the constraint
+    std::vector<spin_wfn_type> singles_g;
+    macis::generate_constraint_singles(det, C, singles_g);
+
+    // Compare and check counting
+    std::sort(singles_c.begin(), singles_c.end(), spin_wfn_comparator{});
+    std::sort(singles_g.begin(), singles_g.end(), spin_wfn_comparator{});
+    REQUIRE(singles_c == singles_g);
+    REQUIRE(singles_g.size() == macis::count_constraint_singles(det, C));
+  };
+
+  for(auto i = 0; i < unique_alpha.size(); ++i) {
+    auto det = unique_alpha[i].first;
+
+    auto det_triplet = top_set_indices<3>(det);
+    auto det_quad = top_set_indices<4>(det);
+
+    std::vector<spin_wfn_type> singles, doubles;
+    macis::generate_singles(norb, det, singles);
+    macis::generate_doubles(norb, det, doubles);
+
+    for(auto C : triplets) {
+      // Check validity of constraint check
+      auto inds = top_set_indices<3>(C.C());
+      REQUIRE(C.C_min() == inds.back());
+      REQUIRE(C.satisfies_constraint(det) == (inds == det_triplet));
+
+      check_singles(det, C, singles);
+      check_doubles(det, C, doubles);
+    }
+
+    for(auto C : quads) {
+      // Check validity of constraint check
+      auto inds = top_set_indices<4>(C.C());
+      REQUIRE(C.C_min() == inds.back());
+      REQUIRE(C.satisfies_constraint(det) == (inds == det_quad));
+
+      check_singles(det, C, singles);
+      check_doubles(det, C, doubles);
+    }
+  }
+}
+
 TEST_CASE("ASCI") {
   MACIS_MPI_CODE(MPI_Barrier(MPI_COMM_WORLD);)
   using macis::NumActive;
@@ -231,7 +384,9 @@ TEST_CASE("ASCI") {
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
   // Hamiltonian Genereator
-  using generator_t = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_type = macis::wfn_t<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_t = macis::DoubleLoopHamiltonianGenerator<wfn_type>;
   generator_t ham_gen(
       macis::matrix_span<double>(T.data(), norb, norb),
       macis::rank4_span<double>(V.data(), norb, norb, norb, norb));
@@ -242,8 +397,8 @@ TEST_CASE("ASCI") {
   macis::MCSCFSettings mcscf_settings;
 
   // HF guess
-  std::vector<macis::wfn_t<64>> dets = {
-      macis::canonical_hf_determinant<64>(nalpha, nbeta)};
+  std::vector<wfn_type> dets = {
+      wfn_traits::canonical_hf_determinant(nalpha, nbeta)};
   ;
   std::vector<double> C = {1.0};
   double E0 = ham_gen.matrix_element(dets[0], dets[0]);
@@ -254,7 +409,8 @@ TEST_CASE("ASCI") {
       asci_settings, mcscf_settings, E0, std::move(dets), std::move(C), ham_gen,
       norb MACIS_MPI_CODE(, MPI_COMM_WORLD));
 
-  REQUIRE(E0 == Approx(-8.542926243842e+01));
+  // std::cout << E0 - -8.542926243842e+01 << std::endl;
+  REQUIRE(std::abs(E0 - -8.542926243842e+01) < 1e-11);
   REQUIRE(dets.size() == 10000);
   REQUIRE(C.size() == 10000);
   REQUIRE(std::inner_product(C.begin(), C.end(), C.begin(), 0.0) ==
@@ -265,12 +421,22 @@ TEST_CASE("ASCI") {
       asci_settings, mcscf_settings, E0, std::move(dets), std::move(C), ham_gen,
       norb MACIS_MPI_CODE(, MPI_COMM_WORLD));
 
-  REQUIRE(E0 == Approx(-8.542925964708e+01));
+  REQUIRE(std::abs(E0 - -8.542925964708e+01) < 1e-11);
   REQUIRE(dets.size() == 10000);
   REQUIRE(C.size() == 10000);
   REQUIRE(std::inner_product(C.begin(), C.end(), C.begin(), 0.0) ==
           Approx(1.0));
 
+  // ASCI-PT2
+  auto EPT2 = macis::asci_pt2_constraint(
+      dets.begin(), dets.end(), E0, C, norb, ham_gen.T(), ham_gen.G_red(),
+      ham_gen.V_red(), ham_gen.G(), ham_gen.V(),
+      ham_gen MACIS_MPI_CODE(, MPI_COMM_WORLD));
+
+  // std::cout << std::scientific << std::setprecision(12);
+  // std::cout << EPT2 << std::endl;
+  REQUIRE(std::abs(EPT2 - -5.701585096318e-03) < 1e-10);
+
   MACIS_MPI_CODE(MPI_Barrier(MPI_COMM_WORLD);)
   spdlog::drop_all();
 }
diff --git a/tests/bitset_operations.cxx b/tests/bitset_operations.cxx
index e69516a8..2fd4673e 100644
--- a/tests/bitset_operations.cxx
+++ b/tests/bitset_operations.cxx
@@ -11,6 +11,9 @@
 
 #include "ut_common.hpp"
 
+template <size_t N>
+using bs = std::bitset<N>;
+
 template <size_t N>
 void uint128_conversion_test() {
   using ref_type = macis::uint128_t;
@@ -37,7 +40,134 @@ void uint128_conversion_test() {
 }
 
 template <size_t N>
-using bs = std::bitset<N>;
+void mask_test() {
+  bs<N> full = ~bs<N>(0);
+  bs<N> full_128 = N > 128 ? full >> (N - 128) : 0;
+  bs<N> full_64 = N > 64 ? full >> (N - 64) : 0;
+  bs<N> full_32 = N > 32 ? full >> (N - 32) : 0;
+
+  REQUIRE(full == macis::full_mask<N>());
+  if constexpr(N > 128) REQUIRE(full_128 == macis::full_mask<128, N>());
+  if constexpr(N > 64) REQUIRE(full_64 == macis::full_mask<64, N>());
+  if constexpr(N > 32) REQUIRE(full_32 == macis::full_mask<32, N>());
+
+  REQUIRE(full == macis::full_mask<N>(N));
+  if constexpr(N > 128) REQUIRE(full_128 == macis::full_mask<N>(128));
+  if constexpr(N > 64) REQUIRE(full_64 == macis::full_mask<N>(64));
+  if constexpr(N > 32) REQUIRE(full_32 == macis::full_mask<N>(32));
+}
+
+template <size_t N>
+void ffs_test() {
+  REQUIRE(macis::ffs(bs<N>()) == 0);
+  REQUIRE(macis::ffs(bs<N>(1)) == 1);
+
+  bs<N> a(0xABC);
+  REQUIRE(macis::ffs(a) == 3);
+  if(N > 32) REQUIRE(macis::ffs(a << 32) == 3 + 32);
+  if(N > 64) REQUIRE(macis::ffs(a << 64) == 3 + 64);
+  if(N > 128) REQUIRE(macis::ffs(a << 128) == 3 + 128);
+}
+
+template <size_t N>
+void fls_test() {
+  REQUIRE(macis::fls(bs<N>()) == UINT32_MAX);
+  REQUIRE(macis::fls(bs<N>(1)) == 0);
+
+  bs<N> a(0xDEADBEEF);
+  REQUIRE(macis::fls(a) == 31);
+  if(N > 32) REQUIRE(macis::fls(a << 32) == 31 + 32);
+  if(N > 64) REQUIRE(macis::fls(a << 64) == 31 + 64);
+  if(N > 128) REQUIRE(macis::fls(a << 128) == 31 + 128);
+}
+
+template <size_t N>
+void indices_test() {
+  std::vector<uint32_t> indices = {5};
+  if(N > 32) indices.emplace_back(6 + 32);
+  if(N > 64) indices.emplace_back(7 + 64);
+  if(N > 128) indices.emplace_back(8 + 128);
+
+  bs<N> one(1), a(0);
+  for(auto i : indices) a |= one << i;
+  REQUIRE(indices == macis::bits_to_indices(a));
+}
+
+template <size_t N>
+void lo_word_test() {
+  bs<N> a(0);
+  bs<N / 2> b(0);
+  size_t n_32_words = N / 32;
+  for(int i = 0; i < n_32_words; ++i) {
+    a |= bs<N>(0xDEADBEEF + i) << (i * 32);
+  }
+  for(int i = 0; i < n_32_words / 2; ++i) {
+    b |= bs<N / 2>(0xDEADBEEF + i) << (i * 32);
+  }
+
+  REQUIRE(macis::bitset_lo_word(a) == b);
+
+  bs<N> c(0);
+  macis::set_bitset_lo_word(c, b);
+  REQUIRE((a << N / 2) >> N / 2 == c);
+}
+
+template <size_t N>
+void hi_word_test() {
+  bs<N> a(0);
+  bs<N / 2> b(0);
+  size_t n_32_words = N / 32;
+  for(int i = 0; i < n_32_words; ++i) {
+    a |= bs<N>(0xDEADBEEF + i) << (i * 32);
+  }
+  for(int i = 0; i < n_32_words / 2; ++i) {
+    b |= bs<N / 2>(0xDEADBEEF + (i + n_32_words / 2)) << (i * 32);
+  }
+
+  REQUIRE(macis::bitset_hi_word(a) == b);
+
+  bs<N> c(0);
+  macis::set_bitset_hi_word(c, b);
+  REQUIRE((a >> N / 2) << N / 2 == c);
+}
+
+template <size_t N>
+void bitset_less_test() {
+  bs<N> a(65), b(42);
+  REQUIRE(macis::bitset_less(b, a));
+  REQUIRE_FALSE(macis::bitset_less(a, a));
+  REQUIRE_FALSE(macis::bitset_less(a, b));
+
+  if(N > 32) {
+    auto aa = a << 32;
+    auto bb = b << 32;
+    REQUIRE(macis::bitset_less(bb, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, bb));
+    REQUIRE_FALSE(macis::bitset_less(aa, b));
+    REQUIRE_FALSE(macis::bitset_less(aa, a));
+  }
+
+  if(N > 64) {
+    auto aa = a << 64;
+    auto bb = b << 64;
+    REQUIRE(macis::bitset_less(bb, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, bb));
+    REQUIRE_FALSE(macis::bitset_less(aa, b));
+    REQUIRE_FALSE(macis::bitset_less(aa, a));
+  }
+
+  if(N > 128) {
+    auto aa = a << 128;
+    auto bb = b << 128;
+    REQUIRE(macis::bitset_less(bb, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, aa));
+    REQUIRE_FALSE(macis::bitset_less(aa, bb));
+    REQUIRE_FALSE(macis::bitset_less(aa, b));
+    REQUIRE_FALSE(macis::bitset_less(aa, a));
+  }
+}
 
 TEST_CASE("Bitset Operations") {
   ROOT_ONLY(MPI_COMM_WORLD);
@@ -54,63 +184,34 @@ TEST_CASE("Bitset Operations") {
   }
 
   SECTION("Full Mask") {
-    bs<128> full_128 = ~bs<128>(0);
-    bs<128> full_64 = bs<128>(UINT64_MAX);
-    bs<128> full_32 = bs<128>(UINT32_MAX);
-
-    SECTION("Static") {
-      REQUIRE(full_128 == macis::full_mask<128>());
-      REQUIRE(full_64 == macis::full_mask<64, 128>());
-      REQUIRE(full_32 == macis::full_mask<32, 128>());
-    }
-
-    SECTION("Dynamic") {
-      REQUIRE(full_128 == macis::full_mask<128>(128));
-      REQUIRE(full_64 == macis::full_mask<128>(64));
-      REQUIRE(full_32 == macis::full_mask<128>(32));
-    }
+    SECTION("32") { mask_test<32>(); }
+    SECTION("64") { mask_test<64>(); }
+    SECTION("128") { mask_test<128>(); }
+    SECTION("256") { mask_test<256>(); }
   }
 
   SECTION("FFS") {
-    SECTION("Zero") { REQUIRE(macis::ffs(bs<64>()) == 0); }
-    SECTION("One") { REQUIRE(macis::ffs(bs<64>(1)) == 1); }
-    SECTION("Arbitrary") {
-      bs<64> a(0x0A0A);
-      REQUIRE(macis::ffs(a) == 2);
-      bs<32> b(0x0A0A);
-      REQUIRE(macis::ffs(b) == 2);
-      bs<128> c(0xABC);
-      REQUIRE(macis::ffs(c) == 3);
-      REQUIRE(macis::ffs(c << 64) == 3 + 64);
-      bs<256> d(0xABC);
-      REQUIRE(macis::ffs(d) == 3);
-      REQUIRE(macis::ffs(d << 64) == 3 + 64);
-      REQUIRE(macis::ffs(d << 128) == 3 + 128);
-    }
+    SECTION("32") { ffs_test<32>(); }
+    SECTION("64") { ffs_test<64>(); }
+    SECTION("128") { ffs_test<128>(); }
+    SECTION("256") { ffs_test<256>(); }
   }
 
   SECTION("FLS") {
-    bs<64> a(0x0A0A);
-    REQUIRE(macis::fls(a) == 12 - 1);
-    bs<32> b(0x0A0A);
-    REQUIRE(macis::fls(b) == 12 - 1);
-    bs<128> c(0xDEADBEEF);
-    REQUIRE(macis::fls(c) == 31);
-    REQUIRE(macis::fls(c << 64) == 31 + 64);
-    bs<256> d(0xDEADBEEF);
-    REQUIRE(macis::fls(d) == 31);
-    REQUIRE(macis::fls(d << 64) == 31 + 64);
-    REQUIRE(macis::fls(d << 128) == 31 + 128);
+    SECTION("32") { fls_test<32>(); }
+    SECTION("64") { fls_test<64>(); }
+    SECTION("128") { fls_test<128>(); }
+    SECTION("256") { fls_test<256>(); }
   }
 
   SECTION("Indices") {
-    bs<128> one(1);
-    bs<128> a = (one << 4) | (one << 67) | (one << 118) | (one << 31);
-    auto ind = macis::bits_to_indices(a);
-    std::vector<uint32_t> ref = {4, 31, 67, 118};
-    REQUIRE(ind == ref);
+    SECTION("32") { indices_test<32>(); }
+    SECTION("64") { indices_test<64>(); }
+    SECTION("128") { indices_test<128>(); }
+    SECTION("256") { indices_test<256>(); }
   }
 
+#if 0
   SECTION("Truncate") {
     bs<64> a_64(0xCCCCCCCCDEADDEAD);
     bs<32> ref(0xDEADDEAD);
@@ -122,31 +223,28 @@ TEST_CASE("Bitset Operations") {
     bs<64> ref(0x00000000DEADDEAD);
     REQUIRE(macis::expand_bitset<64>(a_32) == ref);
   }
+#endif
+
+  SECTION("LO WORDS") {
+    SECTION("64") { lo_word_test<64>(); }
+    SECTION("128") { lo_word_test<128>(); }
+    SECTION("256") { lo_word_test<256>(); }
+    SECTION("512") { lo_word_test<512>(); }
+    SECTION("1024") { lo_word_test<1024>(); }
+  }
+
+  SECTION("HI WORDS") {
+    SECTION("64") { hi_word_test<64>(); }
+    SECTION("128") { hi_word_test<128>(); }
+    SECTION("256") { hi_word_test<256>(); }
+    SECTION("512") { hi_word_test<512>(); }
+    SECTION("1024") { hi_word_test<1024>(); }
+  }
 
   SECTION("Compare") {
-    SECTION("32 bit") {
-      bs<32> a(65), b(42);
-      REQUIRE(macis::bitset_less(b, a));
-      REQUIRE_FALSE(macis::bitset_less(a, b));
-    }
-    SECTION("64 bit") {
-      bs<64> a(65ull << 32), b(42ull << 32);
-      REQUIRE(macis::bitset_less(b, a));
-      REQUIRE_FALSE(macis::bitset_less(a, b));
-    }
-    SECTION("128 bit") {
-      bs<128> a(65ull), b(42ull);
-      a = a << 64;
-      b = b << 64;
-      REQUIRE(macis::bitset_less(b, a));
-      REQUIRE_FALSE(macis::bitset_less(a, b));
-    }
-    SECTION("256 bit") {
-      bs<256> a(65ull), b(42ull);
-      a = a << 128;
-      b = b << 128;
-      REQUIRE(macis::bitset_less(b, a));
-      REQUIRE_FALSE(macis::bitset_less(a, b));
-    }
+    SECTION("32") { bitset_less_test<32>(); }
+    SECTION("64") { bitset_less_test<64>(); }
+    SECTION("128") { bitset_less_test<128>(); }
+    SECTION("256") { bitset_less_test<256>(); }
   }
 }
diff --git a/tests/csr_hamiltonian.cxx b/tests/csr_hamiltonian.cxx
index ba3e3a87..5be3cbed 100644
--- a/tests/csr_hamiltonian.cxx
+++ b/tests/csr_hamiltonian.cxx
@@ -11,11 +11,15 @@
 #include <iostream>
 #include <macis/csr_hamiltonian.hpp>
 #include <macis/hamiltonian_generator/double_loop.hpp>
+#include <macis/hamiltonian_generator/sorted_double_loop.hpp>
 #include <macis/util/fcidump.hpp>
 
 #include "ut_common.hpp"
 
-TEST_CASE("CSR Hamiltonian") {
+using wfn_type = macis::wfn_t<64>;
+TEMPLATE_TEST_CASE("CSR Hamiltonian", "[ham_gen]",
+                   macis::DoubleLoopHamiltonianGenerator<wfn_type>,
+                   macis::SortedDoubleLoopHamiltonianGenerator<wfn_type>) {
   ROOT_ONLY(MPI_COMM_WORLD);
 
   size_t norb = macis::read_fcidump_norb(water_ccpvdz_fcidump);
@@ -27,7 +31,8 @@ TEST_CASE("CSR Hamiltonian") {
   macis::read_fcidump_1body(water_ccpvdz_fcidump, T.data(), norb);
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = TestType;
 
 #if 0
   generator_type ham_gen(norb, V.data(), T.data());
@@ -38,12 +43,36 @@ TEST_CASE("CSR Hamiltonian") {
 #endif
 
   // Generate configuration space
-  const auto hf_det = macis::canonical_hf_determinant<64>(nocc, nocc);
+  const auto hf_det = wfn_traits::canonical_hf_determinant(nocc, nocc);
   auto dets = macis::generate_cisd_hilbert_space(norb, hf_det);
+  std::sort(dets.begin(), dets.end(), wfn_traits::spin_comparator{});
 
   // Generate CSR Hamiltonian
+  auto st = std::chrono::high_resolution_clock::now();
   auto H = macis::make_csr_hamiltonian_block<int32_t>(
       dets.begin(), dets.end(), dets.begin(), dets.end(), ham_gen, 1e-16);
+  auto en = std::chrono::high_resolution_clock::now();
+  std::cout << std::chrono::duration<double, std::milli>(en - st).count()
+            << std::endl;
+
+// #define GENERATE_TESTS
+#ifdef GENERATE_TESTS
+  std::string tmp_rowptr_fname = "tmp_rowptr.bin";
+  std::string tmp_colind_fname = "tmp_colind.bin";
+  std::string tmp_nzval_fname = "tmp_nzval.bin";
+
+  std::ofstream rowptr_file(tmp_rowptr_fname, std::ios::binary);
+  rowptr_file.write((char*)H.rowptr().data(),
+                    H.rowptr().size() * sizeof(int32_t));
+  std::ofstream colind_file(tmp_colind_fname, std::ios::binary);
+  colind_file.write((char*)H.colind().data(),
+                    H.colind().size() * sizeof(int32_t));
+  std::ofstream nzval_file(tmp_nzval_fname, std::ios::binary);
+  nzval_file.write((char*)H.nzval().data(), H.nzval().size() * sizeof(double));
+#else
+
+  // std::cout << "NEW H " << H.m() << " " << H.nnz() << " " << H.indexing() <<
+  // std::endl;
 
   // Read reference data
   std::vector<int32_t> ref_rowptr(H.rowptr().size()),
@@ -68,10 +97,13 @@ TEST_CASE("CSR Hamiltonian") {
   for(auto i = 0ul; i < nnz; ++i) {
     REQUIRE(H.nzval()[i] == Approx(ref_nzval[i]));
   }
+#endif
 }
 
 #ifdef MACIS_ENABLE_MPI
-TEST_CASE("Distributed CSR Hamiltonian") {
+TEMPLATE_TEST_CASE("Distributed CSR Hamiltonian", "[ham_gen]",
+                   macis::DoubleLoopHamiltonianGenerator<wfn_type>,
+                   macis::SortedDoubleLoopHamiltonianGenerator<wfn_type>) {
   MPI_Barrier(MPI_COMM_WORLD);
   size_t norb = macis::read_fcidump_norb(water_ccpvdz_fcidump);
   size_t nocc = 5;
@@ -82,7 +114,8 @@ TEST_CASE("Distributed CSR Hamiltonian") {
   macis::read_fcidump_1body(water_ccpvdz_fcidump, T.data(), norb);
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = TestType;
 
 #if 0
   generator_type ham_gen(norb, V.data(), T.data());
@@ -93,8 +126,9 @@ TEST_CASE("Distributed CSR Hamiltonian") {
 #endif
 
   // Generate configuration space
-  const auto hf_det = macis::canonical_hf_determinant<64>(nocc, nocc);
+  const auto hf_det = wfn_traits::canonical_hf_determinant(nocc, nocc);
   auto dets = macis::generate_cisd_hilbert_space(norb, hf_det);
+  std::sort(dets.begin(), dets.end(), wfn_traits::spin_comparator{});
 
   // Generate Distributed CSR Hamiltonian
   auto H_dist = macis::make_dist_csr_hamiltonian<int32_t>(
diff --git a/tests/davidson.cxx b/tests/davidson.cxx
index 64a9394e..db27e85b 100644
--- a/tests/davidson.cxx
+++ b/tests/davidson.cxx
@@ -32,7 +32,9 @@ TEST_CASE("Davidson") {
   macis::read_fcidump_1body(water_ccpvdz_fcidump, T.data(), norb);
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_type = macis::wfn_t<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = macis::DoubleLoopHamiltonianGenerator<wfn_type>;
 
 #if 0
   generator_type ham_gen(norb, V.data(), T.data());
@@ -43,7 +45,7 @@ TEST_CASE("Davidson") {
 #endif
 
   // Generate configuration space
-  const auto hf_det = macis::canonical_hf_determinant<64>(nocc, nocc);
+  const auto hf_det = wfn_traits::canonical_hf_determinant(nocc, nocc);
   auto dets = macis::generate_cisd_hilbert_space(norb, hf_det);
   auto E0_ref = -7.623197835987e+01;
 
@@ -85,7 +87,9 @@ TEST_CASE("Parallel Davidson") {
   macis::read_fcidump_1body(water_ccpvdz_fcidump, T.data(), norb);
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_type = macis::wfn_t<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = macis::DoubleLoopHamiltonianGenerator<wfn_type>;
 
 #if 0
   generator_type ham_gen(norb, V.data(), T.data());
@@ -96,7 +100,7 @@ TEST_CASE("Parallel Davidson") {
 #endif
 
   // Generate configuration space
-  const auto hf_det = macis::canonical_hf_determinant<64>(nocc, nocc);
+  const auto hf_det = wfn_traits::canonical_hf_determinant(nocc, nocc);
   auto dets = macis::generate_cisd_hilbert_space(norb, hf_det);
   auto E0_ref = -7.623197835987e+01;
 
diff --git a/tests/double_loop.cxx b/tests/double_loop.cxx
index 5f326b3b..f1e4b602 100644
--- a/tests/double_loop.cxx
+++ b/tests/double_loop.cxx
@@ -6,6 +6,7 @@
  * See LICENSE.txt for details
  */
 
+#include <blas.hh>
 #include <iomanip>
 #include <iostream>
 #include <macis/hamiltonian_generator/double_loop.hpp>
@@ -28,7 +29,9 @@ TEST_CASE("Double Loop") {
   macis::read_fcidump_1body(water_ccpvdz_fcidump, T.data(), norb);
   macis::read_fcidump_2body(water_ccpvdz_fcidump, V.data(), norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<64>;
+  using wfn_type = macis::wfn_t<64>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = macis::DoubleLoopHamiltonianGenerator<wfn_type>;
 
 #if 0
   generator_type ham_gen(norb, V.data(), T.data());
@@ -37,7 +40,7 @@ TEST_CASE("Double Loop") {
       macis::matrix_span<double>(T.data(), norb, norb),
       macis::rank4_span<double>(V.data(), norb, norb, norb, norb));
 #endif
-  const auto hf_det = macis::canonical_hf_determinant<64>(nocc, nocc);
+  const auto hf_det = wfn_traits::canonical_hf_determinant(nocc, nocc);
 
   std::vector<double> eps(norb);
   for(auto p = 0ul; p < norb; ++p) {
@@ -91,7 +94,7 @@ TEST_CASE("Double Loop") {
     for(size_t i = 0; i < nocc; ++i)
       for(size_t a = nocc; a < norb; ++a) {
         // Generate excited determinant
-        std::bitset<64> state = hf_det;
+        wfn_type state = hf_det;
         state.flip(i).flip(a);
         auto el_1 = ham_gen.matrix_element(hf_det, state);
         auto el_2 = ham_gen.matrix_element(state, hf_det);
@@ -103,7 +106,7 @@ TEST_CASE("Double Loop") {
     for(size_t i = 0; i < nocc; ++i)
       for(size_t a = nocc; a < norb; ++a) {
         // Generate excited determinant
-        std::bitset<64> state = hf_det;
+        wfn_type state = hf_det;
         state.flip(i + 32).flip(a + 32);
         auto el_1 = ham_gen.matrix_element(hf_det, state);
         auto el_2 = ham_gen.matrix_element(state, hf_det);
@@ -161,8 +164,8 @@ TEST_CASE("Double Loop") {
 
   SECTION("RDM") {
     std::vector<double> ordm(norb * norb, 0.0), trdm(norb3 * norb, 0.0);
-    std::vector<std::bitset<64>> dets = {
-        macis::canonical_hf_determinant<64>(nocc, nocc)};
+    std::vector<wfn_type> dets = {
+        wfn_traits::canonical_hf_determinant(nocc, nocc)};
 
     std::vector<double> C = {1.};
 
@@ -194,14 +197,16 @@ TEST_CASE("RDMS") {
   macis::rank4_span<double> V_span(V.data(), norb, norb, norb, norb);
   macis::rank4_span<double> trdm_span(trdm.data(), norb, norb, norb, norb);
 
-  using generator_type = macis::DoubleLoopHamiltonianGenerator<128>;
+  using wfn_type = macis::wfn_t<128>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
+  using generator_type = macis::DoubleLoopHamiltonianGenerator<wfn_type>;
   generator_type ham_gen(T_span, V_span);
 
   auto abs_sum = [](auto a, auto b) { return a + std::abs(b); };
 
   SECTION("HF") {
-    std::vector<std::bitset<128>> dets = {
-        macis::canonical_hf_determinant<128>(nocc, nocc)};
+    std::vector<wfn_type> dets = {
+        wfn_traits::canonical_hf_determinant(nocc, nocc)};
 
     std::vector<double> C = {1.};
 
@@ -224,7 +229,7 @@ TEST_CASE("RDMS") {
   }
 
   SECTION("CI") {
-    std::vector<std::bitset<128>> states;
+    std::vector<wfn_type> states;
     std::vector<double> coeffs;
     macis::read_wavefunction<128>(ch4_wfn_fname, states, coeffs);
 
diff --git a/tests/fcidump.cxx b/tests/fcidump.cxx
index 7f39c11c..5540564c 100644
--- a/tests/fcidump.cxx
+++ b/tests/fcidump.cxx
@@ -8,8 +8,8 @@
 
 #include <iomanip>
 #include <iostream>
+#include <macis/mcscf/orbital_energies.hpp>
 #include <macis/util/fcidump.hpp>
-#include <macis/util/orbital_energies.hpp>
 
 #include "ut_common.hpp"
 
diff --git a/tests/fcidump_to_trexio.cxx b/tests/fcidump_to_trexio.cxx
new file mode 100644
index 00000000..98648678
--- /dev/null
+++ b/tests/fcidump_to_trexio.cxx
@@ -0,0 +1,76 @@
+/*
+ * MACIS Copyright (c) 2023, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+
+#include <iostream>
+#include <macis/util/fcidump.hpp>
+#include <macis/util/trexio.hpp>
+#include <string>
+#include <vector>
+
+int main(int argc, char** argv) {
+  std::vector<std::string> opts(argc);
+  for(int i = 0; i < argc; ++i) {
+    opts[i] = std::string(argv[i]);
+  }
+
+  std::string fcidump_fname = opts.at(1);
+  std::string trexio_fname = opts.at(2);
+
+  std::cout << "FCIDUMP FILE = " << fcidump_fname << std::endl;
+  std::cout << "TREXIO FILE  = " << trexio_fname << std::endl;
+
+  // Read the FCIDUMP file
+  std::cout << "Reading FCIDUMP File..." << std::flush;
+  size_t norb = macis::read_fcidump_norb(fcidump_fname);
+  size_t norb2 = norb * norb;
+  size_t norb3 = norb2 * norb;
+  size_t norb4 = norb2 * norb2;
+
+  std::vector<double> T(norb2), V(norb4);
+  auto E_core = macis::read_fcidump_core(fcidump_fname);
+  macis::read_fcidump_1body(fcidump_fname, T.data(), norb);
+  macis::read_fcidump_2body(fcidump_fname, V.data(), norb);
+
+  std::cout << "Done!" << std::endl;
+
+  // Write TREXIO file
+  std::cout << "Writing TREXIO file..." << std::flush;
+  {
+    macis::TREXIOFile trexio_file(trexio_fname, 'w', TREXIO_HDF5);
+    trexio_file.write_mo_num(norb);
+    trexio_file.write_nucleus_repulsion(E_core);
+    trexio_file.write_mo_1e_int_core_hamiltonian(T.data());
+    trexio_file.write_mo_2e_int_eri(V.data());
+  }
+  std::cout << "Done!" << std::endl;
+
+  std::vector<double> T_chk(norb2), V_chk(norb4);
+  size_t norb_chk;
+  double E_core_chk;
+  std::cout << "Reading TREXIO file..." << std::flush;
+  {
+    macis::TREXIOFile trexio_file(trexio_fname, 'r', TREXIO_AUTO);
+    norb_chk = trexio_file.read_mo_num();
+    E_core_chk = trexio_file.read_nucleus_repulsion();
+    trexio_file.read_mo_1e_int_core_hamiltonian(T_chk.data());
+    trexio_file.read_mo_2e_int_eri(V_chk.data());
+  }
+  std::cout << "Done!" << std::endl;
+
+  std::cout << "NORB_CHECK = " << norb - norb_chk << std::endl;
+  std::cout << "ECOR_CHECK = " << std::abs(E_core - E_core_chk) << std::endl;
+
+  double max_diff = 0;
+  for(auto i = 0; i < norb2; ++i)
+    max_diff = std::max(max_diff, std::abs(T[i] - T_chk[i]));
+  std::cout << "T_CHECK    = " << max_diff << std::endl;
+  max_diff = 0;
+  for(auto i = 0; i < norb2; ++i)
+    max_diff = std::max(max_diff, std::abs(V[i] - V_chk[i]));
+  std::cout << "V_CHECK    = " << max_diff << std::endl;
+}
diff --git a/tests/fock_matrices.cxx b/tests/fock_matrices.cxx
index 75a84a05..f4acdffc 100644
--- a/tests/fock_matrices.cxx
+++ b/tests/fock_matrices.cxx
@@ -7,9 +7,9 @@
  */
 
 #include <iomanip>
+#include <macis/mcscf/fock_matrices.hpp>
 #include <macis/util/detail/rdm_files.hpp>
 #include <macis/util/fcidump.hpp>
-#include <macis/util/fock_matrices.hpp>
 
 #include "ut_common.hpp"
 
diff --git a/tests/mcscf.cxx b/tests/mcscf.cxx
index 7bdd1b80..c928ff1f 100644
--- a/tests/mcscf.cxx
+++ b/tests/mcscf.cxx
@@ -10,9 +10,9 @@
 #include <spdlog/spdlog.h>
 
 #include <iomanip>
+#include <macis/mcscf/mcscf.hpp>
 #include <macis/util/detail/rdm_files.hpp>
 #include <macis/util/fcidump.hpp>
-#include <macis/util/mcscf.hpp>
 
 #include "ut_common.hpp"
 
diff --git a/tests/ref_data/h2o.ccpvdz.cisd.colind.bin b/tests/ref_data/h2o.ccpvdz.cisd.colind.bin
index fee257c3..60de79f7 100644
Binary files a/tests/ref_data/h2o.ccpvdz.cisd.colind.bin and b/tests/ref_data/h2o.ccpvdz.cisd.colind.bin differ
diff --git a/tests/ref_data/h2o.ccpvdz.cisd.nzval.bin b/tests/ref_data/h2o.ccpvdz.cisd.nzval.bin
index 06c86f49..f73ac81c 100644
Binary files a/tests/ref_data/h2o.ccpvdz.cisd.nzval.bin and b/tests/ref_data/h2o.ccpvdz.cisd.nzval.bin differ
diff --git a/tests/ref_data/h2o.ccpvdz.cisd.rowptr.bin b/tests/ref_data/h2o.ccpvdz.cisd.rowptr.bin
index 1142270e..13d1207d 100644
Binary files a/tests/ref_data/h2o.ccpvdz.cisd.rowptr.bin and b/tests/ref_data/h2o.ccpvdz.cisd.rowptr.bin differ
diff --git a/tests/sd_operations.cxx b/tests/sd_operations.cxx
index dd4612a6..ae540c25 100644
--- a/tests/sd_operations.cxx
+++ b/tests/sd_operations.cxx
@@ -16,25 +16,31 @@ TEST_CASE("Slater Det Operations") {
 
   SECTION("HF Determinant") {
     SECTION("Canonical Ordering") {
-      auto hf = macis::canonical_hf_determinant<32>(4, 2);
+      using wfn_type = macis::wfn_t<32>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      auto hf = wfn_traits::canonical_hf_determinant(4, 2);
       REQUIRE(hf == 0x0003000F);
     }
 
+#if 0
     SECTION("Unordered") {
       std::vector<double> orb_energies = {3, 2, 7, 8, 5};
       auto hf = macis::canonical_hf_determinant<32>(3, 2, orb_energies);
       REQUIRE(hf == 0x00030013);
     }
+#endif
   }
 
   SECTION("Occupied / Unoccupied Conversion") {
     SECTION("Default") {
-      std::bitset<64> state = 0x00000000000000AF;
+      using wfn_type = macis::wfn_t<64>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      wfn_type state = 0x00000000000000AF;
       std::vector<uint32_t> ref_occ = {0, 1, 2, 3, 5, 7};
       std::vector<uint32_t> ref_vir = {4, 6, 8, 9, 10, 11};
 
       std::vector<uint32_t> occ, vir;
-      macis::bitset_to_occ_vir(12, state, occ, vir);
+      wfn_traits::state_to_occ_vir(12, state, occ, vir);
 
       REQUIRE(occ == ref_occ);
       REQUIRE(vir == ref_vir);
@@ -45,14 +51,16 @@ TEST_CASE("Slater Det Operations") {
 
   SECTION("Singles") {
     SECTION("Single Spin") {
-      std::vector<std::bitset<64>> ref_singles = {
+      using wfn_type = macis::wfn_t<64>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      std::vector<wfn_type> ref_singles = {
           0b00011110, 0b00101110, 0b01001110, 0b10001110,
           0b00011101, 0b00101101, 0b01001101, 0b10001101,
           0b00011011, 0b00101011, 0b01001011, 0b10001011,
           0b00010111, 0b00100111, 0b01000111, 0b10000111};
 
-      std::vector<std::bitset<64>> singles;
-      auto state = macis::canonical_hf_determinant<64>(4, 0);  // all alpha
+      std::vector<wfn_type> singles;
+      auto state = wfn_traits::canonical_hf_determinant(4, 0);  // all alpha
       macis::generate_singles(8, state, singles);
 
       auto cmp = macis::bitset_less_comparator<64>{};
@@ -63,7 +71,9 @@ TEST_CASE("Slater Det Operations") {
     }
 
     SECTION("Both Spin") {
-      std::vector<std::bitset<64>> ref_singles = {
+      using wfn_type = macis::wfn_t<64>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      std::vector<wfn_type> ref_singles = {
           0x0F0000001E, 0x0F0000001D, 0x0F0000001B, 0x0F00000017, 0x0F0000002E,
           0x0F0000002D, 0x0F0000002B, 0x0F00000027, 0x0F0000004E, 0x0F0000004D,
           0x0F0000004B, 0x0F00000047, 0x0F0000008E, 0x0F0000008D, 0x0F0000008B,
@@ -72,8 +82,8 @@ TEST_CASE("Slater Det Operations") {
           0x4D0000000F, 0x4B0000000F, 0x470000000F, 0x8E0000000F, 0x8D0000000F,
           0x8B0000000f, 0x870000000f};
 
-      auto state = macis::canonical_hf_determinant<64>(4, 4);
-      std::vector<std::bitset<64>> singles;
+      auto state = wfn_traits::canonical_hf_determinant(4, 4);
+      std::vector<wfn_type> singles;
       macis::generate_singles_spin(8, state, singles);
 
       auto cmp = macis::bitset_less_comparator<64>{};
@@ -86,7 +96,9 @@ TEST_CASE("Slater Det Operations") {
 
   SECTION("Doubles") {
     SECTION("Single Spin") {
-      std::vector<std::bitset<64>> ref_doubles = {
+      using wfn_type = macis::wfn_t<64>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      std::vector<wfn_type> ref_doubles = {
           0b00111100, 0b01011100, 0b10011100, 0b01101100, 0b10101100,
           0b11001100, 0b00111010, 0b01011010, 0b10011010, 0b01101010,
           0b10101010, 0b11001010, 0b00111001, 0b01011001, 0b10011001,
@@ -96,8 +108,8 @@ TEST_CASE("Slater Det Operations") {
           0b00110011, 0b01010011, 0b10010011, 0b01100011, 0b10100011,
           0b11000011};
 
-      std::vector<std::bitset<64>> doubles;
-      auto state = macis::canonical_hf_determinant<64>(4, 0);  // all alpha
+      std::vector<wfn_type> doubles;
+      auto state = wfn_traits::canonical_hf_determinant(4, 0);  // all alpha
       macis::generate_doubles(8, state, doubles);
 
       auto cmp = macis::bitset_less_comparator<64>{};
@@ -108,7 +120,9 @@ TEST_CASE("Slater Det Operations") {
     }
 
     SECTION("Both Spins") {
-      std::vector<std::bitset<64>> ref_doubles = {
+      using wfn_type = macis::wfn_t<64>;
+      using wfn_traits = macis::wavefunction_traits<wfn_type>;
+      std::vector<wfn_type> ref_doubles = {
           0x0F0000003C, 0x0F0000003A, 0x0F00000036, 0x0F0000005C, 0x0F0000005A,
           0x0F00000056, 0x0F0000009C, 0x0F0000009A, 0x0F00000096, 0x0F00000039,
           0x0F00000035, 0x0F00000059, 0x0F00000055, 0x0F00000099, 0x0F00000095,
@@ -175,8 +189,8 @@ TEST_CASE("Slater Det Operations") {
           0x1700000087, 0x2E00000087, 0x2D00000087, 0x2B00000087, 0x2700000087,
           0x4E00000087, 0x4D00000087, 0x4B00000087, 0x4700000087, 0x8E00000087,
           0x8D00000087, 0x8B00000087, 0x8700000087};
-      auto state = macis::canonical_hf_determinant<64>(4, 4);
-      std::vector<std::bitset<64>> singles, doubles;
+      auto state = wfn_traits::canonical_hf_determinant(4, 4);
+      std::vector<wfn_type> singles, doubles;
       macis::generate_singles_doubles_spin(8, state, singles, doubles);
 
       auto cmp = macis::bitset_less_comparator<64>{};
@@ -188,8 +202,10 @@ TEST_CASE("Slater Det Operations") {
   }
 
   SECTION("CIS") {
-    auto state = macis::canonical_hf_determinant<64>(4, 4);
-    std::vector<std::bitset<64>> singles;
+    using wfn_type = macis::wfn_t<64>;
+    using wfn_traits = macis::wavefunction_traits<wfn_type>;
+    auto state = wfn_traits::canonical_hf_determinant(4, 4);
+    std::vector<wfn_type> singles;
     macis::generate_singles_spin(8, state, singles);
     singles.push_back(state);
     auto cis = macis::generate_cis_hilbert_space(8, state);
@@ -201,8 +217,10 @@ TEST_CASE("Slater Det Operations") {
   }
 
   SECTION("CISD") {
-    auto state = macis::canonical_hf_determinant<64>(4, 4);
-    std::vector<std::bitset<64>> singles, doubles;
+    using wfn_type = macis::wfn_t<64>;
+    using wfn_traits = macis::wavefunction_traits<wfn_type>;
+    auto state = wfn_traits::canonical_hf_determinant(4, 4);
+    std::vector<wfn_type> singles, doubles;
     macis::generate_singles_doubles_spin(8, state, singles, doubles);
     singles.insert(singles.end(), doubles.begin(), doubles.end());
     singles.push_back(state);
@@ -216,13 +234,13 @@ TEST_CASE("Slater Det Operations") {
 
   SECTION("FCI") {
     SECTION("Single Spin") {
-      std::vector<std::bitset<64>> ref_combs = {0x3, 0x5, 0x9, 0x6, 0xA, 0xC};
-      auto combs = macis::generate_combs<64>(4, 2);
+      std::vector<macis::wfn_t<64>> ref_combs = {0x3, 0x5, 0x9, 0x6, 0xA, 0xC};
+      auto combs = macis::generate_combs<macis::wfn_t<64>>(4, 2);
       REQUIRE(combs == ref_combs);
     }
 
     SECTION("Both Spins") {
-      std::vector<std::bitset<64>> ref_dets = {
+      std::vector<macis::wfn_t<64>> ref_dets = {
           0x300000003, 0x500000003, 0x900000003, 0x600000003, 0xA00000003,
           0xC00000003, 0x300000005, 0x500000005, 0x900000005, 0x600000005,
           0xA00000005, 0xC00000005, 0x300000009, 0x500000009, 0x900000009,
@@ -232,13 +250,16 @@ TEST_CASE("Slater Det Operations") {
           0x30000000C, 0x50000000C, 0x90000000C, 0x60000000C, 0xA0000000C,
           0xC0000000C};
 
-      auto dets = macis::generate_hilbert_space<64>(4, 2, 2);
+      auto dets = macis::generate_hilbert_space<macis::wfn_t<64>>(4, 2, 2);
       REQUIRE(dets == ref_dets);
     }
   }
 
   SECTION("String Conversions") {
-    auto state = macis::canonical_hf_determinant<64>(2, 2).flip(3).flip(4 + 32);
+    using wfn_type = macis::wfn_t<64>;
+    using wfn_traits = macis::wavefunction_traits<wfn_type>;
+    auto state =
+        wfn_traits::canonical_hf_determinant(2, 2).flip(3).flip(4 + 32);
 
     SECTION("To String") {
       auto str = macis::to_canonical_string(state);
@@ -248,13 +269,13 @@ TEST_CASE("Slater Det Operations") {
     SECTION("From String") {
       SECTION("Full String") {
         std::string str = "220ud000000000000000000000000000";
-        auto det = macis::from_canonical_string<64>(str);
+        auto det = macis::from_canonical_string<wfn_type>(str);
         REQUIRE(det == state);
       }
 
       SECTION("Short String") {
         std::string str = "220ud";
-        auto det = macis::from_canonical_string<64>(str);
+        auto det = macis::from_canonical_string<wfn_type>(str);
         REQUIRE(det == state);
       }
     }
diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx
index 44794230..2f44e0e0 100644
--- a/tests/standalone_driver.cxx
+++ b/tests/standalone_driver.cxx
@@ -15,16 +15,19 @@
 #include <iomanip>
 #include <iostream>
 #include <macis/asci/grow.hpp>
+#include <macis/asci/pt2.hpp>
 #include <macis/asci/refine.hpp>
 #include <macis/hamiltonian_generator/double_loop.hpp>
-#include <macis/util/cas.hpp>
+#include <macis/hamiltonian_generator/sorted_double_loop.hpp>
+#include <macis/mcscf/cas.hpp>
+#include <macis/mcscf/fock_matrices.hpp>
 #include <macis/util/detail/rdm_files.hpp>
 #include <macis/util/fcidump.hpp>
-#include <macis/util/fock_matrices.hpp>
 #include <macis/util/memory.hpp>
 #include <macis/util/moller_plesset.hpp>
 #include <macis/util/mpi.hpp>
 #include <macis/util/transform.hpp>
+#include <macis/util/trexio.hpp>
 #include <macis/wavefunction_io.hpp>
 #include <map>
 #include <sparsexx/io/write_dist_mm.hpp>
@@ -61,9 +64,14 @@ int main(int argc, char** argv) {
   spdlog::cfg::load_env_levels();
   spdlog::set_pattern("[%n] %v");
 
-  constexpr size_t nwfn_bits = 64;
+  constexpr size_t nwfn_bits = 256;
+  using wfn_type = macis::wfn_t<nwfn_bits>;
+  using wfn_traits = macis::wavefunction_traits<wfn_type>;
 
-  MACIS_MPI_CODE(MPI_Init(&argc, &argv);)
+  MACIS_MPI_CODE(int dummy;
+                 MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &dummy);
+                 if(dummy != MPI_THREAD_MULTIPLE) throw std::runtime_error(
+                     "MPI Thread Init Failed");)
 
 #ifdef MACIS_ENABLE_MPI
   auto world_rank = macis::comm_rank(MPI_COMM_WORLD);
@@ -84,30 +92,51 @@ int main(int argc, char** argv) {
     auto input_file = opts.at(1);
     INIFile input(input_file);
 
-    // Required Keywords
-    auto fcidump_fname = input.getData<std::string>("CI.FCIDUMP");
-    auto nalpha = input.getData<size_t>("CI.NALPHA");
-    auto nbeta = input.getData<size_t>("CI.NBETA");
-
-    if(nalpha != nbeta) throw std::runtime_error("NALPHA != NBETA");
-
-    // Read FCIDUMP File
-    size_t norb = macis::read_fcidump_norb(fcidump_fname);
-    size_t norb2 = norb * norb;
-    size_t norb3 = norb2 * norb;
-    size_t norb4 = norb2 * norb2;
-
-    // XXX: Consider reading this into shared memory to avoid replication
-    std::vector<double> T(norb2), V(norb4);
-    auto E_core = macis::read_fcidump_core(fcidump_fname);
-    macis::read_fcidump_1body(fcidump_fname, T.data(), norb);
-    macis::read_fcidump_2body(fcidump_fname, V.data(), norb);
-
 #define OPT_KEYWORD(STR, RES, DTYPE) \
   if(input.containsData(STR)) {      \
     RES = input.getData<DTYPE>(STR); \
   }
 
+    // Required Keywords
+    auto nalpha = input.getData<size_t>("CI.NALPHA");
+    auto nbeta = input.getData<size_t>("CI.NBETA");
+
+    std::string reference_data_format =
+        input.getData<std::string>("CI.REF_DATA_FORMAT");
+    std::string reference_data_file =
+        input.getData<std::string>("CI.REF_DATA_FILE");
+
+    size_t norb, norb2, norb3, norb4;
+    std::vector<double> T, V;
+    double E_core;
+    if(reference_data_format == "FCIDUMP") {
+      // Read FCIDUMP File
+      norb = macis::read_fcidump_norb(reference_data_file);
+      norb2 = norb * norb;
+      norb3 = norb2 * norb;
+      norb4 = norb2 * norb2;
+
+      // XXX: Consider reading this into shared memory to avoid replication
+      T.resize(norb2);
+      V.resize(norb4);
+      E_core = macis::read_fcidump_core(reference_data_file);
+      macis::read_fcidump_1body(reference_data_file, T.data(), norb);
+      macis::read_fcidump_2body(reference_data_file, V.data(), norb);
+    } else {  // TREXIO
+      macis::TREXIOFile trexio_file(reference_data_file, 'r', TREXIO_AUTO);
+      norb = trexio_file.read_mo_num();
+      norb2 = norb * norb;
+      norb3 = norb2 * norb;
+      norb4 = norb2 * norb2;
+
+      // XXX: Consider reading this into shared memory to avoid replication
+      T.resize(norb2);
+      V.resize(norb4);
+      E_core = trexio_file.read_nucleus_repulsion();
+      trexio_file.read_mo_1e_int_core_hamiltonian(T.data());
+      trexio_file.read_mo_2e_int_eri(V.data());
+    }
+
     // Set up job
     std::string job_str = "MCSCF";
     OPT_KEYWORD("CI.JOB", job_str, std::string);
@@ -167,9 +196,9 @@ int main(int argc, char** argv) {
 
     // ASCI Settings
     macis::ASCISettings asci_settings;
-    std::string asci_wfn_fname, asci_wfn_out_fname;
+    std::string asci_wfn_fname, asci_wfn_out_fname, asci_ham_out_fname;
     double asci_E0 = 0.0;
-    bool compute_asci_E0 = true;
+    bool compute_asci_E0 = true, pt2 = true;
     OPT_KEYWORD("ASCI.NTDETS_MAX", asci_settings.ntdets_max, size_t);
     OPT_KEYWORD("ASCI.NTDETS_MIN", asci_settings.ntdets_min, size_t);
     OPT_KEYWORD("ASCI.NCDETS_MAX", asci_settings.ncdets_max, size_t);
@@ -190,17 +219,49 @@ int main(int argc, char** argv) {
       asci_E0 = input.getData<double>("ASCI.E0_WFN");
       compute_asci_E0 = false;
     }
+    OPT_KEYWORD("ASCI.HAM_OUT_FILE", asci_ham_out_fname, std::string);
+    OPT_KEYWORD("ASCI.PT2", pt2, bool);
+    OPT_KEYWORD("ASCI.PT2_TOL", asci_settings.pt2_tol, double);
+    OPT_KEYWORD("ASCI.PT2_RESERVE_COUNT", asci_settings.pt2_reserve_count,
+                size_t);
+    OPT_KEYWORD("ASCI.PT2_CONSTRAINT_LVL_MAX",
+                asci_settings.pt2_max_constraint_level, int);
+    OPT_KEYWORD("ASCI.PT2_CONSTRAINT_LVL_MIN",
+                asci_settings.pt2_min_constraint_level, int);
+    OPT_KEYWORD("ASCI.PT2_CNSTRNT_RFNE_FORCE",
+                asci_settings.pt2_constraint_refine_force, int64_t);
+    OPT_KEYWORD("ASCI.PT2_PRUNE", asci_settings.pt2_prune, bool);
+    OPT_KEYWORD("ASCI.PT2_PRECOMPUTE_EPS", asci_settings.pt2_precompute_eps,
+                bool);
+    OPT_KEYWORD("ASCI.PT2_PRECOMPUTE_IDX", asci_settings.pt2_precompute_idx,
+                bool);
+    OPT_KEYWORD("ASCI.PT2_PRINT_PROGRESS", asci_settings.pt2_print_progress,
+                bool);
+    OPT_KEYWORD("ASCI.PT2_BIGCON_THRESH", asci_settings.pt2_bigcon_thresh,
+                size_t);
+    OPT_KEYWORD("ASCI.NXTVAL_BCOUNT_THRESH", asci_settings.nxtval_bcount_thresh,
+                size_t);
+    OPT_KEYWORD("ASCI.NXTVAL_BCOUNT_INC", asci_settings.nxtval_bcount_inc,
+                size_t);
 
     bool mp2_guess = false;
     OPT_KEYWORD("MCSCF.MP2_GUESS", mp2_guess, bool);
 
     if(!world_rank) {
+      console->info("[Standalone MACIS Driver]:");
+      console->info("  * NMPI          = {}", world_size);
+      console->info("  * NTHREADS      = {}", omp_get_max_threads());
       console->info("[Wavefunction Data]:");
-      console->info("  * JOB     = {}", job_str);
-      console->info("  * CIEXP   = {}", ciexp_str);
-      console->info("  * FCIDUMP = {}", fcidump_fname);
+      console->info("  * JOB           = {}", job_str);
+      console->info("  * CIEXP         = {}", ciexp_str);
+      console->info("  * REF_FILE_NAME = {}", reference_data_file);
+      console->info("  * REF_FILE_FMT  = {}", reference_data_format);
       if(fci_out_fname.size())
         console->info("  * FCIDUMP_OUT = {}", fci_out_fname);
+      console->info("  * NORBITAL  = {}", norb);
+      console->info("  * NINACTIVE = {}", n_inactive);
+      console->info("  * NACTIVE   = {}", n_active);
+      console->info("  * NVIRTUAL  = {}", n_virtual);
       console->info("  * MP2_GUESS = {}", mp2_guess);
 
       console->debug("READ {} 1-body integrals and {} 2-body integrals",
@@ -234,6 +295,9 @@ int main(int argc, char** argv) {
 
     // MP2 Guess Orbitals
     if(mp2_guess) {
+      if(nalpha != nbeta)
+        throw std::runtime_error("MP2 Guess only implemented for closed-shell");
+
       console->info("Calculating MP2 Natural Orbitals");
       size_t nocc_canon = n_inactive + nalpha;
       size_t nvir_canon = norb - nocc_canon;
@@ -278,13 +342,12 @@ int main(int argc, char** argv) {
     std::vector<double> active_trdm(active_ordm.size() * active_ordm.size());
 
     double E0 = 0;
-
+    double EPT2 = 0;
     // CI
     if(job == Job::CI) {
-      using generator_t = macis::DoubleLoopHamiltonianGenerator<nwfn_bits>;
+      using generator_t = macis::SortedDoubleLoopHamiltonianGenerator<wfn_type>;
       if(ci_exp == CIExpansion::CAS) {
         std::vector<double> C_local;
-        // TODO: VERIFY MPI + CAS
         E0 = macis::CASRDMFunctor<generator_t>::rdms(
             mcscf_settings, NumOrbital(n_active), nalpha, nbeta,
             T_active.data(), V_active.data(), active_ordm.data(),
@@ -295,10 +358,10 @@ int main(int argc, char** argv) {
           auto det_logger = world_rank
                                 ? spdlog::null_logger_mt("determinants")
                                 : spdlog::stdout_color_mt("determinants");
-          det_logger->info("Print leading determinants > {:.12f}",
+          det_logger->info("Print leading determinants > {:.2e}",
                            determinants_threshold);
-          auto dets = macis::generate_hilbert_space<generator_t::nbits>(
-              n_active, nalpha, nbeta);
+          auto dets =
+              macis::generate_hilbert_space<wfn_type>(n_active, nalpha, nbeta);
           for(size_t i = 0; i < dets.size(); ++i) {
             if(std::abs(C_local[i]) > determinants_threshold) {
               det_logger->info("{:>16.12f}   {}", C_local[i],
@@ -308,13 +371,16 @@ int main(int argc, char** argv) {
         }
 
       } else {
+        // if(nalpha != nbeta)
+        //   throw std::runtime_error("ASCI Only Implemented for Closed-Shell");
+
         // Generate the Hamiltonian Generator
         generator_t ham_gen(
             macis::matrix_span<double>(T_active.data(), n_active, n_active),
             macis::rank4_span<double>(V_active.data(), n_active, n_active,
                                       n_active, n_active));
 
-        std::vector<macis::wfn_t<nwfn_bits>> dets;
+        std::vector<wfn_type> dets;
         std::vector<double> C;
         if(asci_wfn_fname.size()) {
           // Read wave function from standard file
@@ -338,7 +404,7 @@ int main(int argc, char** argv) {
         } else {
           // HF Guess
           console->info("Generating HF Guess for ASCI");
-          dets = {macis::canonical_hf_determinant<nwfn_bits>(nalpha, nalpha)};
+          dets = {wfn_traits::canonical_hf_determinant(nalpha, nbeta)};
           // std::cout << dets[0].to_ullong() << std::endl;
           E0 = ham_gen.matrix_element(dets[0], dets[0]);
           C = {1.0};
@@ -350,13 +416,13 @@ int main(int argc, char** argv) {
         auto asci_st = hrt_t::now();
 
         // Growth phase
-        std::tie(E0, dets, C) = macis::asci_grow(
+        std::tie(E0, dets, C) = macis::asci_grow<nwfn_bits, int64_t>(
             asci_settings, mcscf_settings, E0, std::move(dets), std::move(C),
             ham_gen, n_active MACIS_MPI_CODE(, MPI_COMM_WORLD));
 
         // Refinement phase
         if(asci_settings.max_refine_iter) {
-          std::tie(E0, dets, C) = macis::asci_refine(
+          std::tie(E0, dets, C) = macis::asci_refine<nwfn_bits, int64_t>(
               asci_settings, mcscf_settings, E0, std::move(dets), std::move(C),
               ham_gen, n_active MACIS_MPI_CODE(, MPI_COMM_WORLD));
         }
@@ -367,17 +433,48 @@ int main(int argc, char** argv) {
 
         if(asci_wfn_out_fname.size() and !world_rank) {
           console->info("Writing ASCI Wavefunction to {}", asci_wfn_out_fname);
+          // if(reference_data_format == "TREXIO") {
+          //   console->info("  * Format TREXIO");
+          //   macis::TREXIOFile trexio_file(asci_wfn_out_fname, 'w',
+          //   TREXIO_HDF5); trexio_file.write_mo_num(nwfn_bits/2); // Trick
+          //   TREXIO trexio_file.write_determinant_list(dets.size(),
+          //     reinterpret_cast<int64_t*>(dets.data()));
+          // } else {
+          console->info("  * Format TEXT");
           macis::write_wavefunction(asci_wfn_out_fname, n_active, dets, C);
+          //}
         }
 
         // Dump Hamiltonian
-#if 0
-        if(0) {
+#if 1
+        std::cout << "ASCI HAM FNAME = " << asci_ham_out_fname << std::endl;
+        if(asci_ham_out_fname.size()) {
+          std::cout << "HERE" << std::endl;
           auto H = macis::make_dist_csr_hamiltonian<int64_t>(
               MPI_COMM_WORLD, dets.begin(), dets.end(), ham_gen, 1e-16);
-          sparsexx::write_dist_mm("ham.mtx", H, 1);
+          sparsexx::write_dist_mm(asci_ham_out_fname, H, 1);
         }
 #endif
+        if(pt2) {
+          MPI_Barrier(MPI_COMM_WORLD);
+          auto pt2_st = hrt_t::now();
+          EPT2 = macis::asci_pt2_constraint(
+              asci_settings, dets.begin(), dets.end(),
+              E0 - (E_inactive + E_core), C, n_active, ham_gen.T(),
+              ham_gen.G_red(), ham_gen.V_red(), ham_gen.G(), ham_gen.V(),
+              ham_gen MACIS_MPI_CODE(, MPI_COMM_WORLD));
+          MPI_Barrier(MPI_COMM_WORLD);
+          auto pt2_en = hrt_t::now();
+          dur_t pt2_dur = pt2_en - pt2_st;
+          console->info("* ASCI_PT2_DUR = {:.2e} ms", pt2_dur.count());
+        }
+      }
+
+      console->info("E(CI)     = {:.12f} Eh", E0);
+
+      if(pt2) {
+        console->info("E(PT2)    = {:.12f} Eh", EPT2);
+        console->info("E(CI+PT2) = {:.12f} Eh", E0 + EPT2);
       }
 
       // MCSCF
@@ -410,9 +507,9 @@ int main(int argc, char** argv) {
           NumVirtual(n_virtual), E_core, T.data(), norb, V.data(), norb,
           active_ordm.data(), n_active, active_trdm.data(),
           n_active MACIS_MPI_CODE(, MPI_COMM_WORLD));
-    }
 
-    console->info("E(CI)  = {:.12f} Eh", E0);
+      console->info("E(CASSCF)  = {:.12f} Eh", E0);
+    }
 
     // Write FCIDUMP file if requested
     if(fci_out_fname.size())