diff --git a/blocksparse/ewops.py b/blocksparse/ewops.py
index 36cf61e..a6c77d5 100644
--- a/blocksparse/ewops.py
+++ b/blocksparse/ewops.py
@@ -221,7 +221,7 @@ def dropout(x, keep_prob, mask=None, mask_shape=None):
             size = 1
             for m_dim, x_dim in zip(mask_shape, x.shape.as_list()):
                 # we don't currently support placeholder dims when broadcasting the dropout mask
-                assert m_dim == 1 or m_dim == x_dim, f"incompatible mask_shape: {mask_shape} x.shape: {x.shape}"
+                assert m_dim == 1 or m_dim == x_dim, "incompatible mask_shape: %s x.shape: %s" % (mask_shape, x.shape)
                 size *= m_dim
         else:
             size = 0
@@ -439,4 +439,3 @@ def assign_add(y, x, name=None):
 #     f8 = (1 + frac) * 2**(exp - ebias)
 #     l8 = 2**(exp + frac - ebias)
 #     print("%2d %.3f %9.5f %9.5f" % (exp-ebias, frac, f8, l8))
-
diff --git a/blocksparse/grads.py b/blocksparse/grads.py
index 7d0db79..357db67 100644
--- a/blocksparse/grads.py
+++ b/blocksparse/grads.py
@@ -211,7 +211,7 @@ def gradients(ys, xs, grad_ys=None, stop_grads=None, group_aggregations=8, custo
     for i, dy in enumerate(grad_ys):
         if dy is None:
             # float grads start at ones by default
-            grad_ys[i] = tf.fill(tf.shape(ys[i]), tf.constant(1.0, dtype=ys[i].dtype, name=f"grad_ys_{i}"))
+            grad_ys[i] = tf.fill(tf.shape(ys[i]), tf.constant(1.0, dtype=ys[i].dtype, name="grad_ys_%s" % (i)))
 
     ys_ops = [t.op for t in ys]
     xs_ops = [t.op for t in xs]
@@ -261,7 +261,7 @@
                 else:
                     grad_fn = ops.get_gradient_function(op)
             except LookupError:
-                raise LookupError(f"No gradient defined for operation '{op.name}' (op type: {op.type})")
+                raise LookupError("No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type))
 
             # for any missing input grads, build a zero input of the right dtype/shape
             for i, dy in enumerate(dys):
@@ -273,7 +273,7 @@
             dxs = _AsList(grad_fn(op, *dys))
 
             if len(dxs) != len(op.inputs):
-                raise ValueError(f"Num gradients {len(dxs)} generated for op {op.node_def} do not match num inputs {len(op.inputs)}")
+                raise ValueError("Num gradients %s generated for op %s do not match num inputs %s" % (len(dxs), op.node_def, len(op.inputs)))
 
             #_LogOpGradients(op, dys, dxs)
         else:
@@ -316,4 +316,3 @@
 
     return [_GetGrad(grads, x) for x in xs]
 
-
diff --git a/blocksparse/lstm.py b/blocksparse/lstm.py
index 27efbe5..224fd71 100644
--- a/blocksparse/lstm.py
+++ b/blocksparse/lstm.py
@@ -19,7 +19,11 @@ lstm_gates4_grad_op = _op_module.lstm_gates4_grad
 
 bias_grad_op = _op_module.bias_grad
 
-def fused_lstm_gates(c, *args, bias=None, forget_bias=1.0, name=None):
+def fused_lstm_gates(c, *args, **kwargs):
+    # keyword-only args are Python 3 syntax; recover them from kwargs so the gate tensors passed positionally still land in *args
+    bias        = kwargs.pop("bias", None)
+    forget_bias = kwargs.pop("forget_bias", 1.0)
+    name        = kwargs.pop("name", None)
     # returns c_next, h_next
 
     dev = args[0].op.device.lower()
@@ -297,4 +297,4 @@ def group_lstm_grads(grads, params, scope="grouped_lstm", group_size=None):
 #     with tf.variable_scope(bias_scope, reuse=bias_reuse):
 #         b = tf.get_variable('bias', shape=[4 * width])
 #         if layernorm:
-#             g = tf.get_variable('gain', shape=[4 * width])
\ No newline at end of file
+#             g = tf.get_variable('gain', shape=[4 * width])
diff --git a/blocksparse/matmul.py b/blocksparse/matmul.py
index 087b24a..8df4890 100644
--- a/blocksparse/matmul.py
+++ b/blocksparse/matmul.py
@@ -46,7 +46,7 @@ def get_constant(lut, name):
     #print(name, lut.size)
    #tf_entry = tf.constant(lut, name=name+"_lut")
     with tf.control_dependencies(None):
-        tf_entry = tf.get_variable(f"{name}_lut_{g_lut_idx}", initializer=lut.view(np.int64), trainable=False)
+        tf_entry = tf.get_variable("%s_lut_%s" % (name, g_lut_idx), initializer=lut.view(np.int64), trainable=False)
 
     g_lut_idx += 1
     g_lookup_cache[name].append( (lut, tf_entry) )
@@ -736,14 +736,14 @@ def group_dg_grads(bsmm_dw_op, dw, scope):
     # that takes in the final accumulated dw value
     dg_op = bsmm_dw_op.outputs[0].consumers()[0]
     assert dg_op.type == "BlocksparseMatmulDG"
-    dw, dg = blocksparse_matmul_dg(dw, *dg_op.inputs[1:], name=f"{scope}/BlocksparseMatmulDG")
+    dw, dg = blocksparse_matmul_dg(dw, *dg_op.inputs[1:], name="%s/BlocksparseMatmulDG" % (scope))
 
     # splice old add_n op out of graph
     addn_op = dg_op.outputs[1].consumers()[0]
     addn_ops = list()
     addn_ops.append(addn_op)
     if addn_op.type[0:3] != "Add":
-        raise ValueError(f"bad type: {addn_ops[0].type} Cause: this segment does not share a broadcasted gate.")
+        raise ValueError("bad type: %s Cause: this segment does not share a broadcasted gate." % (addn_ops[0].type))
     elif addn_op.type == "AddN8":
         while True:
             addn_op = addn_op.outputs[0].consumers()[0]
@@ -768,12 +768,12 @@ def group_dg_grads(bsmm_dw_op, dw, scope):
         for i, t in enumerate(dg_consumer.inputs):
             #print(i, t.name)
             if t is addn:
-                #print(f"splicing dg into: {dg_consumer.name} at {i}")
+                #print("splicing dg into: %s at %s" % (dg_consumer.name, i))
                 dg_consumer._update_input(i, dg)
                 found = True
                 break
 
         if not found:
-            print(f"splice failed for {dg_consumer.name}")
+            print("splice failed for %s" % (dg_consumer.name))
 
     return dw
diff --git a/blocksparse/utils.py b/blocksparse/utils.py
index f59eb59..06fff35 100644
--- a/blocksparse/utils.py
+++ b/blocksparse/utils.py
@@ -222,10 +222,10 @@ def bst_conv_layout(input_h=1, input_w=1, filter_h=1, filter_w=1, stride=1, blk_
             break
     assert pad_s >= 0, "Even size filters only work with stride 2."
 
-    print(f"P:{P} Q:{Q} H:{H} W:{W} R:{R} S:{S} std:{stride} pad_r:{pad_r} pad_s:{pad_s}")
+    print("P:%s Q:%s H:%s W:%s R:%s S:%s std:%s pad_r:%s pad_s:%s" % (P, Q, H, W, R, S, stride, pad_r, pad_s))
 
-    assert P*Q % blk_size == 0, f"P:{P} Q:{Q}"
-    assert H*W % blk_size == 0, f"H:{H} W:{W}"
+    assert P*Q % blk_size == 0, "P:%s Q:%s" % (P, Q)
+    assert H*W % blk_size == 0, "H:%s W:%s" % (H, W)
 
     mask_set = set()
     layout = np.zeros((P*Q//blk_size, H*W//blk_size), dtype=np.bool)
@@ -294,10 +294,10 @@ def bst_deconv_layout(output_h=1, output_w=1, filter_h=1, filter_w=1, stride=1,
             break
     assert pad_s >= 0, "Even size filters only work with stride 2."
 
-    print(f"P:{P} Q:{Q} H:{H} W:{W} R:{R} S:{S} std:{stride} pad_r:{pad_r} pad_s:{pad_s}")
+    print("P:%s Q:%s H:%s W:%s R:%s S:%s std:%s pad_r:%s pad_s:%s" % (P, Q, H, W, R, S, stride, pad_r, pad_s))
 
-    assert P*Q % blk_size == 0, f"P:{P} Q:{Q}"
-    assert H*W % blk_size == 0, f"H:{H} W:{W}"
+    assert P*Q % blk_size == 0, "P:%s Q:%s" % (P, Q)
+    assert H*W % blk_size == 0, "H:%s W:%s" % (H, W)
 
     mask_set = set()
     layout = np.zeros((H*W//blk_size, P*Q//blk_size), dtype=np.bool)