diff --git a/blocksparse/ewops.py b/blocksparse/ewops.py
index 36cf61e..a6c77d5 100644
--- a/blocksparse/ewops.py
+++ b/blocksparse/ewops.py
@@ -221,7 +221,7 @@ def dropout(x, keep_prob, mask=None, mask_shape=None):
             size = 1
             for m_dim, x_dim in zip(mask_shape, x.shape.as_list()):
                 # we don't currently support placeholder dims when broadcasting the dropout mask
-                assert m_dim == 1 or m_dim == x_dim, f"incompatible mask_shape: {mask_shape} x.shape: {x.shape}"
+                assert m_dim == 1 or m_dim == x_dim, "incompatible mask_shape: %s x.shape: %s" % (mask_shape, x.shape)
                 size *= m_dim
         else:
             size = 0
@@ -439,4 +439,3 @@ def assign_add(y, x, name=None):
 #     f8 = (1 + frac) * 2**(exp - ebias)
 #     l8 = 2**(exp + frac - ebias)
 #     print("%2d %.3f %9.5f %9.5f" % (exp-ebias, frac, f8, l8))
-
diff --git a/blocksparse/grads.py b/blocksparse/grads.py
index 7d0db79..357db67 100644
--- a/blocksparse/grads.py
+++ b/blocksparse/grads.py
@@ -211,7 +211,7 @@ def gradients(ys, xs, grad_ys=None, stop_grads=None, group_aggregations=8, custo
     for i, dy in enumerate(grad_ys):
         if dy is None:
             # float grads start at ones by default
-            grad_ys[i] = tf.fill(tf.shape(ys[i]), tf.constant(1.0, dtype=ys[i].dtype, name=f"grad_ys_{i}"))
+            grad_ys[i] = tf.fill(tf.shape(ys[i]), tf.constant(1.0, dtype=ys[i].dtype, name="grad_ys_%s" % (i)))
 
     ys_ops = [t.op for t in ys]
     xs_ops = [t.op for t in xs]
@@ -261,7 +261,7 @@
                 else:
                     grad_fn = ops.get_gradient_function(op)
             except LookupError:
-                raise LookupError(f"No gradient defined for operation '{op.name}' (op type: {op.type})")
+                raise LookupError("No gradient defined for operation '%s' (op type: %s)" % (op.name, op.type))
 
             # for any missing input grads, build a zero input of the right dtype/shape
             for i, dy in enumerate(dys):
@@ -273,7 +273,7 @@
             dxs = _AsList(grad_fn(op, *dys))
 
             if len(dxs) != len(op.inputs):
-                raise ValueError(f"Num gradients {len(dxs)} generated for op {op.node_def} do not match num inputs {len(op.inputs)}")
+                raise ValueError("Num gradients %s generated for op %s do not match num inputs %s" % (len(dxs), op.node_def, len(op.inputs)))
 
             #_LogOpGradients(op, dys, dxs)
         else:
@@ -316,4 +316,3 @@
 
     return [_GetGrad(grads, x) for x in xs]
 
-
diff --git a/blocksparse/lstm.py b/blocksparse/lstm.py
index 27efbe5..224fd71 100644
--- a/blocksparse/lstm.py
+++ b/blocksparse/lstm.py
@@ -19,7 +19,11 @@ lstm_gates4_grad_op = _op_module.lstm_gates4_grad
 
 bias_grad_op = _op_module.bias_grad
 
-def fused_lstm_gates(c, *args, bias=None, forget_bias=1.0, name=None):
+def fused_lstm_gates(c, *args, **kwargs):
+    # keyword-only args are Python 3 syntax; recover them from kwargs so the gate tensors passed positionally still land in *args
+    bias        = kwargs.pop("bias", None)
+    forget_bias = kwargs.pop("forget_bias", 1.0)
+    name        = kwargs.pop("name", None)
     # returns c_next, h_next
 
     dev = args[0].op.device.lower()
@@ -297,4 +297,4 @@ def group_lstm_grads(grads, params, scope="grouped_lstm", group_size=None):
 #     with tf.variable_scope(bias_scope, reuse=bias_reuse):
 #         b = tf.get_variable('bias', shape=[4 * width])
 #         if layernorm:
-#             g = tf.get_variable('gain', shape=[4 * width])
\ No newline at end of file
+#             g = tf.get_variable('gain', shape=[4 * width])
diff --git a/blocksparse/matmul.py b/blocksparse/matmul.py
index 087b24a..8df4890 100644
--- a/blocksparse/matmul.py
+++ b/blocksparse/matmul.py
@@ -46,7 +46,7 @@ def get_constant(lut, name):
     #print(name, lut.size)
    #tf_entry = tf.constant(lut, name=name+"_lut")
     with tf.control_dependencies(None):
-        tf_entry = tf.get_variable(f"{name}_lut_{g_lut_idx}", initializer=lut.view(np.int64), trainable=False)
+        tf_entry = tf.get_variable("%s_lut_%s" % (name, g_lut_idx), initializer=lut.view(np.int64), trainable=False)
 
     g_lut_idx += 1
     g_lookup_cache[name].append( (lut, tf_entry) )
@@ -736,14 +736,14 @@ def group_dg_grads(bsmm_dw_op, dw, scope):
     # that takes in the final accumulated dw value
     dg_op = bsmm_dw_op.outputs[0].consumers()[0]
     assert dg_op.type == "BlocksparseMatmulDG"
-    dw, dg = blocksparse_matmul_dg(dw, *dg_op.inputs[1:], name=f"{scope}/BlocksparseMatmulDG")
+    dw, dg = blocksparse_matmul_dg(dw, *dg_op.inputs[1:], name="%s/BlocksparseMatmulDG" % (scope))
 
     # splice old add_n op out of graph
     addn_op = dg_op.outputs[1].consumers()[0]
     addn_ops = list()
     addn_ops.append(addn_op)
     if addn_op.type[0:3] != "Add":
-        raise ValueError(f"bad type: {addn_ops[0].type} Cause: this segment does not share a broadcasted gate.")
+        raise ValueError("bad type: %s Cause: this segment does not share a broadcasted gate." % (addn_ops[0].type))
     elif addn_op.type == "AddN8":
         while True:
             addn_op = addn_op.outputs[0].consumers()[0]
@@ -768,12 +768,12 @@ def group_dg_grads(bsmm_dw_op, dw, scope):
         for i, t in enumerate(dg_consumer.inputs):
             #print(i, t.name)
             if t is addn:
-                #print(f"splicing dg into: {dg_consumer.name} at {i}")
+                #print("splicing dg into: %s at %s" % (dg_consumer.name, i))
                 dg_consumer._update_input(i, dg)
                 found = True
                 break
 
         if not found:
-            print(f"splice failed for {dg_consumer.name}")
+            print("splice failed for %s" % (dg_consumer.name))
 
     return dw
diff --git a/blocksparse/utils.py b/blocksparse/utils.py
index f59eb59..06fff35 100644
--- a/blocksparse/utils.py
+++ b/blocksparse/utils.py
@@ -222,10 +222,10 @@ def bst_conv_layout(input_h=1, input_w=1, filter_h=1, filter_w=1, stride=1, blk_
             break
     assert pad_s >= 0, "Even size filters only work with stride 2."
 
-    print(f"P:{P} Q:{Q} H:{H} W:{W} R:{R} S:{S} std:{stride} pad_r:{pad_r} pad_s:{pad_s}")
+    print("P:%s Q:%s H:%s W:%s R:%s S:%s std:%s pad_r:%s pad_s:%s" % (P, Q, H, W, R, S, stride, pad_r, pad_s))
 
-    assert P*Q % blk_size == 0, f"P:{P} Q:{Q}"
-    assert H*W % blk_size == 0, f"H:{H} W:{W}"
+    assert P*Q % blk_size == 0, "P:%s Q:%s" % (P, Q)
+    assert H*W % blk_size == 0, "H:%s W:%s" % (H, W)
 
     mask_set = set()
     layout = np.zeros((P*Q//blk_size, H*W//blk_size), dtype=np.bool)
@@ -294,10 +294,10 @@ def bst_deconv_layout(output_h=1, output_w=1, filter_h=1, filter_w=1, stride=1,
             break
     assert pad_s >= 0, "Even size filters only work with stride 2."
 
-    print(f"P:{P} Q:{Q} H:{H} W:{W} R:{R} S:{S} std:{stride} pad_r:{pad_r} pad_s:{pad_s}")
+    print("P:%s Q:%s H:%s W:%s R:%s S:%s std:%s pad_r:%s pad_s:%s" % (P, Q, H, W, R, S, stride, pad_r, pad_s))
 
-    assert P*Q % blk_size == 0, f"P:{P} Q:{Q}"
-    assert H*W % blk_size == 0, f"H:{H} W:{W}"
+    assert P*Q % blk_size == 0, "P:%s Q:%s" % (P, Q)
+    assert H*W % blk_size == 0, "H:%s W:%s" % (H, W)
 
     mask_set = set()
     layout = np.zeros((H*W//blk_size, P*Q//blk_size), dtype=np.bool)