From 289c35f36ca9ed9933c36508c0e0fc4185e9cf12 Mon Sep 17 00:00:00 2001
From: Ryo Miyajima
Date: Tue, 22 Mar 2016 16:26:41 +0900
Subject: [PATCH 1/2] works with a large learning rate

---
 mnist.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mnist.py b/mnist.py
index 195eccb..1a88b4e 100644
--- a/mnist.py
+++ b/mnist.py
@@ -43,6 +43,13 @@ def conv2d(x, W):
 def max_pool_2x2(x):
   return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1,2,2,1], padding='SAME')
 
+def batch_normalization(shape, input):
+  eps = 1e-5
+  gamma = weight_variable([shape])
+  beta = weight_variable([shape])
+  mean, variance = tf.nn.moments(input, [0])
+  return gamma * (input - mean) / tf.sqrt(variance + eps) + beta
+
 def inference(images, keep_pl):
   # FIXME: deprecated documentation
   """Build the MNIST model up to where it may be used for inference.
@@ -58,21 +65,21 @@ def inference(images, keep_pl):
 
   with tf.name_scope('first_convolutional_layer') as scope:
     W_conv1 = weight_variable([5, 5, 1, 32])
-    b_conv1 = bias_variable([32])
-    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
-    h_pool1 = max_pool_2x2(h_conv1)
+    h_conv1 = conv2d(x_image, W_conv1)
+    bn1 = batch_normalization(32, h_conv1)
+    h_pool1 = max_pool_2x2(tf.nn.relu(bn1))
 
   with tf.name_scope('second_convolutional_layer') as scope:
     W_conv2 = weight_variable([5, 5, 32, 64])
-    b_conv2 = bias_variable([64])
-    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
-    h_pool2 = max_pool_2x2(h_conv2)
+    h_conv2 = conv2d(h_pool1, W_conv2)
+    bn2 = batch_normalization(64, h_conv2)
+    h_pool2 = max_pool_2x2(tf.nn.relu(bn2))
 
   with tf.name_scope('densely_connected_layer') as scope:
     W_fc1 = weight_variable([7*7*64, 1024])
-    b_fc1 = bias_variable([1024])
     h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
-    h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
+    bn3 = batch_normalization(1024, tf.matmul(h_pool2_flat, W_fc1))
+    h_fc1 = tf.nn.relu(bn3)
 
   with tf.name_scope('dropout') as scope:
     h_fc1_drop = tf.nn.dropout(h_fc1, keep_pl)
@@ -131,7 +138,7 @@ def training(loss):
   # Add a scalar summary for the snapshot loss.
   tf.scalar_summary(loss.op.name, loss)
   # Create the gradient descent optimizer with the given learning rate.
-  optimizer = tf.train.AdamOptimizer(1e-4)
+  optimizer = tf.train.AdamOptimizer(1e-3)
   # Create a variable to track the global step.
   global_step = tf.Variable(0, name='global_step', trainable=False)
   # Use the optimizer to apply the gradients that minimize the loss

From b121b4df50ae13cc785ddf7c2372f17410c33ec6 Mon Sep 17 00:00:00 2001
From: Ryo Miyajima
Date: Tue, 22 Mar 2016 17:20:09 +0900
Subject: [PATCH 2/2] revert learning rate

---
 mnist.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mnist.py b/mnist.py
index 1a88b4e..9a3e709 100644
--- a/mnist.py
+++ b/mnist.py
@@ -138,7 +138,7 @@ def training(loss):
   # Add a scalar summary for the snapshot loss.
   tf.scalar_summary(loss.op.name, loss)
   # Create the gradient descent optimizer with the given learning rate.
-  optimizer = tf.train.AdamOptimizer(1e-3)
+  optimizer = tf.train.AdamOptimizer(1e-4)
   # Create a variable to track the global step.
   global_step = tf.Variable(0, name='global_step', trainable=False)
   # Use the optimizer to apply the gradients that minimize the loss
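
For reference, the batch_normalization helper added in PATCH 1/2 computes per-feature mean and variance over the batch axis (tf.nn.moments(input, [0])), normalizes the activations, and applies a learned scale (gamma) and shift (beta). Below is a minimal NumPy sketch of that same computation, outside the patch; gamma and beta are fixed to 1 and 0 here for clarity, whereas the patch draws them from weight_variable, and the function name batch_normalization_np is illustrative only.

    import numpy as np

    def batch_normalization_np(x, gamma, beta, eps=1e-5):
        # Per-feature mean/variance over the batch axis, mirroring tf.nn.moments(input, [0]).
        mean = x.mean(axis=0)
        variance = x.var(axis=0)
        # Normalize, then apply the learned scale (gamma) and shift (beta).
        return gamma * (x - mean) / np.sqrt(variance + eps) + beta

    # Example: a batch of 4 feature vectors with 32 channels.
    x = np.random.randn(4, 32).astype(np.float32)
    out = batch_normalization_np(x,
                                 gamma=np.ones(32, np.float32),
                                 beta=np.zeros(32, np.float32))
    print(out.mean(axis=0))  # roughly 0 per channel
    print(out.std(axis=0))   # roughly 1 per channel
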