Writing a CNN for cifar-10
Alex Krizhevsky, Vinod Nair and Geoffrey Hinton collected a data set called CIFAR-10, named after the Canadian Institute for Advanced Research. It is a subset of the 80 Million Tiny Images data set and contains only ten classes, each image showing a single object in the center.
The images are 32px x 32px x 3 color channels (rgb) of the following classes: Airplanes, cars, birds, cats, deer, dogs, frogs, horses, ships and trucks.
I used a Kaggle kernel with the new GPU feature.
Loading the data set
One can download CIFAR-10 (163 MB) from Alex Krizhevsky’s website.
It consists of 50,000 training and 10,000 test images.
The following boilerplate code loads the images (and labels) into memory, takes 1,000 images from the training set to create a validation set, and normalizes all three splits by subtracting the mean training image.
# Human-readable CIFAR-10 class names.
# NOTE(review): presumably ordered so that index i names integer label i — confirm against the dataset meta file.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)
def load_pickle(f):
    """Unpickle and return one object from the open binary file ``f``.

    On Python 3 the stream is decoded with ``encoding='latin1'``, which the
    ``pickle`` documentation requires for NumPy arrays that were pickled by
    Python 2.

    Raises:
        ValueError: if the interpreter's major version is neither 2 nor 3.
    """
    # SECURITY: pickle can execute arbitrary code while loading. Only call
    # this on batch files obtained from a trusted source.
    version = platform.python_version_tuple()
    major = version[0]
    if major == '2':
        return pickle.load(f)
    if major == '3':
        return pickle.load(f, encoding='latin1')
    raise ValueError("invalid python version: {}".format(version))
def load_CIFAR_batch(filename):
    """Load a single pickled CIFAR batch file.

    Args:
        filename: path to one batch file; the unpickled object must provide
            a ``'data'`` array and a ``'labels'`` sequence.

    Returns:
        ``(X, Y)`` where ``X`` is a float array reshaped to
        ``(N, 32, 32, 3)`` and ``Y`` is an array built from the labels.
    """
    with open(filename, 'rb') as f:
        datadict = load_pickle(f)
    # Rows are reshaped to (3, 32, 32) and the channel axis is then moved
    # last. -1 lets NumPy infer the image count instead of assuming exactly
    # 10000 images per file.
    X = np.asarray(datadict['data'])
    X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
    Y = np.array(datadict['labels'])
    return X, Y
def load_CIFAR10(ROOT):
    """Load the five training batches and the test batch found under ``ROOT``.

    Args:
        ROOT: directory containing ``data_batch_1`` .. ``data_batch_5`` and
            ``test_batch``.

    Returns:
        ``(Xtr, Ytr, Xte, Yte)``: the concatenated training images/labels and
        the test images/labels, as returned by ``load_CIFAR_batch``.
    """
    xs = []
    ys = []
    for b in range(1, 6):
        f = os.path.join(ROOT, 'data_batch_%d' % (b, ))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    # Drop every reference to the per-batch arrays (the lists included, which
    # otherwise keep them alive) before the test batch is loaded.
    del X, Y, xs, ys
    Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
    return Xtr, Ytr, Xte, Yte
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=10000,
                     cifar10_dir='../input/cifar-10-python/cifar-10-python/cifar-10-batches-py'):
    """Load CIFAR-10, split off a validation set and subtract the mean image.

    Args:
        num_training: number of leading training images kept for training.
        num_validation: number of images directly after the training slice
            used for validation.
        num_test: number of leading test images kept.
        cifar10_dir: directory holding the pickled batch files; defaults to
            the Kaggle input path used previously.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``; the image
        arrays have the training-set mean image subtracted.

    Raises:
        IndexError: if the requested split sizes exceed the loaded data.
    """
    # Load the raw CIFAR-10 data
    X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)

    val_end = num_training + num_validation
    if val_end > X_train.shape[0] or num_test > X_test.shape[0]:
        # Slices would silently truncate, so keep failing loudly on
        # oversized requests.
        raise IndexError(
            "requested {} train+val and {} test images, but only {} and {} are available"
            .format(val_end, num_test, X_train.shape[0], X_test.shape[0]))

    # Subsample the data. Slices are views, so the large float arrays are
    # not copied; the three ranges do not overlap.
    X_val = X_train[num_training:val_end]
    y_val = y_train[num_training:val_end]
    X_train = X_train[:num_training]
    y_train = y_train[:num_training]
    X_test = X_test[:num_test]
    y_test = y_test[:num_test]

    # Normalize the data: subtract the mean image (computed on the training
    # split only, then applied to every split).
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_val -= mean_image
    X_test -= mean_image
    return X_train, y_train, X_val, y_val, X_test, y_test
Planning the CNN
The target is to get an accuracy of at least 65%, which is the necessary minimum for assignment 2 of the cs231n course.
I am going to use 3x3 convolutional filters because the VGG net uses this size. The stride of the convolution is always one. The architecture is kind of arbitrarily chosen:
- Conv1 (same padding): 32x32x3 -> 32x32x32
- Relu
- Conv2 (same padding): 32x32x32 -> 32x32x64
- Relu
- Conv3 (valid padding): 32x32x64 -> 30x30x128
- Relu
- MaxPool: pool_size=(2,2) and stride=2
- Spatial batch norm
- Affine1: 28800 -> 5760
- Batch norm
- Relu
- Dropout: rate=50%
- Affine2: 5760 -> 1024
- Relu
- Batch norm
- Affine3: 1024 -> 10
- Batch norm
- Softmax
Building the CNN
I used tensorflow to build the explained convolutional neural network.
class CifarNet():
    """Small CNN for CIFAR-10 built with the TensorFlow 1.x graph API.

    ``__init__`` creates the weights, ``forward`` wires the graph and stores
    the logits in ``self.predict``, and ``run`` trains or evaluates the graph
    in mini-batches. ``forward`` must be called before ``run``.
    """

    def __init__(self):
        # 1st conv: 3x3 kernels, 3 -> 32 channels
        self.Wconv1 = tf.get_variable("Wconv1", shape=[3, 3, 3, 32])
        self.bconv1 = tf.get_variable("bconv1", shape=[32])
        # 'SAME' padding in forward(): 32x32 input stays 32x32 (x32 channels)
        # 2nd conv: 3x3 kernels, 32 -> 64 channels
        self.Wconv1_2 = tf.get_variable("Wconv1_2", shape=[3, 3, 32, 64])
        self.bconv1_2 = tf.get_variable("bconv1_2", shape=[64])
        # 'SAME' padding in forward(): still 32x32 (x64 channels)
        # 3rd conv: 3x3 kernels, 64 -> 128 channels
        self.Wconv2 = tf.get_variable("Wconv2", shape=[3, 3, 64, 128])
        self.bconv2 = tf.get_variable("bconv2", shape=[128])
        # 'VALID' padding in forward(): 32x32 shrinks to 30x30x128 = 115200
        # => 2x2 max pooling with stride 2
        # size 15x15x128 = 28800
        # affine layer: 28800 -> 5760
        self.W1 = tf.get_variable("W1", shape=[28800, 5760])
        self.b1 = tf.get_variable("b1", shape=[5760])
        # affine layer: 5760 -> 1024
        self.W2_1 = tf.get_variable("W2_1", shape=[5760, 1024])
        # graph name now matches the attribute (it was registered as "b2_2")
        self.b2_1 = tf.get_variable("b2_1", shape=[1024])
        # affine layer: 1024 -> 10
        self.W2 = tf.get_variable("W2", shape=[1024, 10])
        self.b2 = tf.get_variable("b2", shape=[10])

    def forward(self, X, y, is_training):
        """Build the forward graph and return the logits tensor.

        Args:
            X: image batch tensor; the reshape to 28800 features below
                implies 32x32 spatial input.
            y: label tensor; not used to build the logits, only remembered so
                that ``run`` can feed it.
            is_training: boolean tensor that switches batch norm and dropout
                between training and inference behavior.

        Returns:
            The logits tensor, also stored as ``self.predict``.
        """
        # Remember the input tensors so run() can feed them without relying
        # on module-level globals of the same name.
        self.X = X
        self.y = y
        self.is_training = is_training

        unit_strides = [1, 1, 1, 1]
        conv1 = tf.nn.conv2d(X, self.Wconv1,
                             strides=unit_strides, padding='SAME') + self.bconv1
        relu1 = tf.nn.relu(conv1)
        conv1_1 = tf.nn.conv2d(relu1, self.Wconv1_2,
                               strides=unit_strides, padding='SAME') + self.bconv1_2
        relu1_1 = tf.nn.relu(conv1_1)
        conv2 = tf.nn.conv2d(relu1_1, self.Wconv2,
                             strides=unit_strides, padding='VALID') + self.bconv2
        relu2 = tf.nn.relu(conv2)
        maxpool = tf.layers.max_pooling2d(relu2, pool_size=(2, 2), strides=2)
        maxpool_flat = tf.reshape(maxpool, [-1, 28800])
        # Batch normalization with trainable scale and centering.
        # NOTE(review): this runs on the flattened tensor, so it normalizes
        # each of the 28800 features rather than each channel ("spatial").
        bn1 = tf.layers.batch_normalization(inputs=maxpool_flat, center=True, scale=True, training=is_training)
        affine1 = tf.matmul(bn1, self.W1) + self.b1
        # normal batch norm
        bn2 = tf.layers.batch_normalization(inputs=affine1, center=True, scale=True, training=is_training)
        relu3 = tf.nn.relu(bn2)
        # dropout
        drop1 = tf.layers.dropout(inputs=relu3, training=is_training, rate=0.5)
        # Affine layer from 5760 input units to 1024 outputs
        affine2_1 = tf.matmul(drop1, self.W2_1) + self.b2_1
        # NOTE(review): no ReLU is applied between this layer and the next
        # affine layer — confirm whether that is intended.
        bn2_1 = tf.layers.batch_normalization(inputs=affine2_1, center=True, scale=True, training=is_training)
        # Affine layer from 1024 input units to 10 outputs
        affine2 = tf.matmul(bn2_1, self.W2) + self.b2
        # batch norm on the logits
        self.predict = tf.layers.batch_normalization(inputs=affine2, center=True, scale=True, training=is_training)
        return self.predict

    def run(self, session, loss_val, Xd, yd, epochs=1, batch_size=64, print_every=100, training=None, plot_losses=False, isSoftMax=False):
        """Train or evaluate the network on ``(Xd, yd)`` in mini-batches.

        Args:
            session: TensorFlow session used to execute the graph.
            loss_val: scalar loss tensor to evaluate for every batch.
            Xd: array of input images, indexed along axis 0.
            yd: array of labels aligned with ``Xd``.
            epochs: number of passes over the data.
            batch_size: number of samples per mini-batch.
            print_every: in training mode, print progress every this many
                iterations.
            training: optional training op; when given it is run for every
                batch and the network is fed ``is_training=True``.
            plot_losses: if true, plot the per-batch losses after each epoch.
            isSoftMax: if true, the softmax of the logits is fetched in place
                of the per-sample correctness tensor.

        Returns:
            ``(total_loss, total_correct)`` of the last epoch: the mean loss
            and the fraction of correct predictions over ``Xd``.
        """
        # have tensorflow compute accuracy
        if isSoftMax:
            # NOTE(review): this yields probabilities, not a correctness
            # mask, so the "accuracy" statistics below are not meaningful in
            # this mode — confirm intent.
            correct_prediction = tf.nn.softmax(self.predict)
        else:
            correct_prediction = tf.equal(tf.argmax(self.predict, 1), self.y)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        # shuffle indices (once; the same order is reused for every epoch)
        num_samples = Xd.shape[0]
        train_indicies = np.arange(num_samples)
        np.random.shuffle(train_indicies)

        # training state
        training_now = training is not None

        # Tensors to fetch per batch. The loss comes from the loss_val
        # argument; when training, the last fetch is the training op.
        variables = [loss_val, correct_prediction, accuracy]
        if training_now:
            variables[-1] = training

        # Integer ceiling division: identical on Python 2 and 3.
        num_batches = (num_samples + batch_size - 1) // batch_size

        # counter
        iter_cnt = 0
        for e in range(epochs):
            # keep track of losses and accuracy
            correct = 0
            losses = []
            # make sure we iterate over the dataset once
            for i in range(num_batches):
                # generate indices for the batch
                start_idx = (i * batch_size) % num_samples
                idx = train_indicies[start_idx:start_idx + batch_size]

                # create a feed dictionary for this batch
                feed_dict = {self.X: Xd[idx, :],
                             self.y: yd[idx],
                             self.is_training: training_now}
                # the last batch may be smaller than batch_size
                actual_batch_size = yd[idx].shape[0]

                # have tensorflow compute loss and correct predictions
                # and (if given) perform a training step
                loss, corr, _ = session.run(variables, feed_dict=feed_dict)

                # aggregate performance stats
                losses.append(loss * actual_batch_size)
                correct += np.sum(corr)

                # print every now and then
                if training_now and (iter_cnt % print_every) == 0:
                    # One template string, so .format fills all three fields.
                    print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"
                          .format(iter_cnt, loss, np.sum(corr) / float(actual_batch_size)))
                iter_cnt += 1
            total_correct = correct / float(num_samples)
            total_loss = np.sum(losses) / float(num_samples)
            print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}"
                  .format(total_loss, total_correct, e + 1))
            if plot_losses:
                plt.plot(losses)
                plt.grid(True)
                plt.title('Epoch {} Loss'.format(e + 1))
                plt.xlabel('minibatch number')
                plt.ylabel('minibatch loss')
                plt.show()
        return total_loss, total_correct
Program the training
# Start from an empty default graph, then declare the inputs the network
# is fed with: image batch, integer labels and the train/inference switch.
tf.reset_default_graph()
X = tf.placeholder(dtype=tf.float32, shape=[None, 32, 32, 3])
y = tf.placeholder(dtype=tf.int64, shape=[None])
is_training = tf.placeholder(dtype=tf.bool)
# Create the weights and wire up the forward pass (logits end up in net.predict).
net = CifarNet()
net.forward(X, y, is_training)
We are using an exponentially decaying learning rate.
# Annealing the learning rate: start at 1e-3 and multiply by 0.96 once per
# completed block of 100000 optimizer steps (staircase decay).
# NOTE(review): the 10-epoch run in this file performs far fewer than 100000
# steps, so the rate presumably never actually decays — confirm decay_steps.
global_step = tf.Variable(0, trainable=False)
starter_learning_rate = 1e-3
exp_learning_rate = tf.train.exponential_decay(
    learning_rate=starter_learning_rate,
    global_step=global_step,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)
The softmax cross entropy with logits
“[m]easures the probability error in discrete classification tasks in which the classes are mutually exclusive (each entry is in exactly one class). For example, each CIFAR-10 image is labeled with one and only one label: an image can be a dog or a truck, but not both.”
says the tensorflow documentation.
# define our loss
# The labels are integer class ids, so use the sparse cross-entropy op: it
# takes them directly (no one-hot expansion) and replaces the dense
# softmax_cross_entropy_with_logits call, which later TF 1.x releases deprecate.
cross_entr_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=net.predict)
# average the per-example losses into one scalar
mean_loss = tf.reduce_mean(cross_entr_loss)
We use Adam as the optimizer because I don’t know of a better one at the moment.
# define our optimizer
optimizer = tf.train.AdamOptimizer(exp_learning_rate)

# batch normalization in tensorflow requires this extra dependency: the ops
# collected under UPDATE_OPS must run together with every training step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    # passing global_step makes minimize() increment it on every step,
    # which is what drives the learning-rate schedule
    train_step = optimizer.minimize(mean_loss, global_step=global_step)
Run the training
Now we can place the training on the free-to-use Kaggle GPU!
# train with 10 epochs
# The session is deliberately left open: the test-set evaluation further
# down reuses it.
sess = tf.Session()
# NOTE(review): the model graph is built before this device scope is
# entered, so the scope presumably only affects ops created inside it (the
# variable initializer) — confirm the intended placement.
with tf.device("/gpu:0") as dev:
    sess.run(tf.global_variables_initializer())
    print('Training')
    net.run(sess, mean_loss, X_train, y_train,
            epochs=10, batch_size=64, print_every=200,
            training=train_step, plot_losses=True)
    print('Validation')
    net.run(sess, mean_loss, X_val, y_val, epochs=1, batch_size=64)
And evaluate on the test set (I had to use batches here too because the whole set would not fit into the memory of the gpu):
# One evaluation pass over the test set in mini-batches of 64; no training
# op is passed, so the weights are not updated.
net.run(sess, mean_loss, X_test, y_test, epochs=1, batch_size=64)
Epoch 1, Overall loss = 0.989 and accuracy of 0.731