Deep Boltzmann machines on MNIST

Example for training a centered Deep Boltzmann machine on the MNIST handwritten digit dataset.

It allows you to reproduce the results from the publication How to Center Deep Boltzmann Machines, Melchior et al., JMLR 2016.
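
In a centered DBM each unit layer is shifted by an offset, typically its mean activation, and the input to a layer is computed from the centered states of its neighbouring layers. A minimal sketch of this idea for a single binary layer, assuming NumPy and sigmoid units (the names x, W, b and mu_x below are illustrative only and not part of the pydeep API):

import numpy as np

def centered_hidden_probabilities(x, W, b, mu_x):
    # The pre-activation is computed from the centered visible states (x - mu_x)
    pre_activation = np.dot(x - mu_x, W) + b
    return 1.0 / (1.0 + np.exp(-pre_activation))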

Results

The code given below produces the following output, which is quite similar to the results produced by an RBM.

The learned filters of the first layer

DBM filters of the first layer on MNIST

The learned filters of the second layer, linearly back projected

DBM filters of the second layer on MNIST

Some generated samples

DBM samples on MNIST

See also RBM_MNIST_big.

Source code

import pydeep.misc.visualization as VIS
import pydeep.misc.io as IO
import pydeep.base.numpyextension as numxExt
from pydeep.dbm.unit_layer import *
from pydeep.dbm.weight_layer import *
from pydeep.dbm.model import *

# Set the same seed value for all algorithms
numx.random.seed(42)

# Load Data
train_data = IO.load_mnist("mnist.pkl.gz", True)[0]

# Set dimensions of layers 1-3
v11 = v12 = 28
v21 = v22 = 10
v31 = v32 = 10
N = v11 * v12
M = v21 * v22
O = v31 * v32
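# i.e. N = 784 visible units, M = 100 units in hidden layer 1, O = 100 units in hidden layer 2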

# Create weight layers, which connect the unit layers
wl1 = Weight_layer(input_dim=N,
                   output_dim=M,
                   initial_weights=0.01,
                   dtype=numx.float64)
wl2 = Weight_layer(input_dim=M,
                   output_dim=O,
                   initial_weights=0.01,
                   dtype=numx.float64)

# Create three unit layers
l1 = Binary_layer(None,
                  wl1,
                  data=train_data,
                  initial_bias='AUTO',
                  initial_offsets='AUTO',
                  dtype=numx.float64)

l2 = Binary_layer(wl1,
                  wl2,
                  data=None,
                  initial_bias='AUTO',
                  initial_offsets='AUTO',
                  dtype=numx.float64)

l3 = Binary_layer(wl2,
                  None,
                  data=None,
                  initial_bias='AUTO',
                  initial_offsets='AUTO',
                  dtype=numx.float64)

# Initialize parameters
max_epochs = 10
batch_size = 20

# Sampling steps for the positive and negative phase
k_d = 3
k_m = 1

# Set individual learning rates
lr_W1 = 0.01
lr_W2 = 0.01
lr_b1 = 0.01
lr_b2 = 0.01
lr_b3 = 0.01
lr_o1 = 0.01
lr_o2 = 0.01
lr_o3 = 0.01
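# lr_W* and lr_b* are the weight and bias learning rates; the lr_o* rates control
# how fast the offsets are shifted towards the mean activations of the corresponding layer.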

# Initialize negative Markov chain
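# For PCD these chains are persistent: they are initialized once at the offset
# values and reused across all parameter updates instead of being restarted
# from the training data.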
x_m = numx.zeros((batch_size, v11 * v12)) + l1.offset
y_m = numx.zeros((batch_size, v21 * v22)) + l2.offset
z_m = numx.zeros((batch_size, v31 * v32)) + l3.offset
chain_m = [x_m, y_m, z_m]

# Reparameterize the model such that the initial setting is the same for centered and normal training
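# Each bias is shifted by minus the offset of the neighbouring layer(s) times the
# connecting weights, compensating for the offsets in the centered energy.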
l1.bias += numx.dot(0.0 - l2.offset, wl1.weights.T)
l2.bias += numx.dot(0.0 - l1.offset, wl1.weights) + numx.dot(0.0 - l3.offset, wl2.weights.T)
l3.bias += numx.dot(0.0 - l2.offset, wl2.weights)

# Finally create model
model = DBM_model([l1, l2, l3])

# Loop over epochs and batches to train the model
for epoch in range(0, max_epochs):
    rec_sum = 0
    for b in range(0, train_data.shape[0], batch_size):
        # Positive Phase

        # Initialize Markov chains with data or offsets
        x_d = train_data[b:b + batch_size, :]
        y_d = numx.zeros((batch_size, M)) + l2.offset
        z_d = numx.zeros((batch_size, O)) + l3.offset
        chain_d = [x_d, y_d, z_d]

        # Run k_d steps of mean field estimation in place, clamping the data units
        model.meanfield(chain_d, k_d, [True, False, False], True)
        # or sample instead
        #model.sample(chain_d, k_d, [True, False, False], True)
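        # Mean field yields a deterministic (variational) estimate of the
        # data-dependent statistics; sampling yields a stochastic estimate.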

        # Negative Phase

        # PCD, sample k_m steps without clamping
        model.sample(chain_m, k_m, [False, False, False], True)

        # Update the model using the sampled states and learning rates
        model.update(chain_d, chain_m, lr_W1, lr_b1, lr_o1)
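        # The centered weight gradient contrasts data and model statistics, roughly
        #   (x_d - offset1)^T (y_d - offset2) / batch_size
        #     - (x_m - offset1)^T (y_m - offset2) / batch_size
        # for the first weight layer, while the offsets are moved towards the
        # current mean activations.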

    # Print Norms of the Parameters
    print(numx.mean(numxExt.get_norms(wl1.weights)), '\t', numx.mean(numxExt.get_norms(wl2.weights)), '\t')
    print(numx.mean(numxExt.get_norms(l1.bias)), '\t', numx.mean(numxExt.get_norms(l2.bias)), '\t')
    print(numx.mean(numxExt.get_norms(l3.bias)), '\t', numx.mean(l1.offset), '\t', numx.mean(l2.offset), '\t', numx.mean(l3.offset))

# Show weights
VIS.imshow_matrix(VIS.tile_matrix_rows(wl1.weights, v11, v12, v21, v22, border_size=1, normalized=False), 'Weights 1')
VIS.imshow_matrix(
    VIS.tile_matrix_rows(numx.dot(wl1.weights, wl2.weights), v11, v12, v31, v32, border_size=1, normalized=False),
    'Weights 2')

# Sample some steps
chain_m = [numx.float64(numx.random.rand(10 * batch_size, v11 * v12) < 0.5),
           numx.float64(numx.random.rand(10 * batch_size, v21 * v22) < 0.5),
           numx.float64(numx.random.rand(10 * batch_size, v31 * v32) < 0.5)]
model.sample(chain_m, 100, [False, False, False], True)
# Get the visible probabilities given the states of the first hidden layer
samples = l1.activation(None, chain_m[1])[0]
VIS.imshow_matrix(VIS.tile_matrix_columns(samples, v11, v12, 10, batch_size, 1, False), 'Samples')

VIS.show()