# mnist_parallel.py
#
# Author         : James Mnatzaganian
# Contact        : http://techtorials.me
# Organization   : NanoComputing Research Lab - Rochester Institute of
#                  Technology
# Website        : https://www.rit.edu/kgcoe/nanolab/
# Date Created   : 12/02/15
#
# Description    : Testing SP with MNIST using parameter optimization and
#                  doing iterations in parallel.
# Python Version : 2.7.X
#
# License        : MIT License http://opensource.org/licenses/mit-license.php
# Copyright      : (c) 2016 James Mnatzaganian

"""
Testing SP with MNIST using parameter optimization and doing iterations in
parallel.

G{packagetree mHTM}
"""

__docformat__ = 'epytext'

# Native imports
import cPickle, os, json, pkgutil

# Third party imports
import numpy as np
from scipy.stats import uniform, randint
from sklearn.grid_search import RandomizedSearchCV
from sklearn.svm import LinearSVC

# Program imports
from mHTM.region import SPRegion
from mHTM.parallel import create_runner, execute_runner, ParamGenerator
from mHTM.datasets.loader import load_mnist, MNISTCV

def main(log_dir, ntrain=800, ntest=200, niter=10, nsplits=5,
    global_inhibition=True, seed=None):
    """
    Build the information needed to perform CV on a subset of the MNIST
    dataset.

    @param log_dir: The directory to store the results in.

    @param ntrain: The number of training samples to use.

    @param ntest: The number of testing samples to use.

    @param niter: The number of parameter iterations to use.

    @param nsplits: The number of splits of the data to use.

    @param global_inhibition: If True, use global inhibition; otherwise, use
    local inhibition.

    @param seed: The seed for the random number generators.

    @return: The full set of X, the full set of Y, the keyword arguments for
    the classifier, the params for CV, and the CV.
    """

    # Get the data
    (tr_x, tr_y), (te_x, te_y) = load_mnist()
    x, y = np.vstack((tr_x, te_x)), np.hstack((tr_y, te_y))
    cv = MNISTCV(tr_y, te_y, ntrain, ntest, nsplits, seed)

    # Create static parameters
    ninputs = tr_x.shape[1]
    kargs = {
        # Region parameters
        'ninputs': ninputs,
        'global_inhibition': global_inhibition,
        'trim': 1e-4,
        'seed': seed,

        # Synapse parameters
        'syn_th': 0.5,
        'random_permanence': True,

        # Fitting parameters
        'nepochs': 30,
        'clf': LinearSVC(random_state=seed)
        # NOTE: The SVMs will be identical, despite being seeded now
    }

    # Come up with some parameters to search
    param_distributions = {
        # Region parameters
        'ncolumns': randint(100, 1001),
        'nactive': uniform(0, 0.2), # As a percentage of the number of columns

        # Column parameters
        'max_boost': randint(1, 21),
        'duty_cycle': randint(10, 1001),

        # Segment parameters
        'nsynapses': randint(1, ninputs + 1),
        'seg_th': uniform(0, 0.1), # As a percentage of the number of synapses

        # Synapse parameters
        'pinc': uniform(0.001, 0.1),
        'pdec': uniform(0.001, 0.1),
        'pwindow': uniform(0.001, 0.1),

        # Fitting parameters
        'log_dir': log_dir
    }

    # Build the parameter generator
    gen = ParamGenerator(param_distributions, niter, nsplits, ninputs)
    params = {key: gen for key in param_distributions}

    return x, y, kargs, params, cv
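# The snippet below is an illustrative sketch (kept commented so it never
# runs on import) of how the objects returned by main() fit together. It
# assumes ParamGenerator exposes the scipy.stats-style rvs() interface that
# main_slurm() relies on below.
#
#   x, y, kargs, params, cv = main('results/demo', seed=123456789)
#   sample = {k: v.rvs() for k, v in sorted(params.items())}  # one draw
#   print sample['ncolumns'], sample['nactive']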
def main_local(log_dir, ntrain=800, ntest=200, niter=5, nsplits=3,
    global_inhibition=True, ncores=4, seed=None):
    """
    Perform CV on a subset of the MNIST dataset. Parallelization is performed
    across the cores of a single local machine.

    @param log_dir: The directory to store the results in.

    @param ntrain: The number of training samples to use.

    @param ntest: The number of testing samples to use.

    @param niter: The number of parameter iterations to use.

    @param nsplits: The number of splits of the data to use.

    @param global_inhibition: If True, use global inhibition; otherwise, use
    local inhibition.

    @param ncores: The number of cores to use.

    @param seed: The seed for the random number generators.
    """

    # Run the initialization
    x, y, kargs, params, cv = main(log_dir, ntrain, ntest, niter, nsplits,
        global_inhibition, seed)

    # Build the classifier for doing CV
    clf = RandomizedSearchCV(
        estimator=SPRegion(**kargs),
        param_distributions=params,
        n_iter=niter,            # Total number of parameter draws
        n_jobs=ncores,           # Use this many cores
        pre_dispatch=1 * ncores, # Give each core one job at a time
        iid=True,                # Data is iid across folds
        cv=cv,                   # The CV split for the data
        refit=False,             # Disable fitting the best estimator on the
                                 # full dataset
        random_state=seed        # Force the same SP across runs
    )

    # Fit the models
    clf.fit(x, y)

    # Extract the CV results
    parameter_names = sorted(clf.grid_scores_[0].parameters.keys())
    parameter_names.pop(parameter_names.index('log_dir'))
    parameter_values = np.zeros((niter, len(parameter_names)))
    results = np.zeros((niter, nsplits))
    for i, score in enumerate(clf.grid_scores_):
        parameter_values[i] = np.array([score.parameters[k] for k in
            parameter_names])
        results[i] = score.cv_validation_scores

    # Save the CV results
    with open(os.path.join(log_dir, 'cv_results.pkl'), 'wb') as f:
        cPickle.dump((parameter_names, parameter_values, results), f,
            cPickle.HIGHEST_PROTOCOL)
    with open(os.path.join(log_dir, 'cv_clf.pkl'), 'wb') as f:
        cPickle.dump((clf.grid_scores_, clf.best_score_, clf.best_params_), f,
            cPickle.HIGHEST_PROTOCOL)
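# Illustrative sketch (kept commented) of reading back the results that
# main_local() pickles above; results has shape (niter, nsplits), so axis 1
# averages across the CV splits.
#
#   with open(os.path.join(log_dir, 'cv_results.pkl'), 'rb') as f:
#       parameter_names, parameter_values, results = cPickle.load(f)
#   print results.mean(1)  # mean validation score per parameter draw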
def main_slurm(log_dir, ntrain=800, ntest=200, niter=5, nsplits=3,
    global_inhibition=True, partition_name='debug', seed=None):
    """
    Perform CV on a subset of the MNIST dataset, using SLURM. Iterations are
    run fully in parallel; splits within an iteration are run sequentially.

    @param log_dir: The directory to store the results in.

    @param ntrain: The number of training samples to use.

    @param ntest: The number of testing samples to use.

    @param niter: The number of parameter iterations to use.

    @param nsplits: The number of splits of the data to use.

    @param global_inhibition: If True, use global inhibition; otherwise, use
    local inhibition.

    @param partition_name: The partition name of the cluster to use.

    @param seed: The seed for the random number generators.
    """

    # Run the initialization
    x, y, kargs, params, cv = main(log_dir, ntrain, ntest, niter, nsplits,
        global_inhibition, seed)

    # Create the runs
    for i in xrange(1, niter + 1):
        # Build the initial params
        param = {k: v.rvs() for k, v in sorted(params.items())}

        # Create the base directory
        dir = param['log_dir']
        splits = os.path.basename(dir).split('-')
        dir = os.path.join(os.path.dirname(dir),
            '-'.join(s for s in splits[:-1]))
        try:
            os.makedirs(dir)
        except OSError:
            pass

        # Dump the CV data
        with open(os.path.join(dir, 'cv.pkl'), 'wb') as f:
            cPickle.dump(list(cv), f, cPickle.HIGHEST_PROTOCOL)

        # Build the full params
        for k, v in kargs.items():
            if k != 'clf': # Add the classifier later
                param[k] = v

        # Dump the params as JSON
        s = json.dumps(param, sort_keys=True, indent=4,
            separators=(',', ': ')).replace('},', '},\n')
        with open(os.path.join(dir, 'config.json'), 'wb') as f:
            f.write(s)

        # Create the runner
        mnist_runner_path = os.path.join(
            pkgutil.get_loader('mHTM.examples').filename, 'mnist_runner.py')
        command = 'python "{0}" "{1}"'.format(mnist_runner_path, dir)
        runner_path = os.path.join(dir, 'runner.sh')
        job_name = str(i)
        stdio_path = os.path.join(dir, 'stdio.txt')
        stderr_path = os.path.join(dir, 'stderr.txt')
        create_runner(command=command, runner_path=runner_path,
            job_name=job_name, partition_name=partition_name,
            stdio_path=stdio_path, stderr_path=stderr_path)

        # Execute the runner
        execute_runner(runner_path)

if __name__ == '__main__':
    ntrain, ntest, niter, nsplits, ncores = 800, 200, 1000, 5, 32
    global_inhibition, partition_name, seed = True, 'work', 123456789
    log_dir = 'results/partial_mnist-global'
    np.random.seed(seed) # To ensure consistency

    # Run on local
    # main_local(log_dir, ntrain, ntest, niter, nsplits, global_inhibition,
    #     ncores, seed)

    # Run on cluster
    global_inhibition = True
    log_dir = 'results/partial_mnist-global'
    main_slurm(log_dir, ntrain, ntest, niter, nsplits, global_inhibition,
        partition_name, seed=seed)

    global_inhibition = False
    log_dir = 'results/partial_mnist-local'
    main_slurm(log_dir, ntrain, ntest, niter, nsplits, global_inhibition,
        partition_name, seed=seed)
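# For reference, each per-iteration directory created by main_slurm() holds
# the files written above. The exact directory name depends on the 'log_dir'
# values produced by ParamGenerator, so this layout is illustrative only:
#
#   <iteration directory>/
#       cv.pkl       - pickled CV train/test splits
#       config.json  - sampled SP parameters for the iteration
#       runner.sh    - generated SLURM batch script
#       stdio.txt    - job stdout
#       stderr.txt   - job stderr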