diff --git a/.gitignore b/.gitignore index c874ba890..833050d81 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ *.swp tags gh-pages -site *.tests *.pyc lib/.CC @@ -31,3 +30,4 @@ data/* tool/* config.mk build +site/_site diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..9bfc52407 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,6 @@ +language: c +compiler: clang +before_install: + - sudo apt-get update -qq + - sudo apt-get install -qq libpng-dev libjpeg-dev libblas-dev libgsl0-dev +script: cd lib && ./configure && make && cd ../bin && make && cd ../test && make test diff --git a/COPYING b/COPYING index a671642b0..faa51a011 100644 --- a/COPYING +++ b/COPYING @@ -1,3 +1,5 @@ +Files in directories ./doc, ./samples and ./site are licensed under the Creative Commons Attribution 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by/4.0/. + Copyright (c) 2010, Liu Liu All rights reserved. diff --git a/README.md b/README.md index 19448d133..5ee1014cd 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ Intro ===== +[![Build Status](https://travis-ci.org/liuliu/ccv.png?branch=unstable)](https://travis-ci.org/liuliu/ccv) + Around 2010, when Lian and I were working on our gesture recognition demo, out of the frustration to abstract redundant image preprocessing operations into a set of clean and concise functions, I started to consider moving away from the diff --git a/bin/bbfdetect.c b/bin/bbfdetect.c index c6e56ef2e..72d96cff2 100644 --- a/bin/bbfdetect.c +++ b/bin/bbfdetect.c @@ -25,7 +25,7 @@ int main(int argc, char** argv) for (i = 0; i < seq->rnum; i++) { ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence); } printf("total : %d in time %dms\n", seq->rnum, elapsed_time); ccv_array_free(seq); @@ -51,7 +51,7 @@ int main(int argc, char** argv) for (i = 0; i < seq->rnum; i++) { ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - printf("%s %d %d %d %d %f\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + printf("%s %d %d %d %d %f\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence); } ccv_array_free(seq); ccv_matrix_free(image); diff --git a/bin/cifar-10.c b/bin/cifar-10.c index 9da6abf9a..1552aae5c 100644 --- a/bin/cifar-10.c +++ b/bin/cifar-10.c @@ -14,6 +14,7 @@ int main(int argc, char** argv) .rows = 31, .cols = 31, .channels = 3, + .partition = 1, }, }, .output = { @@ -24,41 +25,44 @@ int main(int argc, char** argv) .border = 2, .strides = 1, .count = 32, + .partition = 1, }, }, }, { - .type = CCV_CONVNET_MAX_POOL, + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, .input = { .matrix = { .rows = 31, .cols = 31, .channels = 32, + .partition = 1, }, }, .output = { - .pool = { + .rnorm = { .size = 3, - .strides = 2, - .border = 0, + .kappa = 1, + .alpha = 1e-4, + .beta = 0.75, }, }, }, { - .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .type = CCV_CONVNET_MAX_POOL, .input = { .matrix = { - .rows = 15, - .cols = 15, + .rows = 31, + .cols = 31, .channels = 32, + .partition = 1, }, }, .output = { - .rnorm = { + .pool = { .size = 3, - .kappa = 1, - .alpha = 0.0001, - .beta = 0.75, + .strides = 2, + .border = 0, }, }, }, @@ -71,6 +75,7 @@ int main(int argc, char** argv) .rows = 15, .cols = 15,
.channels = 32, + .partition = 1, }, }, .output = { @@ -81,41 +86,44 @@ int main(int argc, char** argv) .border = 2, .strides = 1, .count = 32, + .partition = 1, }, }, }, { - .type = CCV_CONVNET_AVERAGE_POOL, + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, .input = { .matrix = { .rows = 15, .cols = 15, .channels = 32, + .partition = 1, }, }, .output = { - .pool = { + .rnorm = { .size = 3, - .strides = 2, - .border = 0, + .kappa = 1, + .alpha = 1e-4, + .beta = 0.75, }, }, }, { - .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .type = CCV_CONVNET_AVERAGE_POOL, .input = { .matrix = { - .rows = 7, - .cols = 7, + .rows = 15, + .cols = 15, .channels = 32, + .partition = 1, }, }, .output = { - .rnorm = { + .pool = { .size = 3, - .kappa = 1, - .alpha = 0.0001, - .beta = 0.75, + .strides = 2, + .border = 0, }, }, }, @@ -128,6 +136,7 @@ int main(int argc, char** argv) .rows = 7, .cols = 7, .channels = 32, + .partition = 1, }, }, .output = { @@ -138,6 +147,7 @@ int main(int argc, char** argv) .border = 2, .strides = 1, .count = 64, + .partition = 1, }, }, }, @@ -148,6 +158,7 @@ int main(int argc, char** argv) .rows = 7, .cols = 7, .channels = 64, + .partition = 1, }, }, .output = { @@ -167,6 +178,7 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 64, + .partition = 1, }, .node = { .count = 3 * 3 * 64, @@ -174,12 +186,13 @@ int main(int argc, char** argv) }, .output = { .full_connect = { + .relu = 0, .count = 10, }, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(1, params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); + ccv_convnet_t* convnet = ccv_convnet_new(1, ccv_size(32, 32), params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); assert(ccv_convnet_verify(convnet, 10) == 0); assert(argc == 5); int num1 = atoi(argv[2]); @@ -195,18 +208,18 @@ int main(int argc, char** argv) { fread(bytes, 32 * 32 + 1, 1, r1); int c = bytes[0]; - ccv_dense_matrix_t* a = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C3, 0, 0); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3] = bytes[j + i * 32 + 1] / 255.0 * 2 - 1; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1]; fread(bytes, 32 * 32, 1, r1); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3 + 1] = bytes[j + i * 32] / 255.0 * 2 - 1; + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32]; fread(bytes, 32 * 32, 1, r1); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3 + 2] = bytes[j + i * 32] / 255.0 * 2 - 1; + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32]; ccv_categorized_t categorized = ccv_categorized(c, a, 0); ccv_array_push(categorizeds, &categorized); } @@ -215,18 +228,18 @@ int main(int argc, char** argv) { fread(bytes, 32 * 32 + 1, 1, r2); int c = bytes[0]; - ccv_dense_matrix_t* a = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C3, 0, 0); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3] = bytes[j + i * 32 + 1] / 255.0 * 2 - 1; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(32, 32, CCV_32F | CCV_C3, 0, 0); + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3] = bytes[j + i * 32 + 1]; fread(bytes, 32 * 32, 1, r2); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3 + 1] = bytes[j + 
i * 32] / 255.0 * 2 - 1; + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3 + 1] = bytes[j + i * 32]; fread(bytes, 32 * 32, 1, r2); - for (i = 0; i < 31; i++) - for (j = 0; j < 31; j++) - a->data.f32[(j + i * 31) * 3 + 2] = bytes[j + i * 32] / 255.0 * 2 - 1; + for (i = 0; i < 32; i++) + for (j = 0; j < 32; j++) + a->data.f32[(j + i * 32) * 3 + 2] = bytes[j + i * 32]; ccv_categorized_t categorized = ccv_categorized(c, a, 0); ccv_array_push(tests, &categorized); } @@ -234,28 +247,28 @@ memset(layer_params, 0, sizeof(layer_params)); layer_params[0].w.decay = 0.005; - layer_params[0].w.learn_rate = 0.0005; + layer_params[0].w.learn_rate = 0.001; layer_params[0].w.momentum = 0.9; layer_params[0].bias.decay = 0; layer_params[0].bias.learn_rate = 0.001; layer_params[0].bias.momentum = 0.9; layer_params[3].w.decay = 0.005; - layer_params[3].w.learn_rate = 0.0005; + layer_params[3].w.learn_rate = 0.001; layer_params[3].w.momentum = 0.9; layer_params[3].bias.decay = 0; layer_params[3].bias.learn_rate = 0.001; layer_params[3].bias.momentum = 0.9; layer_params[6].w.decay = 0.005; - layer_params[6].w.learn_rate = 0.0005; + layer_params[6].w.learn_rate = 0.001; layer_params[6].w.momentum = 0.9; layer_params[6].bias.decay = 0; layer_params[6].bias.learn_rate = 0.001; layer_params[6].bias.momentum = 0.9; layer_params[8].w.decay = 0.01; - layer_params[8].w.learn_rate = 0.0005; + layer_params[8].w.learn_rate = 0.001; layer_params[8].w.momentum = 0.9; layer_params[8].bias.decay = 0; layer_params[8].bias.learn_rate = 0.001; @@ -265,6 +278,8 @@ int main(int argc, char** argv) .max_epoch = 999, .mini_batch = 128, .iterations = 500, + .symmetric = 1, + .color_gain = 0, .layer_params = layer_params, }; ccv_convnet_supervised_train(convnet, categorizeds, tests, "cifar-10.sqlite3", params); diff --git a/bin/cnnclassify.c b/bin/cnnclassify.c new file mode 100644 index 000000000..db21068c9 --- /dev/null +++ b/bin/cnnclassify.c @@ -0,0 +1,119 @@ +#include "ccv.h" +#include <ctype.h> +#include <sys/time.h> + +unsigned int get_current_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000 + tv.tv_usec / 1000; +} + +int main(int argc, char** argv) +{ + assert(argc >= 3); + ccv_enable_default_cache(); + ccv_dense_matrix_t* image = 0; + ccv_read(argv[1], &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); + if (image != 0) + { + ccv_convnet_t* convnet = ccv_convnet_read(0, argv[2]); + ccv_dense_matrix_t* input = 0; + ccv_convnet_input_formation(convnet, image, &input); + ccv_matrix_free(image); + unsigned int elapsed_time = get_current_time(); + ccv_array_t* rank = 0; + ccv_convnet_classify(convnet, &input, 1, &rank, 5, 1); + elapsed_time = get_current_time() - elapsed_time; + int i; + for (i = 0; i < rank->rnum - 1; i++) + { + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(rank, i); + printf("%d %f ", classification->id + 1, classification->confidence); + } + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(rank, rank->rnum - 1); + printf("%d %f\n", classification->id + 1, classification->confidence); + printf("elapsed time %dms\n", elapsed_time); + ccv_array_free(rank); + ccv_matrix_free(input); + ccv_convnet_free(convnet); + } else { + FILE* r = fopen(argv[1], "rt"); + if (argc == 4) + chdir(argv[3]); + if(r) + { + ccv_convnet_t* convnet = ccv_convnet_read(1, argv[2]); + int i, j, k = 0; + ccv_dense_matrix_t* images[32] = { + 0 + }; + size_t len = 1024; + char* file = (char*)malloc(len); + ssize_t
read; + while((read = getline(&file, &len, r)) != -1) + { + while(read > 1 && isspace(file[read - 1])) + read--; + file[read] = 0; + if (images[k % 32] != 0) + ccv_matrix_free(images[k % 32]); + ccv_dense_matrix_t* image = 0; + ccv_read(file, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); + assert(image != 0); + images[k % 32] = 0; + ccv_convnet_input_formation(convnet, image, images + (k % 32)); + ccv_matrix_free(image); + ++k; + if (k % 32 == 0) + { + ccv_array_t* ranks[32] = { + 0 + }; + ccv_convnet_classify(convnet, images, 1, ranks, 5, 32); + for (i = 0; i < 32; i++) + { + for (j = 0; j < ranks[i]->rnum - 1; j++) + { + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(ranks[i], j); + printf("%d %f ", classification->id + 1, classification->confidence); + } + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(ranks[i], ranks[i]->rnum - 1); + printf("%d %f\n", classification->id + 1, classification->confidence); + ccv_array_free(ranks[i]); + } + } + } + if (k % 32 != 0) + { + if (k < 32) // special casing this + for (i = k; i < 32; i++) + images[i] = images[0]; // padding to 32 batch size + ccv_array_t* ranks[32] = { + 0 + }; + ccv_convnet_classify(convnet, images, 1, ranks, 5, 32); + for (i = 0; i < (k % 32); i++) + { + for (j = 0; j < ranks[i]->rnum - 1; j++) + { + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(ranks[i], j); + printf("%d %f ", classification->id + 1, classification->confidence); + } + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(ranks[i], ranks[i]->rnum - 1); + printf("%d %f\n", classification->id + 1, classification->confidence); + ccv_array_free(ranks[i]); + } + for (i = (k % 32); i < 32; i++) + ccv_array_free(ranks[i]); + for (i = 0; i < ccv_min(32, k); i++) + ccv_matrix_free(images[i]); + } + ccv_convnet_free(convnet); + free(file); + fclose(r); + } + } + ccv_drain_cache(); + return 0; +} diff --git a/bin/cnndraw.rb b/bin/cnndraw.rb new file mode 100755 index 000000000..a7e678bab --- /dev/null +++ b/bin/cnndraw.rb @@ -0,0 +1,27 @@ +#!/usr/bin/env ruby + +exit unless ARGV.length == 3 + +words = File.new ARGV[0] +labels = Hash.new +i = 1 + +words.each_line do |line| + word = line.split "," + labels[i.to_s] = word[0] + i = i + 1 +end + +draw = "" +y = 15 +STDIN.each_line do |line| + print line + args = line.split " " + break if args[0] == 'elapsed' + for i in 0..(args.length / 2 - 1) + draw += sprintf("-fill none -strokewidth 1 -stroke DodgerBlue -draw \"rectangle 15,%d,165,%d\" -fill DodgerBlue -draw \"rectangle 15,%d,%d,%d\" -strokewidth 0 -stroke none -fill red -draw 'text 18,%d \"%s\"' ", y, y + 16, y, (args[i * 2 + 1].to_f * 150).to_i + 15, y + 16, y + 13, labels[args[i * 2]]) + y += 31 + end +end + +%x[#{sprintf("convert %s %s%s", ARGV[1], draw, ARGV[2])}] diff --git a/bin/cnnvldtr.rb b/bin/cnnvldtr.rb new file mode 100755 index 000000000..527042fbe --- /dev/null +++ b/bin/cnnvldtr.rb @@ -0,0 +1,22 @@ +#!/usr/bin/env ruby + +exit unless ARGV.length == 2 + +truth = Array.new + +File.new(ARGV[0]).each_line do |line| + truth << line.to_i +end + +miss1 = 0 +miss5 = 0 +i = 0 + +File.new(ARGV[1]).each_line do |line| + args = line.split " " + miss1 += 1 if args[0].to_i != truth[i] + miss5 += 1 if args[0].to_i != truth[i] and args[2].to_i != truth[i] and args[4].to_i != truth[i] and args[6].to_i != truth[i] and args[8].to_i != truth[i] + i += 1 +end + +print ((miss1.to_f / i.to_f * 10000).round / 100.0).to_s + "% (1), " + ((miss5.to_f / i.to_f * 10000).round / 
100.0).to_s + "% (5)\n" diff --git a/bin/cuda/cwc-bench-runtime.cu b/bin/cuda/cwc-bench-runtime.cu new file mode 100644 index 000000000..b9722121a --- /dev/null +++ b/bin/cuda/cwc-bench-runtime.cu @@ -0,0 +1,153 @@ +#undef USE_DISPATCH // nvcc doesn't support libdispatch +extern "C" { +#include "ccv.h" +} +#include +#define CASE_TESTS // so that we don't include public available methods +#include "../lib/cuda/cwc_convnet.cu" +#include "../lib/ccv_convnet.c" + +extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params) +{ + int batch = params.mini_batch; + int i; + const int device_id = 0; + _cwc_convnet_alloc_reserved_both(convnet, batch, 0, params.layer_params); + cwc_convnet_context_t* context = GPU(convnet)->contexts; + for (i = 0; i < convnet->rows * convnet->cols * convnet->channels; i++) + convnet->mean_activity->data.f32[i] = 128; + _cwc_convnet_batch_formation(0, categorizeds, convnet->mean_activity, 0, 0, 0, 0, ccv_size(225, 225), convnet->rows, convnet->cols, convnet->channels, 1000, 0, batch, 0, batch, context->host[device_id].input, context->host[device_id].c); + cudaMemcpy(context->device[device_id].input, context->host[device_id].input, sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch, cudaMemcpyHostToDevice); + + cudaEvent_t overallStart; + cudaEvent_t overallStop; + cudaEventCreate(&overallStart); + cudaEventCreate(&overallStop); + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + float elapsed_time; + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.forward.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.forward.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.forward.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.forward.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.forward.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.forward.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.forward.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.forward.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.forward.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.forward.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.forward.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.forward.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.forward.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.forward.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.forward.z = 32; + cudaEventRecord(overallStart, context->device[device_id].data_stream); + for (i = 0; i < convnet->count; i++) + { + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_layer_forward_propagate(layer, device_id, i, layer->input.matrix.rows, layer->input.matrix.cols, batch, 0, i == 0 ? 
context->device[device_id].input : GPU(convnet)->device[device_id].forwards[i - 1], GPU(convnet)->device[device_id].forwards[i], GPU(convnet)->device[device_id].denoms[i], GPU(convnet)->device[device_id].unit, context); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + if (layer->type == CCV_CONVNET_CONVOLUTIONAL) + printf("%d %d %d, elapsed time for layer %d fprop: %f milliseconds\n", VARY(layer)->convolutional.forward.x, VARY(layer)->convolutional.forward.y, VARY(layer)->convolutional.forward.z, i + 1, elapsed_time); + else + printf("elapsed time for layer %d fprop: %f milliseconds\n", i + 1, elapsed_time); + } + cudaEventRecord(overallStop, context->device[device_id].data_stream); + cudaEventSynchronize(overallStop); + cudaEventElapsedTime(&elapsed_time, overallStart, overallStop); + printf("forward pass %f milliseconds\n", elapsed_time); + + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.backward.coefficient.x = 1; + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.backward.coefficient.y = 3; + VARY(GPU(convnet)->device[device_id].layers + 0)->convolutional.backward.coefficient.z = 1; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.coefficient.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.coefficient.y = 4; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.coefficient.z = 16; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.gradient.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.gradient.y = 6; + VARY(GPU(convnet)->device[device_id].layers + 3)->convolutional.backward.gradient.z = 24; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.coefficient.x = 8; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.coefficient.y = 3; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.coefficient.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.gradient.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.gradient.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 6)->convolutional.backward.gradient.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.coefficient.x = 8; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.coefficient.y = 3; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.coefficient.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.gradient.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.gradient.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 7)->convolutional.backward.gradient.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.coefficient.x = 8; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.coefficient.y = 4; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.coefficient.z = 32; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.gradient.x = 4; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.gradient.y = 8; + VARY(GPU(convnet)->device[device_id].layers + 8)->convolutional.backward.gradient.z = 32; + float* a = 0; + cudaMalloc(&a, sizeof(float) * 1000 * batch); + cudaMemcpy(a, 
GPU(convnet)->device[device_id].forwards[convnet->count - 1], sizeof(float) * 1000 * batch, cudaMemcpyDeviceToDevice); + cudaEventRecord(overallStart, context->device[device_id].data_stream); + for (i = convnet->count - 1; i >= 0; i--) + { + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + ccv_convnet_layer_t* configuration = GPU(convnet)->device[device_id].configurations + i; + cudaEventRecord(start, context->device[device_id].data_stream); + switch (layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + if (context->device[device_id].dor[i]) + { + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); + _cwc_kern_mute_neuron + <<<out_rows * out_cols * layer->net.convolutional.count, batch, 0, context->device[device_id].data_stream>>> + (i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], context->device[device_id].dor[i]); + } + _cwc_convnet_convolutional_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : context->device[device_id].input, GPU(convnet)->device[device_id].backwards[i], configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + assert(cudaGetLastError() == cudaSuccess); + break; + case CCV_CONVNET_FULL_CONNECT: + if (context->device[device_id].dor[i]) + _cwc_kern_mute_neuron + <<<layer->net.full_connect.count, batch, 0, context->device[device_id].data_stream>>> + (i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], context->device[device_id].dor[i]); + _cwc_convnet_full_connect_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : context->device[device_id].input, GPU(convnet)->device[device_id].backwards[i], GPU(convnet)->device[device_id].unit, configuration, context->device[device_id].data_stream, context->device[device_id].data_cublas); + assert(cudaGetLastError() == cudaSuccess); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + _cwc_convnet_rnorm_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : context->device[device_id].input, GPU(convnet)->device[device_id].denoms[i], GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + break; + case CCV_CONVNET_MAX_POOL: + _cwc_convnet_max_pool_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : context->device[device_id].input, GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + break; + case CCV_CONVNET_AVERAGE_POOL: + _cwc_convnet_average_pool_backward_propagate(layer, batch, i == convnet->count - 1 ?
a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + break; + } + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + if (layer->type == CCV_CONVNET_CONVOLUTIONAL) + printf("%d %d %d, %d %d %d, elapsed time for layer %d bprop: %f milliseconds\n", VARY(layer)->convolutional.backward.coefficient.x, VARY(layer)->convolutional.backward.coefficient.y, VARY(layer)->convolutional.backward.coefficient.z, VARY(layer)->convolutional.backward.gradient.x, VARY(layer)->convolutional.backward.gradient.y, VARY(layer)->convolutional.backward.gradient.z, i + 1, elapsed_time); + else + printf("elapsed time for layer %d bprop: %f milliseconds\n", i + 1, elapsed_time); + } + cudaEventRecord(overallStop, context->device[device_id].data_stream); + cudaEventSynchronize(overallStop); + cudaEventElapsedTime(&elapsed_time, overallStart, overallStop); + printf("backward pass %f milliseconds\n", elapsed_time); + cudaEventDestroy(start); + cudaEventDestroy(stop); + cudaEventDestroy(overallStart); + cudaEventDestroy(overallStop); + cudaFree(a); +} diff --git a/bin/cuda/cwc-bench.c b/bin/cuda/cwc-bench.c new file mode 100644 index 000000000..472ac28fe --- /dev/null +++ b/bin/cuda/cwc-bench.c @@ -0,0 +1,950 @@ +#include "ccv.h" +#include <ctype.h> + +void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params); + +int main(int argc, char** argv) +{ + ccv_enable_default_cache(); + assert(argc == 2); + FILE *r = fopen(argv[1], "r"); + char* file = (char*)malloc(1024); + ccv_array_t* categorizeds = ccv_array_new(sizeof(ccv_categorized_t), 64, 0); + size_t len = 1024; + ssize_t read; + while ((read = getline(&file, &len, r)) != -1) + { + while(read > 1 && isspace(file[read - 1])) + read--; + file[read] = 0; + ccv_file_info_t input; + input.filename = (char*)ccmalloc(1024); + strncpy(input.filename, file, 1024); + ccv_categorized_t categorized = ccv_categorized(0, 0, &input); + ccv_array_push(categorizeds, &categorized); + } + fclose(r); + free(file); + /* MattNet parameters */ + ccv_convnet_layer_param_t params[13] = { + // first layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 225, + .cols = 225, + .channels = 3, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 96, + .strides = 2, + .border = 1, + .rows = 7, + .cols = 7, + .channels = 3, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 111, + .cols = 111, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 111, + .cols = 111, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // second layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 55, + .cols = 55, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 256, + .strides = 2, + .border = 1, + .rows = 5, + .cols = 5, + .channels = 96, + .partition = 2, + }, + }, + }, + { + .type = 
CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // third layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 256, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 256, + .partition = 2, + }, + }, + }, + // fourth layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 2, + }, + }, + }, + // fifth layer (convolutional => max pool) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 256, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // sixth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 6, + .cols = 6, + .channels = 256, + .partition = 1, + }, + .node = { + .count = 6 * 6 * 256, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // seventh layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // eighth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 0, + .count = 1000, + }, + }, + }, + }; + /* AlexNet 12 (ImageNet 2012 winner) + ccv_convnet_layer_param_t params[13] = { + // first layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 225, + .cols = 225, + .channels = 3, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 96, + .strides = 4, + .border = 1, + .rows = 11, + .cols = 11, + .channels = 3, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 55, + .cols = 55, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 55, + 
.cols = 55, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // second layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 96, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 256, + .strides = 1, + .border = 2, + .rows = 5, + .cols = 5, + .channels = 96, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // third layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 256, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 256, + .partition = 2, + }, + }, + }, + // fourth layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 2, + }, + }, + }, + // fifth layer (convolutional => max pool) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 256, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 256, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // sixth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 6, + .cols = 6, + .channels = 256, + .partition = 1, + }, + .node = { + .count = 6 * 6 * 256, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // seventh layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // eighth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 0, + .count = 1000, + }, + }, + }, + }; + */ + /* AlexNet 14 (One Weird Trick) + * Note that Alex claimed that this is a one tower model, + * but if this is a true one tower model, it should have + * 11 * 11 * 64 * 3 + 5 * 5 * 64 * 192 + 3 * 3 * 192 * 384 + 3 * 3 * 384 * 384 + 3 * 3 * 384 * 256
+ 6 * 6 * 256 * 4096 + 4096 * 4096 + 4096 * 1000 = 61827776 parameters + * However, AlexNet 12 (ImageNet 2012 winner, the two towers model) has + * 11 * 11 * 96 * 3 + 5 * 5 * 96 * 256 / 2 + 3 * 3 * 256 * 384 + 3 * 3 * 384 * 384 / 2 + 3 * 3 * 384 * 256 / 2 + 6 * 6 * 256 * 4096 + 4096 * 4096 + 4096 * 1000 = 60954656 parameters + * That works out to be (61827776 - 60954656) / 60954656 = 1.4% more parameters + * The One Weird Trick paper claimed to have only 0.2% more parameters, which works out to be around 61076565 parameters + * Thus, the following model, with + * 11 * 11 * 64 * 3 + 5 * 5 * 64 * 192 / 2 + 3 * 3 * 192 * 384 + 3 * 3 * 384 * 384 / 2 + 3 * 3 * 384 * 256 + 6 * 6 * 256 * 4096 + 4096 * 4096 + 4096 * 1000 = 61010624 parameters + * seems to be the closest (and libccv's implementation works out to be roughly 500ms per 128 examples, about 100+ hours for 90 epochs, about the same performance as One Weird Trick's one GPU case) + ccv_convnet_layer_param_t params[13] = { + // first layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 225, + .cols = 225, + .channels = 3, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 64, + .strides = 4, + .border = 1, + .rows = 11, + .cols = 11, + .channels = 3, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 55, + .cols = 55, + .channels = 64, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 55, + .cols = 55, + .channels = 64, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // second layer (convolutional => max pool => rnorm) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 64, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 192, + .strides = 1, + .border = 2, + .rows = 5, + .cols = 5, + .channels = 64, + .partition = 2, + }, + }, + }, + { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 192, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 192, + .partition = 2, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // third layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 192, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 192, + .partition = 2, + }, + }, + }, + // fourth layer (convolutional) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 384, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 2, + }, + }, + }, + // fifth layer (convolutional => max pool) + { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, +
.input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 384, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 256, + .strides = 1, + .border = 1, + .rows = 3, + .cols = 3, + .channels = 384, + .partition = 1, + }, + }, + }, + { + .type = CCV_CONVNET_MAX_POOL, + .input = { + .matrix = { + .rows = 13, + .cols = 13, + .channels = 256, + .partition = 1, + }, + }, + .output = { + .pool = { + .strides = 2, + .size = 3, + .border = 0, + }, + }, + }, + // sixth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 6, + .cols = 6, + .channels = 256, + .partition = 1, + }, + .node = { + .count = 6 * 6 * 256, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // seventh layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 1, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 1, + .count = 4096, + }, + }, + }, + // eighth layer (full connect) + { + .type = CCV_CONVNET_FULL_CONNECT, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 4096, + .cols = 1, + .channels = 1, + .partition = 1, + }, + .node = { + .count = 4096, + }, + }, + .output = { + .full_connect = { + .relu = 0, + .count = 1000, + }, + }, + }, + }; + */ + ccv_convnet_t* convnet = ccv_convnet_new(1, ccv_size(225, 225), params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); + ccv_convnet_verify(convnet, 1000); + ccv_convnet_layer_train_param_t layer_params[13]; + memset(layer_params, 0, sizeof(layer_params)); + int i; + for (i = 0; i < 13; i++) + { + layer_params[i].w.decay = 0.005; + layer_params[i].w.learn_rate = 0.0005; + layer_params[i].w.momentum = 0.9; + layer_params[i].bias.decay = 0; + layer_params[i].bias.learn_rate = 0.001; + layer_params[i].bias.momentum = 0.9; + } + ccv_convnet_train_param_t train_params = { + .max_epoch = 100, + .mini_batch = 128, + .dual_device = 0, + .layer_params = layer_params, + }; + for (i = 0; i < 128; i++) + { + ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); + ccv_dense_matrix_t* image = 0; + ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); + ccv_dense_matrix_t* b = 0; + if (image->rows > 225 && image->cols > 225) + ccv_resample(image, &b, 0, ccv_max(225, (int)(image->rows * 225.0 / image->cols + 0.5)), ccv_max(225, (int)(image->cols * 225.0 / image->rows + 0.5)), CCV_INTER_AREA); + else if (image->rows < 225 || image->cols < 225) + ccv_resample(image, &b, 0, ccv_max(225, (int)(image->rows * 225.0 / image->cols + 0.5)), ccv_max(225, (int)(image->cols * 225.0 / image->rows + 0.5)), CCV_INTER_CUBIC); + else + b = image; + if (b != image) + ccv_matrix_free(image); + ccv_dense_matrix_t* c = 0; + ccv_slice(b, (ccv_matrix_t**)&c, CCV_32F, 0, 0, 225, 225); + ccv_matrix_free(b); + categorized->type = CCV_CATEGORIZED_DENSE_MATRIX; + categorized->matrix = c; + } + cwc_bench_runtime(convnet, categorizeds, train_params); + ccv_disable_cache(); + return 0; +} diff --git a/bin/cuda/cwc-verify-runtime.cu b/bin/cuda/cwc-verify-runtime.cu new file mode 100644 index 000000000..7fb04d065 --- /dev/null +++ b/bin/cuda/cwc-verify-runtime.cu @@ -0,0 +1,666 @@ +#undef USE_DISPATCH // nvcc doesn't support libdispatch +extern "C" { +#include "ccv.h" +} +#include +#define CASE_TESTS // so that we don't include public available 
methods +#include "../lib/cuda/cwc_convnet.cu" +#include "../lib/ccv_convnet.c" + +extern "C" void cwc_verify_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params) +{ + int batch = params.mini_batch; + int i, j; + const int device_id = 0; + _cwc_convnet_alloc_reserved_both(convnet, batch, 0, params.layer_params); + cwc_convnet_context_t* context = GPU(convnet)->contexts; + for (i = 0; i < convnet->rows * convnet->cols * convnet->channels; i++) + convnet->mean_activity->data.f32[i] = 128; + _cwc_convnet_batch_formation(0, categorizeds, convnet->mean_activity, 0, 0, 0, 0, ccv_size(225, 225), convnet->rows, convnet->cols, convnet->channels, 1000, 0, batch, 0, batch, context->host[device_id].input, context->host[device_id].c); + cudaMemcpy(context->device[device_id].input, context->host[device_id].input, sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch, cudaMemcpyHostToDevice); + + ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); + _ccv_convnet_update_zero(update_params); + + // first convolutional layer forward propagate + ccv_convnet_layer_t* first_gpu_layer = GPU(convnet)->device[device_id].layers; + // these are the setups for TITAN, thus, skip the benching phase + VARY(first_gpu_layer)->convolutional.forward.x = 4; + VARY(first_gpu_layer)->convolutional.forward.y = 8; + VARY(first_gpu_layer)->convolutional.forward.z = 32; + cudaEvent_t start; + cudaEvent_t stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_forward_propagate(first_gpu_layer, first_gpu_layer->input.matrix.rows, first_gpu_layer->input.matrix.cols, batch, context->device[device_id].input, GPU(convnet)->device[device_id].forwards[0], context->device[device_id].data_stream); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + float elapsed_time = 0; + cudaEventElapsedTime(&elapsed_time, start, stop); + cudaStreamSynchronize(context->device[device_id].data_stream); + printf("%d %d %d, elapsed time for first convolutional layer fprop: %f milliseconds\n", VARY(first_gpu_layer)->convolutional.forward.x, VARY(first_gpu_layer)->convolutional.forward.y, VARY(first_gpu_layer)->convolutional.forward.z, elapsed_time); + int first_out_rows, first_out_cols, first_out_partition, first_out_channels = first_gpu_layer->net.convolutional.count; + _ccv_convnet_layer_derive_output(first_gpu_layer, first_gpu_layer->input.matrix.rows, first_gpu_layer->input.matrix.cols, &first_out_rows, &first_out_cols, &first_out_partition); + float* first_out = 0; + cudaMallocHost(&first_out, sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch); + cudaMemcpy(first_out, GPU(convnet)->device[device_id].forwards[0], sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate first convolutional layer on GPU\n"); + + // second average pool layer forward propagate + ccv_convnet_layer_t* second_gpu_layer = GPU(convnet)->device[device_id].layers + 1; + _cwc_convnet_average_pool_forward_propagate(second_gpu_layer, second_gpu_layer->input.matrix.rows, second_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[0], GPU(convnet)->device[device_id].forwards[1], context->device[device_id].data_stream); + cudaStreamSynchronize(context->device[device_id].data_stream); + int second_out_rows, second_out_cols, 
second_out_partition, second_out_channels = second_gpu_layer->input.matrix.channels; + _ccv_convnet_layer_derive_output(second_gpu_layer, second_gpu_layer->input.matrix.rows, second_gpu_layer->input.matrix.cols, &second_out_rows, &second_out_cols, &second_out_partition); + float* second_out = 0; + cudaMallocHost(&second_out, sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch); + cudaMemcpy(second_out, GPU(convnet)->device[device_id].forwards[1], sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate second average pool layer on GPU\n"); + + // third convolutional layer forward propagate + ccv_convnet_layer_t* third_gpu_layer = GPU(convnet)->device[device_id].layers + 2; + // these are the setups for TITAN, thus, skip the benching phase + VARY(third_gpu_layer)->convolutional.forward.x = 4; + VARY(third_gpu_layer)->convolutional.forward.y = 8; + VARY(third_gpu_layer)->convolutional.forward.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_forward_propagate(third_gpu_layer, third_gpu_layer->input.matrix.rows, third_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[1], GPU(convnet)->device[device_id].forwards[2], context->device[device_id].data_stream); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, elapsed time for third convolutional layer fprop: %f milliseconds\n", VARY(third_gpu_layer)->convolutional.forward.x, VARY(third_gpu_layer)->convolutional.forward.y, VARY(third_gpu_layer)->convolutional.forward.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + int third_out_rows, third_out_cols, third_out_partition, third_out_channels = third_gpu_layer->net.convolutional.count; + _ccv_convnet_layer_derive_output(third_gpu_layer, third_gpu_layer->input.matrix.rows, third_gpu_layer->input.matrix.cols, &third_out_rows, &third_out_cols, &third_out_partition); + float* third_out = 0; + cudaMallocHost(&third_out, sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch); + cudaMemcpy(third_out, GPU(convnet)->device[device_id].forwards[2], sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate third convolutional layer on GPU\n"); + + // forth average pool layer forward propagate + ccv_convnet_layer_t* forth_gpu_layer = GPU(convnet)->device[device_id].layers + 3; + _cwc_convnet_average_pool_forward_propagate(forth_gpu_layer, forth_gpu_layer->input.matrix.rows, forth_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[2], GPU(convnet)->device[device_id].forwards[3], context->device[device_id].data_stream); + cudaStreamSynchronize(context->device[device_id].data_stream); + int forth_out_rows, forth_out_cols, forth_out_partition, forth_out_channels = forth_gpu_layer->input.matrix.channels; + _ccv_convnet_layer_derive_output(forth_gpu_layer, forth_gpu_layer->input.matrix.rows, forth_gpu_layer->input.matrix.cols, &forth_out_rows, &forth_out_cols, &forth_out_partition); + float* forth_out = 0; + cudaMallocHost(&forth_out, sizeof(float) * forth_out_rows * forth_out_cols * forth_out_channels * batch); + cudaMemcpy(forth_out, GPU(convnet)->device[device_id].forwards[3], sizeof(float) * forth_out_rows * forth_out_cols * forth_out_channels * 
batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate forth average pool layer on GPU\n"); + + // fifth convolutional layer forward propagate + ccv_convnet_layer_t* fifth_gpu_layer = GPU(convnet)->device[device_id].layers + 4; + // these are the setups for TITAN, thus, skip the benching phase + VARY(fifth_gpu_layer)->convolutional.forward.x = 4; + VARY(fifth_gpu_layer)->convolutional.forward.y = 8; + VARY(fifth_gpu_layer)->convolutional.forward.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_forward_propagate(fifth_gpu_layer, fifth_gpu_layer->input.matrix.rows, fifth_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[3], GPU(convnet)->device[device_id].forwards[4], context->device[device_id].data_stream); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, elapsed time for fifth convolutional layer fprop: %f milliseconds\n", VARY(fifth_gpu_layer)->convolutional.forward.x, VARY(fifth_gpu_layer)->convolutional.forward.y, VARY(fifth_gpu_layer)->convolutional.forward.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + int fifth_out_rows, fifth_out_cols, fifth_out_partition, fifth_out_channels = fifth_gpu_layer->net.convolutional.count; + _ccv_convnet_layer_derive_output(fifth_gpu_layer, fifth_gpu_layer->input.matrix.rows, fifth_gpu_layer->input.matrix.cols, &fifth_out_rows, &fifth_out_cols, &fifth_out_partition); + float* fifth_out = 0; + cudaMallocHost(&fifth_out, sizeof(float) * fifth_out_rows * fifth_out_cols * fifth_out_channels * batch); + cudaMemcpy(fifth_out, GPU(convnet)->device[device_id].forwards[4], sizeof(float) * fifth_out_rows * fifth_out_cols * fifth_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate fifth convolutional layer on GPU\n"); + + // sixth convolutional layer forward propagate + ccv_convnet_layer_t* sixth_gpu_layer = GPU(convnet)->device[device_id].layers + 5; + // these are the setups for TITAN, thus, skip the benching phase + VARY(sixth_gpu_layer)->convolutional.forward.x = 4; + VARY(sixth_gpu_layer)->convolutional.forward.y = 8; + VARY(sixth_gpu_layer)->convolutional.forward.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_forward_propagate(sixth_gpu_layer, sixth_gpu_layer->input.matrix.rows, sixth_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[4], GPU(convnet)->device[device_id].forwards[5], context->device[device_id].data_stream); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, elapsed time for sixth convolutional layer fprop: %f milliseconds\n", VARY(sixth_gpu_layer)->convolutional.forward.x, VARY(sixth_gpu_layer)->convolutional.forward.y, VARY(sixth_gpu_layer)->convolutional.forward.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + int sixth_out_rows, sixth_out_cols, sixth_out_partition, sixth_out_channels = sixth_gpu_layer->net.convolutional.count; + _ccv_convnet_layer_derive_output(sixth_gpu_layer, sixth_gpu_layer->input.matrix.rows, sixth_gpu_layer->input.matrix.cols, &sixth_out_rows, &sixth_out_cols, &sixth_out_partition); + float* sixth_out = 0; + cudaMallocHost(&sixth_out, sizeof(float) * sixth_out_rows * sixth_out_cols * sixth_out_channels * 
batch); + cudaMemcpy(sixth_out, GPU(convnet)->device[device_id].forwards[5], sizeof(float) * sixth_out_rows * sixth_out_cols * sixth_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate sixth convolutional layer on GPU\n"); + + // seventh convolutional layer forward propagate + ccv_convnet_layer_t* seventh_gpu_layer = GPU(convnet)->device[device_id].layers + 6; + // these are the setups for TITAN, thus, skip the benching phase + VARY(seventh_gpu_layer)->convolutional.forward.x = 4; + VARY(seventh_gpu_layer)->convolutional.forward.y = 8; + VARY(seventh_gpu_layer)->convolutional.forward.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_forward_propagate(seventh_gpu_layer, seventh_gpu_layer->input.matrix.rows, seventh_gpu_layer->input.matrix.cols, batch, GPU(convnet)->device[device_id].forwards[5], GPU(convnet)->device[device_id].forwards[6], context->device[device_id].data_stream); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, elapsed time for seventh convolutional layer fprop: %f milliseconds\n", VARY(seventh_gpu_layer)->convolutional.forward.x, VARY(seventh_gpu_layer)->convolutional.forward.y, VARY(seventh_gpu_layer)->convolutional.forward.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + int seventh_out_rows, seventh_out_cols, seventh_out_partition, seventh_out_channels = seventh_gpu_layer->net.convolutional.count; + _ccv_convnet_layer_derive_output(seventh_gpu_layer, seventh_gpu_layer->input.matrix.rows, seventh_gpu_layer->input.matrix.cols, &seventh_out_rows, &seventh_out_cols, &seventh_out_partition); + float* seventh_out = 0; + cudaMallocHost(&seventh_out, sizeof(float) * seventh_out_rows * seventh_out_cols * seventh_out_channels * batch); + cudaMemcpy(seventh_out, GPU(convnet)->device[device_id].forwards[6], sizeof(float) * seventh_out_rows * seventh_out_cols * seventh_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished forward propagate seventh convolutional layer on GPU\n"); + + // the last full connect layer forward propagate + ccv_convnet_layer_t* eleventh_gpu_layer = GPU(convnet)->device[device_id].layers + 10; + float* eleventh_in = 0; + cudaMallocHost(&eleventh_in, sizeof(float) * batch * eleventh_gpu_layer->input.node.count); + for (i = 0; i < batch; i++) + for (j = 0; j < eleventh_gpu_layer->input.node.count; j++) + eleventh_in[j * batch + i] = (j - 100 + i) / 200; + cudaMemcpy(GPU(convnet)->device[device_id].forwards[9], eleventh_in, sizeof(float) * batch * eleventh_gpu_layer->input.node.count, cudaMemcpyHostToDevice); + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_full_connect_forward_propagate(eleventh_gpu_layer, 128, GPU(convnet)->device[device_id].forwards[9], GPU(convnet)->device[device_id].forwards[10], GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("elapsed time for eleventh full connect layer fprop: %f milliseconds\n", elapsed_time); + float* eleventh_out = 0; + cudaMallocHost(&eleventh_out, sizeof(float) * batch * eleventh_gpu_layer->net.full_connect.count); + cudaMemcpy(eleventh_out, GPU(convnet)->device[device_id].forwards[10], sizeof(float) * batch * 
eleventh_gpu_layer->net.full_connect.count, cudaMemcpyDeviceToHost); + printf("finished forward propagate eleventh full connect layer on GPU\n"); + + // eleventh full connect layer backward propagate + ccv_convnet_layer_t* eleventh_gpu_configuration = GPU(convnet)->device[device_id].configurations + 10; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_full_connect_backward_propagate(eleventh_gpu_layer, batch, GPU(convnet)->device[device_id].forwards[10], GPU(convnet)->device[device_id].forwards[10], GPU(convnet)->device[device_id].forwards[9], GPU(convnet)->device[device_id].backwards[10], GPU(convnet)->device[device_id].unit, eleventh_gpu_configuration, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("elapsed time for eleventh full connect layer bprop: %f milliseconds\n", elapsed_time); + float* eleventh_back = 0; + cudaMallocHost(&eleventh_back, sizeof(float) * eleventh_gpu_layer->input.node.count * batch); + cudaMemcpy(eleventh_back, GPU(convnet)->device[device_id].backwards[10], sizeof(float) * eleventh_gpu_layer->input.node.count * batch, cudaMemcpyDeviceToHost); + float* eleventh_grad = 0; + cudaMallocHost(&eleventh_grad, sizeof(float) * (eleventh_gpu_layer->wnum + eleventh_gpu_layer->net.full_connect.count)); + assert(eleventh_grad); + cudaMemcpy(eleventh_grad, eleventh_gpu_configuration->w, sizeof(float) * (eleventh_gpu_layer->wnum + eleventh_gpu_layer->net.full_connect.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate eleventh full connect layer on GPU\n"); + + // seventh convolutional layer backward propagate + cudaMemcpy(GPU(convnet)->device[device_id].backwards[7], GPU(convnet)->device[device_id].forwards[6], sizeof(float) * seventh_out_rows * seventh_out_cols * seventh_out_channels * batch, cudaMemcpyDeviceToDevice); + ccv_convnet_layer_t* seventh_gpu_configuration = GPU(convnet)->device[device_id].configurations + 6; + VARY(seventh_gpu_layer)->convolutional.backward.coefficient.x = 8; + VARY(seventh_gpu_layer)->convolutional.backward.coefficient.y = 4; + VARY(seventh_gpu_layer)->convolutional.backward.coefficient.z = 32; + VARY(seventh_gpu_layer)->convolutional.backward.gradient.x = 4; + VARY(seventh_gpu_layer)->convolutional.backward.gradient.y = 8; + VARY(seventh_gpu_layer)->convolutional.backward.gradient.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_backward_propagate(seventh_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[7], GPU(convnet)->device[device_id].forwards[6], GPU(convnet)->device[device_id].forwards[5], GPU(convnet)->device[device_id].backwards[6], seventh_gpu_configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, %d %d %d, elapsed time for seventh convolutional layer bprop: %f milliseconds\n", VARY(seventh_gpu_layer)->convolutional.backward.coefficient.x, VARY(seventh_gpu_layer)->convolutional.backward.coefficient.y, VARY(seventh_gpu_layer)->convolutional.backward.coefficient.z, VARY(seventh_gpu_layer)->convolutional.backward.gradient.x,
VARY(seventh_gpu_layer)->convolutional.backward.gradient.y, VARY(seventh_gpu_layer)->convolutional.backward.gradient.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* seventh_back = 0; + cudaMallocHost(&seventh_back, sizeof(float) * sixth_out_rows * sixth_out_cols * sixth_out_channels * batch); + cudaMemcpy(seventh_back, GPU(convnet)->device[device_id].backwards[6], sizeof(float) * sixth_out_rows * sixth_out_cols * sixth_out_channels * batch, cudaMemcpyDeviceToHost); + float* seventh_grad = 0; + cudaMallocHost(&seventh_grad, sizeof(float) * (seventh_gpu_layer->wnum + seventh_gpu_layer->net.convolutional.count)); + assert(seventh_grad); + cudaMemcpy(seventh_grad, seventh_gpu_configuration->w, sizeof(float) * (seventh_gpu_layer->wnum + seventh_gpu_layer->net.convolutional.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate seventh convolutional layer on GPU\n"); + + // sixth convolutional layer backward propagate + ccv_convnet_layer_t* sixth_gpu_configuration = GPU(convnet)->device[device_id].configurations + 5; + VARY(sixth_gpu_layer)->convolutional.backward.coefficient.x = 8; + VARY(sixth_gpu_layer)->convolutional.backward.coefficient.y = 3; + VARY(sixth_gpu_layer)->convolutional.backward.coefficient.z = 32; + VARY(sixth_gpu_layer)->convolutional.backward.gradient.x = 4; + VARY(sixth_gpu_layer)->convolutional.backward.gradient.y = 8; + VARY(sixth_gpu_layer)->convolutional.backward.gradient.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_backward_propagate(sixth_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[6], GPU(convnet)->device[device_id].forwards[5], GPU(convnet)->device[device_id].forwards[4], GPU(convnet)->device[device_id].backwards[5], sixth_gpu_configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, %d %d %d, elapsed time for sixth convolutional layer bprop: %f milliseconds\n", VARY(sixth_gpu_layer)->convolutional.backward.coefficient.x, VARY(sixth_gpu_layer)->convolutional.backward.coefficient.y, VARY(sixth_gpu_layer)->convolutional.backward.coefficient.z, VARY(sixth_gpu_layer)->convolutional.backward.gradient.x, VARY(sixth_gpu_layer)->convolutional.backward.gradient.y, VARY(sixth_gpu_layer)->convolutional.backward.gradient.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* sixth_back = 0; + cudaMallocHost(&sixth_back, sizeof(float) * fifth_out_rows * fifth_out_cols * fifth_out_channels * batch); + cudaMemcpy(sixth_back, GPU(convnet)->device[device_id].backwards[5], sizeof(float) * fifth_out_rows * fifth_out_cols * fifth_out_channels * batch, cudaMemcpyDeviceToHost); + float* sixth_grad = 0; + cudaMallocHost(&sixth_grad, sizeof(float) * (sixth_gpu_layer->wnum + sixth_gpu_layer->net.convolutional.count)); + assert(sixth_grad); + cudaMemcpy(sixth_grad, sixth_gpu_configuration->w, sizeof(float) * (sixth_gpu_layer->wnum + sixth_gpu_layer->net.convolutional.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate sixth convolutional layer on GPU\n");
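+
+ // editor's note: the seventh layer seeded backwards[7] from a fresh copy of
+ // forwards[6], but the sixth and fifth layers consume the deltas the layer above
+ // just wrote (backwards[6], then backwards[5]), so single-precision differences
+ // compound down the chain -- presumably why the CPU/GPU checks further down relax
+ // their tolerance from 1e-4 to 1e-3 (bprop 6) and 1e-2 (bprop 5).
+ // fifth convolutional layer backward propagate
+ ccv_convnet_layer_t* 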
fifth_gpu_configuration = GPU(convnet)->device[device_id].configurations + 4; + VARY(fifth_gpu_layer)->convolutional.backward.coefficient.x = 8; + VARY(fifth_gpu_layer)->convolutional.backward.coefficient.y = 3; + VARY(fifth_gpu_layer)->convolutional.backward.coefficient.z = 32; + VARY(fifth_gpu_layer)->convolutional.backward.gradient.x = 4; + VARY(fifth_gpu_layer)->convolutional.backward.gradient.y = 8; + VARY(fifth_gpu_layer)->convolutional.backward.gradient.z = 32; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_backward_propagate(fifth_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[5], GPU(convnet)->device[device_id].forwards[4], GPU(convnet)->device[device_id].forwards[3], GPU(convnet)->device[device_id].backwards[4], fifth_gpu_configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, %d %d %d, elapsed time for fifth convolutional layer bprop: %f milliseconds\n", VARY(fifth_gpu_layer)->convolutional.backward.coefficient.x, VARY(fifth_gpu_layer)->convolutional.backward.coefficient.y, VARY(fifth_gpu_layer)->convolutional.backward.coefficient.z, VARY(fifth_gpu_layer)->convolutional.backward.gradient.x, VARY(fifth_gpu_layer)->convolutional.backward.gradient.y, VARY(fifth_gpu_layer)->convolutional.backward.gradient.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* fifth_back = 0; + cudaMallocHost(&fifth_back, sizeof(float) * forth_out_rows * forth_out_cols * forth_out_channels * batch); + cudaMemcpy(fifth_back, GPU(convnet)->device[device_id].backwards[4], sizeof(float) * forth_out_rows * forth_out_cols * forth_out_channels * batch, cudaMemcpyDeviceToHost); + float* fifth_grad = 0; + cudaMallocHost(&fifth_grad, sizeof(float) * (fifth_gpu_layer->wnum + fifth_gpu_layer->net.convolutional.count)); + assert(fifth_grad); + cudaMemcpy(fifth_grad, fifth_gpu_configuration->w, sizeof(float) * (fifth_gpu_layer->wnum + fifth_gpu_layer->net.convolutional.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate fifth convolutional layer on GPU\n"); + + // third convolutional layer backward propagate + cudaMemcpy(GPU(convnet)->device[device_id].backwards[3], GPU(convnet)->device[device_id].forwards[2], sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch, cudaMemcpyDeviceToDevice); + ccv_convnet_layer_t* third_gpu_configuration = GPU(convnet)->device[device_id].configurations + 2; + VARY(third_gpu_layer)->convolutional.backward.coefficient.x = 4; + VARY(third_gpu_layer)->convolutional.backward.coefficient.y = 4; + VARY(third_gpu_layer)->convolutional.backward.coefficient.z = 16; + VARY(third_gpu_layer)->convolutional.backward.gradient.x = 4; + VARY(third_gpu_layer)->convolutional.backward.gradient.y = 6; + VARY(third_gpu_layer)->convolutional.backward.gradient.z = 24; + cudaEventRecord(start, context->device[device_id].data_stream);
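+ // editor's note: as with the calls above, the arguments after `batch` read as
+ // (output delta, layer output, layer input, buffer for the input delta), followed by
+ // the configuration struct that receives the accumulated weight/bias gradients and
+ // the shared scratch/unit buffers -- inferred from how the buffers are wired, not
+ // from a documented signature. Note also that the fourth (average pool) layer is
+ // skipped in this GPU chain: backwards[3] was re-seeded from forwards[2] above.
+ _cwc_convnet_convolutional_backward_propagate(third_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[3], GPU(convnet)->device[device_id].forwards[2], GPU(convnet)->device[device_id].forwards[1], GPU(convnet)->device[device_id].backwards[2], third_gpu_configuration, 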
GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, %d %d %d, elapsed time for third convolutional layer bprop: %f milliseconds\n", VARY(third_gpu_layer)->convolutional.backward.coefficient.x, VARY(third_gpu_layer)->convolutional.backward.coefficient.y, VARY(third_gpu_layer)->convolutional.backward.coefficient.z, VARY(third_gpu_layer)->convolutional.backward.gradient.x, VARY(third_gpu_layer)->convolutional.backward.gradient.y, VARY(third_gpu_layer)->convolutional.backward.gradient.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* third_back = 0; + cudaMallocHost(&third_back, sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch); + cudaMemcpy(third_back, GPU(convnet)->device[device_id].backwards[2], sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch, cudaMemcpyDeviceToHost); + float* third_grad = 0; + cudaMallocHost(&third_grad, sizeof(float) * (third_gpu_layer->wnum + third_gpu_layer->net.convolutional.count)); + assert(third_grad); + cudaMemcpy(third_grad, third_gpu_configuration->w, sizeof(float) * (third_gpu_layer->wnum + third_gpu_layer->net.convolutional.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate third convolutional layer on GPU\n"); + + // second average pool layer backward propagate + _cwc_convnet_average_pool_backward_propagate(second_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[2], GPU(convnet)->device[device_id].backwards[1], context->device[device_id].data_stream); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* second_back = 0; + cudaMallocHost(&second_back, sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch); + cudaMemcpy(second_back, GPU(convnet)->device[device_id].backwards[1], sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch, cudaMemcpyDeviceToHost); + printf("finished backward propagate second average pool layer on GPU\n"); + + // first convolutional layer backward propagate + ccv_convnet_layer_t* first_gpu_configuration = GPU(convnet)->device[device_id].configurations; + VARY(first_gpu_layer)->convolutional.backward.coefficient.x = 1; + VARY(first_gpu_layer)->convolutional.backward.coefficient.y = 3; + VARY(first_gpu_layer)->convolutional.backward.coefficient.z = 1; + cudaEventRecord(start, context->device[device_id].data_stream); + _cwc_convnet_convolutional_backward_propagate(first_gpu_layer, batch, GPU(convnet)->device[device_id].backwards[1], GPU(convnet)->device[device_id].forwards[0], context->device[device_id].input, GPU(convnet)->device[device_id].backwards[0], first_gpu_configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + cudaEventRecord(stop, context->device[device_id].data_stream); + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed_time, start, stop); + printf("%d %d %d, elapsed time for first convolutional layer bprop: %f milliseconds\n", VARY(first_gpu_layer)->convolutional.backward.coefficient.x, 
VARY(first_gpu_layer)->convolutional.backward.coefficient.y, VARY(first_gpu_layer)->convolutional.backward.coefficient.z, elapsed_time); + cudaStreamSynchronize(context->device[device_id].data_stream); + assert(cudaGetLastError() == cudaSuccess); + float* first_grad = 0; + cudaMallocHost(&first_grad, sizeof(float) * (first_gpu_layer->wnum + first_gpu_layer->net.convolutional.count)); + assert(first_grad); + cudaMemcpy(first_grad, first_gpu_configuration->w, sizeof(float) * (first_gpu_layer->wnum + first_gpu_layer->net.convolutional.count), cudaMemcpyDeviceToHost); + printf("finished backward propagate first convolutional layer on GPU\n"); + cudaEventDestroy(start); + cudaEventDestroy(stop); + int x, y, k, c; + for (i = 0; i < batch; i++) + { + printf("doing batch %d of %d\n", i + 1, batch); + ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); + for (x = 0; x < categorized->matrix->rows * categorized->matrix->cols * CCV_GET_CHANNEL(categorized->matrix->type); x++) + categorized->matrix->data.f32[x] = categorized->matrix->data.f32[x] - 128; + + // first convolutional layer forward propagate + ccv_convnet_layer_t* first_cpu_layer = convnet->layers; + _ccv_convnet_convolutional_forward_propagate(first_cpu_layer, categorized->matrix, convnet->acts); + ccv_dense_matrix_t* a = convnet->acts[0]; + for (y = 0; y < first_out_rows; y++) + for (x = 0; x < first_out_cols; x++) + for (k = 0; k < first_out_channels; k++) + { + float p = first_out[k * first_out_rows * first_out_cols * batch + (y * first_out_cols + x) * batch + i]; + float q = a->data.f32[y * first_out_cols * first_out_channels + x * first_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv fprop 1: %d %d %d %d: |%f - %f| = %f\n", i, x, y, k, p, q, delta); + } + // second average pool layer forward propagate + ccv_convnet_layer_t* second_cpu_layer = convnet->layers + 1; + _ccv_convnet_average_pool_forward_propagate(second_cpu_layer, convnet->acts[0], convnet->acts + 1); + ccv_dense_matrix_t* b = convnet->acts[1]; + for (y = 0; y < second_out_rows; y++) + for (x = 0; x < second_out_cols; x++) + for (k = 0; k < second_out_channels; k++) + { + float p = second_out[k * second_out_rows * second_out_cols * batch + (y * second_out_cols + x) * batch + i]; + float q = b->data.f32[y * second_out_cols * second_out_channels + x * second_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("avgpool fprop 2: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // third convolutional layer forward propagate + ccv_convnet_layer_t* third_cpu_layer = convnet->layers + 2; + _ccv_convnet_convolutional_forward_propagate(third_cpu_layer, convnet->acts[1], convnet->acts + 2); + ccv_dense_matrix_t* c = convnet->acts[2]; + for (y = 0; y < third_out_rows; y++) + for (x = 0; x < third_out_cols; x++) + for (k = 0; k < third_out_channels; k++) + { + float p = third_out[k * third_out_rows * third_out_cols * batch + (y * third_out_cols + x) * batch + i]; + float q = c->data.f32[(y * third_out_cols + x) * third_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv fprop 3: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // fourth average pool layer forward propagate + ccv_convnet_layer_t* forth_cpu_layer = convnet->layers + 3;
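+ // editor's note: every CPU/GPU check in this loop scores the mismatch as
+ //   delta = |p - q| / max(max(|p|, |q|), 1)
+ // i.e. relative error for magnitudes above 1 and absolute error below it, so tiny
+ // activations cannot trigger spurious failures; p is the GPU value in its
+ // batch-major layout, q the CPU value from the channel-interleaved ccv matrix.
+ _ccv_convnet_average_pool_forward_propagate(forth_cpu_layer, convnet->acts[2], 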
convnet->acts + 3); + ccv_dense_matrix_t* d = convnet->acts[3]; + for (y = 0; y < forth_out_rows; y++) + for (x = 0; x < forth_out_cols; x++) + for (k = 0; k < forth_out_channels; k++) + { + float p = forth_out[k * forth_out_rows * forth_out_cols * batch + (y * forth_out_cols + x) * batch + i]; + float q = d->data.f32[y * forth_out_cols * forth_out_channels + x * forth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("avgpool fprop 4: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // fifth convolutional layer forward propagate + ccv_convnet_layer_t* fifth_cpu_layer = convnet->layers + 4; + _ccv_convnet_convolutional_forward_propagate(fifth_cpu_layer, convnet->acts[3], convnet->acts + 4); + ccv_dense_matrix_t* e = convnet->acts[4]; + for (y = 0; y < fifth_out_rows; y++) + for (x = 0; x < fifth_out_cols; x++) + for (k = 0; k < fifth_out_channels; k++) + { + float p = fifth_out[k * fifth_out_rows * fifth_out_cols * batch + (y * fifth_out_cols + x) * batch + i]; + float q = e->data.f32[(y * fifth_out_cols + x) * fifth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv fprop 5: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // sixth convolutional layer forward propagate + ccv_convnet_layer_t* sixth_cpu_layer = convnet->layers + 5; + _ccv_convnet_convolutional_forward_propagate(sixth_cpu_layer, convnet->acts[4], convnet->acts + 5); + ccv_dense_matrix_t* f = convnet->acts[5]; + for (y = 0; y < sixth_out_rows; y++) + for (x = 0; x < sixth_out_cols; x++) + for (k = 0; k < sixth_out_channels; k++) + { + float p = sixth_out[k * sixth_out_rows * sixth_out_cols * batch + (y * sixth_out_cols + x) * batch + i]; + float q = f->data.f32[(y * sixth_out_cols + x) * sixth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv fprop 6: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // seventh convolutional layer forward propagate + ccv_convnet_layer_t* seventh_cpu_layer = convnet->layers + 6; + _ccv_convnet_convolutional_forward_propagate(seventh_cpu_layer, convnet->acts[5], convnet->acts + 6); + ccv_dense_matrix_t* g = convnet->acts[6]; + for (y = 0; y < seventh_out_rows; y++) + for (x = 0; x < seventh_out_cols; x++) + for (k = 0; k < seventh_out_channels; k++) + { + float p = seventh_out[k * seventh_out_rows * seventh_out_cols * batch + (y * seventh_out_cols + x) * batch + i]; + float q = g->data.f32[(y * seventh_out_cols + x) * seventh_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv fprop 7: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // eleventh full connect layer forward propagate + ccv_convnet_layer_t* eleventh_cpu_layer = convnet->layers + 10; + convnet->acts[9] = ccv_dense_matrix_new(eleventh_cpu_layer->input.node.count, 1, CCV_32F | CCV_C1, 0, 0); + for (k = 0; k < eleventh_cpu_layer->input.node.count; k++) + convnet->acts[9]->data.f32[k] = eleventh_in[k * batch + i]; + _ccv_convnet_full_connect_forward_propagate(eleventh_cpu_layer, convnet->acts[9], convnet->acts + 10); + ccv_dense_matrix_t* z = convnet->acts[10]; + for (k = 0; k < eleventh_cpu_layer->net.full_connect.count; k++) + { + float p = eleventh_out[k * batch + i]; + float q = z->data.f32[k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("fc 
fprop 11: %d %d: |%g - %g| = %g\n", i, k, p, q, delta); + } + _ccv_convnet_full_connect_backward_propagate(eleventh_cpu_layer, convnet->acts[10], convnet->acts[10], convnet->acts[9], update_params->acts + 9, update_params->layers + 10); + ccv_matrix_free(convnet->acts[9]); + ccv_dense_matrix_t* bz = update_params->acts[9]; + for (k = 0; k < eleventh_cpu_layer->input.node.count; k++) + { + float p = eleventh_back[k * batch + i]; + float q = bz->data.f32[k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("fc bprop 11: %d %d: |%g - %g| = %g\n", i, k, p, q, delta); + } + + // seventh convolutional layer backward propagate + _ccv_convnet_convolutional_backward_propagate(seventh_cpu_layer, convnet->acts[6], convnet->acts[6], convnet->acts[5], update_params->acts + 5, update_params->layers + 6); + ccv_dense_matrix_t* bg = update_params->acts[5]; + for (y = 0; y < sixth_out_rows; y++) + for (x = 0; x < sixth_out_cols; x++) + for (k = 0; k < sixth_out_channels; k++) + { + float p = seventh_back[k * sixth_out_rows * sixth_out_cols * batch + (y * sixth_out_cols + x) * batch + i]; + float q = bg->data.f32[(y * sixth_out_cols + x) * sixth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 7: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // sixth convolutional layer backward propagate + _ccv_convnet_convolutional_backward_propagate(sixth_cpu_layer, update_params->acts[5], convnet->acts[5], convnet->acts[4], update_params->acts + 4, update_params->layers + 5); + ccv_dense_matrix_t* bf = update_params->acts[4]; + for (y = 0; y < fifth_out_rows; y++) + for (x = 0; x < fifth_out_cols; x++) + for (k = 0; k < fifth_out_channels; k++) + { + float p = sixth_back[k * fifth_out_rows * fifth_out_cols * batch + (y * fifth_out_cols + x) * batch + i]; + float q = bf->data.f32[(y * fifth_out_cols + x) * fifth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-3) + printf("conv bprop 6: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // fifth convolutional layer backward propagate + _ccv_convnet_convolutional_backward_propagate(fifth_cpu_layer, update_params->acts[4], convnet->acts[4], convnet->acts[3], update_params->acts + 3, update_params->layers + 4); + ccv_dense_matrix_t* be = update_params->acts[3]; + for (y = 0; y < forth_out_rows; y++) + for (x = 0; x < forth_out_cols; x++) + for (k = 0; k < forth_out_channels; k++) + { + float p = fifth_back[k * forth_out_rows * forth_out_cols * batch + (y * forth_out_cols + x) * batch + i]; + float q = be->data.f32[(y * forth_out_cols + x) * forth_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-2) + printf("conv bprop 5: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // third convolutional layer backward propagate + _ccv_convnet_convolutional_backward_propagate(third_cpu_layer, convnet->acts[2], convnet->acts[2], convnet->acts[1], update_params->acts + 1, update_params->layers + 2); + ccv_dense_matrix_t* bc = update_params->acts[1]; + for (y = 0; y < second_out_rows; y++) + for (x = 0; x < second_out_cols; x++) + for (k = 0; k < second_out_channels; k++) + { + float p = third_back[k * second_out_rows * second_out_cols * batch + (y * second_out_cols + x) * batch + i]; + float q = bc->data.f32[(y * second_out_cols + x) * second_out_channels + k]; + float delta = fabs(p - q) / 
ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 3: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + // second average pool layer backward propagate + _ccv_convnet_average_pool_backward_propagate(second_cpu_layer, update_params->acts[1], convnet->acts[0], update_params->acts); + ccv_dense_matrix_t* bb = update_params->acts[0]; + for (y = 0; y < first_out_rows; y++) + for (x = 0; x < first_out_cols; x++) + for (k = 0; k < first_out_channels; k++) + { + float p = second_back[k * first_out_rows * first_out_cols * batch + (y * first_out_cols + x) * batch + i]; + float q = bb->data.f32[y * first_out_cols * first_out_channels + x * first_out_channels + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("avgpool bprop 2: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); + } + + // first convolutional layer backward propagate + _ccv_convnet_convolutional_backward_propagate(first_cpu_layer, update_params->acts[0], convnet->acts[0], categorized->matrix, 0, update_params->layers); + } + + ccv_convnet_layer_t* eleventh_cpu_configuration = update_params->layers + 10; + for (x = 0; x < eleventh_cpu_configuration->net.full_connect.count; x++) + for (y = 0; y < eleventh_cpu_configuration->input.node.count; y++) + { + float p = eleventh_cpu_configuration->w[x * eleventh_cpu_configuration->input.node.count + y]; + float q = eleventh_grad[x * eleventh_cpu_configuration->input.node.count + y]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-3) + printf("fc bprop 11: %d %d: |%g - %g| = %g\n", x, y, p, q, delta); + } + for (x = 0; x < eleventh_cpu_configuration->net.full_connect.count; x++) + { + float p = eleventh_cpu_configuration->bias[x]; + float q = eleventh_grad[eleventh_cpu_configuration->net.full_connect.count * eleventh_cpu_configuration->input.node.count + x]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-3) + printf("fc bprop 11 bias: %d: |%g - %g| = %g\n", x, p, q, delta); + } + + ccv_convnet_layer_t* seventh_cpu_configuration = update_params->layers + 6; + int seventh_filter_rows = seventh_gpu_layer->net.convolutional.rows; + int seventh_filter_cols = seventh_gpu_layer->net.convolutional.cols; + int seventh_filter_count = seventh_gpu_layer->net.convolutional.count; + int seventh_filter_channels = seventh_gpu_layer->net.convolutional.channels / 2; + for (y = 0; y < seventh_filter_rows; y++) + for (x = 0; x < seventh_filter_cols; x++) + for (k = 0; k < seventh_filter_count; k++) + for (c = 0; c < seventh_filter_channels; c++) + { + float p = seventh_cpu_configuration->w[(y * seventh_filter_cols + x) * seventh_filter_channels + k * seventh_filter_cols * seventh_filter_rows * seventh_filter_channels + c]; + float q = seventh_grad[(y * seventh_filter_cols + x) * seventh_filter_count + k + c * seventh_filter_cols * seventh_filter_rows * seventh_filter_count]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 7: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); + } + for (k = 0; k < seventh_filter_count; k++) + { + float p = seventh_cpu_configuration->bias[k]; + float q = seventh_grad[seventh_gpu_layer->wnum + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 7 bias: %d: |%g - %g| = %g\n", k, p, q, delta); + } + + ccv_convnet_layer_t* sixth_cpu_configuration = update_params->layers + 5; + int 
sixth_filter_rows = sixth_gpu_layer->net.convolutional.rows; + int sixth_filter_cols = sixth_gpu_layer->net.convolutional.cols; + int sixth_filter_count = sixth_gpu_layer->net.convolutional.count; + int sixth_filter_channels = sixth_gpu_layer->net.convolutional.channels / 2; + for (y = 0; y < sixth_filter_rows; y++) + for (x = 0; x < sixth_filter_cols; x++) + for (k = 0; k < sixth_filter_count; k++) + for (c = 0; c < sixth_filter_channels; c++) + { + float p = sixth_cpu_configuration->w[(y * sixth_filter_cols + x) * sixth_filter_channels + k * sixth_filter_cols * sixth_filter_rows * sixth_filter_channels + c]; + float q = sixth_grad[(y * sixth_filter_cols + x) * sixth_filter_count + k + c * sixth_filter_cols * sixth_filter_rows * sixth_filter_count]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-3) + printf("conv bprop 6: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); + } + for (k = 0; k < sixth_filter_count; k++) + { + float p = sixth_cpu_configuration->bias[k]; + float q = sixth_grad[sixth_gpu_layer->wnum + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 6 bias: %d: |%g - %g| = %g\n", k, p, q, delta); + } + + ccv_convnet_layer_t* fifth_cpu_configuration = update_params->layers + 4; + int fifth_filter_rows = fifth_gpu_layer->net.convolutional.rows; + int fifth_filter_cols = fifth_gpu_layer->net.convolutional.cols; + int fifth_filter_count = fifth_gpu_layer->net.convolutional.count; + int fifth_filter_channels = fifth_gpu_layer->net.convolutional.channels; + for (y = 0; y < fifth_filter_rows; y++) + for (x = 0; x < fifth_filter_cols; x++) + for (k = 0; k < fifth_filter_count; k++) + for (c = 0; c < fifth_filter_channels; c++) + { + float p = fifth_cpu_configuration->w[(y * fifth_filter_cols + x) * fifth_filter_channels + k * fifth_filter_cols * fifth_filter_rows * fifth_filter_channels + c]; + float q = fifth_grad[(y * fifth_filter_cols + x) * fifth_filter_count + k + c * fifth_filter_cols * fifth_filter_rows * fifth_filter_count]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-2) + printf("conv bprop 5: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); + } + for (k = 0; k < fifth_filter_count; k++) + { + float p = fifth_cpu_configuration->bias[k]; + float q = fifth_grad[fifth_gpu_layer->wnum + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 5 bias: %d: |%g - %g| = %g\n", k, p, q, delta); + } + + ccv_convnet_layer_t* third_cpu_configuration = update_params->layers + 2; + int third_filter_rows = third_gpu_layer->net.convolutional.rows; + int third_filter_cols = third_gpu_layer->net.convolutional.cols; + int third_filter_count = third_gpu_layer->net.convolutional.count; + int third_filter_channels = third_gpu_layer->net.convolutional.channels / 2; + for (y = 0; y < third_filter_rows; y++) + for (x = 0; x < third_filter_cols; x++) + for (k = 0; k < third_filter_count; k++) + for (c = 0; c < third_filter_channels; c++) + { + float p = third_cpu_configuration->w[(y * third_filter_cols + x) * third_filter_channels + k * third_filter_cols * third_filter_rows * third_filter_channels + c]; + float q = third_grad[(y * third_filter_cols + x) * third_filter_count + k + c * third_filter_cols * third_filter_rows * third_filter_count]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 3: %d %d %d %d: |%g - %g| = %g\n", 
x, y, k, c, p, q, delta); + } + for (k = 0; k < third_filter_count; k++) + { + float p = third_cpu_configuration->bias[k]; + float q = third_grad[third_gpu_layer->wnum + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 3 bias: %d: |%g - %g| = %g\n", k, p, q, delta); + } + + ccv_convnet_layer_t* first_cpu_configuration = update_params->layers; + int first_filter_rows = first_gpu_layer->net.convolutional.rows; + int first_filter_cols = first_gpu_layer->net.convolutional.cols; + int first_filter_count = first_gpu_layer->net.convolutional.count; + int first_filter_channels = first_gpu_layer->net.convolutional.channels; + for (y = 0; y < first_filter_rows; y++) + for (x = 0; x < first_filter_cols; x++) + for (k = 0; k < first_filter_count; k++) + for (c = 0; c < first_filter_channels; c++) + { + float p = first_cpu_configuration->w[(y * first_filter_cols + x) * first_filter_channels + k * first_filter_cols * first_filter_rows * first_filter_channels + c]; + float q = first_grad[(y * first_filter_cols + x) * first_filter_count + k + c * first_filter_cols * first_filter_rows * first_filter_count]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-3) + printf("conv bprop 1: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); + } + for (k = 0; k < first_filter_count; k++) + { + float p = first_cpu_configuration->bias[k]; + float q = first_grad[first_gpu_layer->wnum + k]; + float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); + if (delta > 1e-4) + printf("conv bprop 1 bias: %d: |%g - %g| = %g\n", k, p, q, delta); + } + cudaFreeHost(eleventh_in); +} diff --git a/bin/cwc-bench.c b/bin/cuda/cwc-verify.c similarity index 77% rename from bin/cwc-bench.c rename to bin/cuda/cwc-verify.c index b49ac23f2..64a4f0779 100644 --- a/bin/cwc-bench.c +++ b/bin/cuda/cwc-verify.c @@ -1,7 +1,7 @@ #include "ccv.h" #include -void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params); +void cwc_verify_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params); int main(int argc, char** argv) { @@ -25,27 +25,29 @@ int main(int argc, char** argv) } fclose(r); free(file); - ccv_convnet_layer_param_t params[] = { + ccv_convnet_layer_param_t params[11] = { // first layer (convolutional => max pool => rnorm) { .type = CCV_CONVNET_CONVOLUTIONAL, - .bias = 1, + .bias = 0, .sigma = 0.01, .input = { .matrix = { .rows = 225, .cols = 225, .channels = 3, + .partition = 1, }, }, .output = { .convolutional = { .count = 96, - .strides = 4, + .strides = 2, .border = 1, - .rows = 11, - .cols = 11, + .rows = 7, + .cols = 7, .channels = 3, + .partition = 2, }, }, }, @@ -53,9 +55,10 @@ int main(int argc, char** argv) .type = CCV_CONVNET_AVERAGE_POOL, .input = { .matrix = { - .rows = 55, - .cols = 55, + .rows = 111, + .cols = 111, .channels = 96, + .partition = 2, }, }, .output = { @@ -69,23 +72,25 @@ int main(int argc, char** argv) // second layer (convolutional => max pool => rnorm) { .type = CCV_CONVNET_CONVOLUTIONAL, - .bias = 0, + .bias = 1, .sigma = 0.01, .input = { .matrix = { - .rows = 27, - .cols = 27, + .rows = 55, + .cols = 55, .channels = 96, + .partition = 2, }, }, .output = { .convolutional = { .count = 256, - .strides = 1, - .border = 2, + .strides = 2, + .border = 1, .rows = 5, .cols = 5, .channels = 96, + .partition = 2, }, }, }, @@ -96,6 +101,7 @@ int main(int argc, char** argv) .rows = 27, .cols = 27, .channels = 256, + 
.partition = 2, }, }, .output = { @@ -109,11 +115,14 @@ int main(int argc, char** argv) // third layer (convolutional) { .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, .input = { .matrix = { .rows = 13, .cols = 13, .channels = 256, + .partition = 1, }, }, .output = { @@ -124,17 +133,21 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 256, + .partition = 2, }, }, }, // fourth layer (convolutional) { .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, .input = { .matrix = { .rows = 13, .cols = 13, .channels = 384, + .partition = 2, }, }, .output = { @@ -145,17 +158,21 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 384, + .partition = 2, }, }, }, // fifth layer (convolutional => max pool) { .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 1, + .sigma = 0.01, .input = { .matrix = { .rows = 13, .cols = 13, .channels = 384, + .partition = 2, }, }, .output = { @@ -166,16 +183,18 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 384, + .partition = 2, }, }, }, { - .type = CCV_CONVNET_AVERAGE_POOL, + .type = CCV_CONVNET_MAX_POOL, .input = { .matrix = { .rows = 13, .cols = 13, .channels = 256, + .partition = 2, }, }, .output = { @@ -189,13 +208,14 @@ int main(int argc, char** argv) // sixth layer (full connect) { .type = CCV_CONVNET_FULL_CONNECT, - .bias = 0, + .bias = 1, .sigma = 0.01, .input = { .matrix = { .rows = 6, .cols = 6, .channels = 256, + .partition = 1, }, .node = { .count = 6 * 6 * 256, @@ -203,6 +223,7 @@ int main(int argc, char** argv) }, .output = { .full_connect = { + .relu = 1, .count = 4096, }, }, @@ -210,13 +231,14 @@ int main(int argc, char** argv) // seventh layer (full connect) { .type = CCV_CONVNET_FULL_CONNECT, - .bias = 0, + .bias = 1, .sigma = 0.01, .input = { .matrix = { .rows = 4096, .cols = 1, .channels = 1, + .partition = 1, }, .node = { .count = 4096, @@ -224,6 +246,7 @@ int main(int argc, char** argv) }, .output = { .full_connect = { + .relu = 1, .count = 4096, }, }, @@ -238,6 +261,7 @@ int main(int argc, char** argv) .rows = 4096, .cols = 1, .channels = 1, + .partition = 1, }, .node = { .count = 4096, @@ -245,17 +269,18 @@ int main(int argc, char** argv) }, .output = { .full_connect = { + .relu = 0, .count = 1000, }, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(1, params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); + ccv_convnet_t* convnet = ccv_convnet_new(1, ccv_size(225, 225), params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); ccv_convnet_verify(convnet, 1000); - ccv_convnet_layer_train_param_t layer_params[13]; + ccv_convnet_layer_train_param_t layer_params[11]; memset(layer_params, 0, sizeof(layer_params)); int i; - for (i = 0; i < 13; i++) + for (i = 0; i < 11; i++) { layer_params[i].w.decay = 0.005; layer_params[i].w.learn_rate = 0.0005; @@ -266,33 +291,31 @@ int main(int argc, char** argv) } ccv_convnet_train_param_t train_params = { .max_epoch = 100, - .mini_batch = 256, + .mini_batch = 128, + .dual_device = 0, .layer_params = layer_params, }; - for (i = 0; i < 256; i++) + for (i = 0; i < 128; i++) { ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); ccv_dense_matrix_t* image = 0; ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); ccv_dense_matrix_t* b = 0; - if (image->rows > 251 && image->cols > 251) - ccv_resample(image, &b, 0, ccv_max(251, (int)(image->rows * 251.0 / image->cols + 0.5)), ccv_max(251, (int)(image->cols * 251.0 / image->rows + 0.5)), CCV_INTER_AREA); - else if 
(image->rows < 251 || image->cols < 251) - ccv_resample(image, &b, 0, ccv_max(251, (int)(image->rows * 251.0 / image->cols + 0.5)), ccv_max(251, (int)(image->cols * 251.0 / image->rows + 0.5)), CCV_INTER_CUBIC); + if (image->rows > 225 && image->cols > 225) + ccv_resample(image, &b, 0, ccv_max(225, (int)(image->rows * 225.0 / image->cols + 0.5)), ccv_max(225, (int)(image->cols * 225.0 / image->rows + 0.5)), CCV_INTER_AREA); + else if (image->rows < 225 || image->cols < 225) + ccv_resample(image, &b, 0, ccv_max(225, (int)(image->rows * 225.0 / image->cols + 0.5)), ccv_max(225, (int)(image->cols * 225.0 / image->rows + 0.5)), CCV_INTER_CUBIC); else b = image; if (b != image) ccv_matrix_free(image); ccv_dense_matrix_t* c = 0; ccv_slice(b, (ccv_matrix_t**)&c, CCV_32F, 0, 0, 225, 225); - int j, ch = CCV_GET_CHANNEL(c->type); - for (j = 0; j < c->rows * c->cols * ch; j++) - c->data.f32[j] = c->data.f32[j] / 255.0 * 2 - 1; ccv_matrix_free(b); categorized->type = CCV_CATEGORIZED_DENSE_MATRIX; categorized->matrix = c; } - cwc_bench_runtime(convnet, categorizeds, train_params); + cwc_verify_runtime(convnet, categorizeds, train_params); ccv_disable_cache(); return 0; } diff --git a/bin/cuda/makefile b/bin/cuda/makefile new file mode 100644 index 000000000..8952d7b2e --- /dev/null +++ b/bin/cuda/makefile @@ -0,0 +1,26 @@ +include ../../lib/config.mk + +#CC += -faddress-sanitizer -fno-omit-frame-pointer +LDFLAGS := -L"../../lib" -lccv $(LDFLAGS) +CFLAGS := -O3 -Wall -I"../../lib" $(CFLAGS) +NVFLAGS := -O3 -I"../../lib" -lineinfo $(NVFLAGS) + +all: libccv.a cwc-bench cwc-verify + +clean: + ${MAKE} clean -C ../../lib ; rm -f *.o $(TARGETS) + +cwc-bench: %: %.o cwc-bench-runtime.o libccv.a + $(CC) -o $@ cwc-bench-runtime.o $< $(LDFLAGS) + +cwc-verify: %: %.o cwc-verify-runtime.o libccv.a + $(CC) -o $@ cwc-verify-runtime.o $< $(LDFLAGS) + +libccv.a: + ${MAKE} -C ../../lib + +%.o: %.c ../../lib/ccv.h + $(CC) $< -o $@ -c $(CFLAGS) + +%.o: %.cu ../../lib/ccv.h ../../lib/cuda/*.h ../../lib/cuda/*.cu + $(NVCC) $< -o $@ -c $(NVFLAGS) diff --git a/bin/cwc-bench-runtime.cu b/bin/cwc-bench-runtime.cu deleted file mode 100644 index de132000f..000000000 --- a/bin/cwc-bench-runtime.cu +++ /dev/null @@ -1,203 +0,0 @@ -extern "C" { -#include "ccv.h" -} -#include -#define CASE_TESTS // so that we don't include public available methods -#include "../lib/cuda/cwc_convnet.cu" -#include "../lib/ccv_convnet.c" - -extern "C" void cwc_bench_runtime(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_convnet_train_param_t params) -{ - int batch = params.mini_batch; - _cwc_convnet_alloc_reserved(convnet, batch, params.layer_params); - cwc_convnet_context_t* context = GPU(convnet)->contexts; - _cwc_convnet_batch_formation(0, categorizeds, 0, ccv_size(251, 251), convnet->rows, convnet->cols, convnet->channels, batch, 0, batch, context->host.input, context->host.c); - cudaMemcpy(context->device.input, context->host.input, sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch, cudaMemcpyHostToDevice); - - ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); - _ccv_convnet_update_zero(update_params); - - // first convolutional layer forward propagate - ccv_convnet_layer_t* first_gpu_layer = GPU(convnet)->layers; - _cwc_convnet_convolutional_forward_propagate(first_gpu_layer, batch, context->device.input, GPU(convnet)->forwards[0], context->device.stream); - cudaStreamSynchronize(context->device.stream); - int first_out_rows, first_out_cols, first_out_channels = 
first_gpu_layer->net.convolutional.count; - _cwc_convnet_layer_deduce_output_format(first_gpu_layer, &first_out_rows, &first_out_cols); - float* first_out = 0; - cudaMallocHost(&first_out, sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch); - cudaMemcpy(first_out, GPU(convnet)->forwards[0], sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch, cudaMemcpyDeviceToHost); - printf("finished forward propagate first convolutional layer on GPU\n"); - - // second average pool layer forward propagate - ccv_convnet_layer_t* second_gpu_layer = GPU(convnet)->layers + 1; - _cwc_convnet_average_pool_forward_propagate(second_gpu_layer, batch, GPU(convnet)->forwards[0], GPU(convnet)->forwards[1], context->device.stream); - cudaStreamSynchronize(context->device.stream); - int second_out_rows, second_out_cols, second_out_channels = second_gpu_layer->input.matrix.channels; - _cwc_convnet_layer_deduce_output_format(second_gpu_layer, &second_out_rows, &second_out_cols); - float* second_out = 0; - cudaMallocHost(&second_out, sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch); - cudaMemcpy(second_out, GPU(convnet)->forwards[1], sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch, cudaMemcpyDeviceToHost); - printf("finished forward propagate second average pool layer on GPU\n"); - - // third convolutional layer forward propagate - ccv_convnet_layer_t* third_gpu_layer = GPU(convnet)->layers + 2; - _cwc_convnet_convolutional_forward_propagate(third_gpu_layer, batch, GPU(convnet)->forwards[1], GPU(convnet)->forwards[2], context->device.stream); - cudaStreamSynchronize(context->device.stream); - int third_out_rows, third_out_cols, third_out_channels = third_gpu_layer->net.convolutional.count; - _cwc_convnet_layer_deduce_output_format(third_gpu_layer, &third_out_rows, &third_out_cols); - float* third_out = 0; - cudaMallocHost(&third_out, sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch); - cudaMemcpy(third_out, GPU(convnet)->forwards[2], sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch, cudaMemcpyDeviceToHost); - printf("finished forward propagate third convolutional layer on GPU\n"); - - // third convolutonal layer backward propagate - cudaMemcpy(GPU(convnet)->backwards[3], GPU(convnet)->forwards[2], sizeof(float) * third_out_rows * third_out_cols * third_out_channels * batch, cudaMemcpyDeviceToDevice); - ccv_convnet_layer_t* third_gpu_configuration = GPU(convnet)->configurations + 2; - _cwc_convnet_convolutional_backward_propagate(third_gpu_layer, batch, GPU(convnet)->backwards[3], GPU(convnet)->forwards[2], GPU(convnet)->forwards[1], GPU(convnet)->backwards[2], third_gpu_configuration, GPU(convnet)->scratch, GPU(convnet)->unit, context->device.stream, context->device.cublas); - cudaStreamSynchronize(context->device.stream); - assert(cudaGetLastError() == cudaSuccess); - float* third_back = 0; - cudaMallocHost(&third_back, sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch); - cudaMemcpy(third_back, GPU(convnet)->backwards[2], sizeof(float) * second_out_rows * second_out_cols * second_out_channels * batch, cudaMemcpyDeviceToHost); - float* third_grad = 0; - cudaMallocHost(&third_grad, sizeof(float) * third_gpu_layer->wnum); - assert(third_grad); - cudaMemcpy(third_grad, third_gpu_configuration->w, sizeof(float) * third_gpu_layer->wnum, cudaMemcpyDeviceToHost); - printf("finished backward propagate third convolutional layer on 
GPU\n"); - - // second average pool layer backward propagate - _cwc_convnet_average_pool_backward_propagate(second_gpu_layer, batch, GPU(convnet)->backwards[2], GPU(convnet)->backwards[1], context->device.stream); - cudaStreamSynchronize(context->device.stream); - assert(cudaGetLastError() == cudaSuccess); - float* second_back = 0; - cudaMallocHost(&second_back, sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch); - cudaMemcpy(second_back, GPU(convnet)->backwards[1], sizeof(float) * first_out_rows * first_out_cols * first_out_channels * batch, cudaMemcpyDeviceToHost); - printf("finished backward propagate second average pool layer on GPU\n"); - - // first convolutional layer backward propagate - ccv_convnet_layer_t* first_gpu_configuration = GPU(convnet)->configurations; - _cwc_convnet_convolutional_backward_propagate(first_gpu_layer, batch, GPU(convnet)->backwards[1], GPU(convnet)->forwards[0], context->device.input, GPU(convnet)->backwards[0], first_gpu_configuration, GPU(convnet)->scratch, GPU(convnet)->unit, context->device.stream, context->device.cublas); - cudaStreamSynchronize(context->device.stream); - assert(cudaGetLastError() == cudaSuccess); - float* first_grad = 0; - cudaMallocHost(&first_grad, sizeof(float) * first_gpu_layer->wnum); - assert(first_grad); - cudaMemcpy(first_grad, first_gpu_configuration->w, sizeof(float) * first_gpu_layer->wnum, cudaMemcpyDeviceToHost); - printf("finished backward propagate first convolutional layer on GPU\n"); - - int i, x, y, k, c; - for (i = 0; i < batch; i++) - { - printf("doing batch %d of %d\n", i + 1, batch); - ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); - - // first convolutional layer forward propagate - ccv_convnet_layer_t* first_cpu_layer = convnet->layers; - _ccv_convnet_convolutional_forward_propagate(first_cpu_layer, categorized->matrix, 0, convnet->acts); - ccv_dense_matrix_t* a = convnet->acts[0]; - for (y = 0; y < first_out_rows; y++) - for (x = 0; x < first_out_cols; x++) - for (k = 0; k < first_out_channels; k++) - { - float p = first_out[k * first_out_rows * first_out_cols * batch + (y * first_out_cols + x) * batch + i]; - float q = a->data.f32[y * first_out_cols * first_out_channels + x * first_out_channels + k]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-5) - printf("conv fprop 1: %d %d %d %d: |%f - %f| = %f\n", i, x, y, k, p, q, delta); - } - - // second average pool layer forward propagate - ccv_convnet_layer_t* second_cpu_layer = convnet->layers + 1; - _ccv_convnet_average_pool_forward_propagate(second_cpu_layer, convnet->acts[0], convnet->acts + 1); - ccv_dense_matrix_t* b = convnet->acts[1]; - for (y = 0; y < second_out_rows; y++) - for (x = 0; x < second_out_cols; x++) - for (k = 0; k < second_out_channels; k++) - { - float p = second_out[k * second_out_rows * second_out_cols * batch + (y * second_out_cols + x) * batch + i]; - float q = b->data.f32[y * second_out_cols * second_out_channels + x * second_out_channels + k]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-5) - printf("avgpool fprop 2: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); - } - - // third convolutional layer forward propagate - ccv_convnet_layer_t* third_cpu_layer = convnet->layers + 2; - _ccv_convnet_convolutional_forward_propagate(third_cpu_layer, convnet->acts[1], 0, convnet->acts + 2); - ccv_dense_matrix_t* c = convnet->acts[2]; - for (y = 0; y < third_out_rows; y++) - for (x = 0; x < 
third_out_cols; x++) - for (k = 0; k < third_out_channels; k++) - { - float p = third_out[k * third_out_rows * third_out_cols * batch + (y * third_out_cols + x) * batch + i]; - float q = c->data.f32[y * third_out_cols * third_out_channels + x * third_out_channels + k]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-5) - printf("conv fprop 3: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); - } - - // third convolutional layer backward propagate - _ccv_convnet_convolutional_backward_propagate(third_cpu_layer, convnet->acts[2], convnet->acts[2], 0, convnet->acts[1], update_params->acts + 1, update_params->layers + 2); - ccv_dense_matrix_t* bc = update_params->acts[1]; - for (y = 0; y < second_out_rows; y++) - for (x = 0; x < second_out_cols; x++) - for (k = 0; k < second_out_channels; k++) - { - float p = third_back[k * second_out_rows * second_out_cols * batch + (y * second_out_cols + x) * batch + i]; - float q = bc->data.f32[y * second_out_cols * second_out_channels + x * second_out_channels + k]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-5) - printf("conv bprop 3: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); - } - - // second average pool layer backward propagate - _ccv_convnet_average_pool_backward_propagate(second_cpu_layer, update_params->acts[1], convnet->acts[0], update_params->acts); - ccv_dense_matrix_t* bb = update_params->acts[0]; - for (y = 0; y < first_out_rows; y++) - for (x = 0; x < first_out_cols; x++) - for (k = 0; k < first_out_channels; k++) - { - float p = second_back[k * first_out_rows * first_out_cols * batch + (y * first_out_cols + x) * batch + i]; - float q = bb->data.f32[y * first_out_cols * first_out_channels + x * first_out_channels + k]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-5) - printf("avgpool bprop 2: %d %d %d %d: |%g - %g| = %g\n", i, x, y, k, p, q, delta); - } - - // first convolutional layer backward propagate - _ccv_convnet_convolutional_backward_propagate(first_cpu_layer, update_params->acts[0], convnet->acts[0], 0, categorized->matrix, 0, update_params->layers); - } - ccv_convnet_layer_t* third_cpu_configuration = update_params->layers + 2; - int third_filter_rows = third_gpu_layer->net.convolutional.rows; - int third_filter_cols = third_gpu_layer->net.convolutional.cols; - int third_filter_count = third_gpu_layer->net.convolutional.count; - int third_filter_channels = third_gpu_layer->net.convolutional.channels; - for (y = 0; y < third_filter_rows; y++) - for (x = 0; x < third_filter_cols; x++) - for (k = 0; k < third_filter_count; k++) - for (c = 0; c < third_filter_channels; c++) - { - float p = third_cpu_configuration->w[(y * third_filter_cols + x) * third_filter_channels + k * third_filter_cols * third_filter_rows * third_filter_channels + c]; - float q = third_grad[(y * third_filter_cols + x) * third_filter_count + k + c * third_filter_cols * third_filter_rows * third_filter_count]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-4) - printf("conv bprop 3: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); - } - - ccv_convnet_layer_t* first_cpu_configuration = update_params->layers; - int first_filter_rows = first_gpu_layer->net.convolutional.rows; - int first_filter_cols = first_gpu_layer->net.convolutional.cols; - int first_filter_count = first_gpu_layer->net.convolutional.count; - int first_filter_channels = 
first_gpu_layer->net.convolutional.channels; - for (y = 0; y < first_filter_rows; y++) - for (x = 0; x < first_filter_cols; x++) - for (k = 0; k < 1; k++) // first_filter_count; k++) - for (c = 0; c < first_filter_channels; c++) - { - float p = first_cpu_configuration->w[(y * first_filter_cols + x) * first_filter_channels + k * first_filter_cols * first_filter_rows * first_filter_channels + c]; - float q = first_grad[(y * first_filter_cols + x) * first_filter_count + k + c * first_filter_cols * first_filter_rows * first_filter_count]; - float delta = fabs(p - q) / ccv_max(ccv_max(fabs(p), fabs(q)), 1); - if (delta > 1e-4) - printf("conv bprop 1: %d %d %d %d: |%g - %g| = %g\n", x, y, k, c, p, q, delta); - } -} diff --git a/bin/dpmdetect.c b/bin/dpmdetect.c index 44e24dd5b..8fd908777 100644 --- a/bin/dpmdetect.c +++ b/bin/dpmdetect.c @@ -9,27 +9,96 @@ unsigned int get_current_time() return tv.tv_sec * 1000 + tv.tv_usec / 1000; } +int count_models(const char* directory, int *model_list) +{ + int num_models = 0; + + FILE* r = fopen(directory, "rt"); + if(r) + { + size_t len = 1024; + char* line = (char*)malloc(len); + ssize_t read; + /* check if it is a model file */ + if ((read = getline(&line, &len, r)) != -1) + { + while(read > 1 && isspace(line[read - 1])) + read--; + line[read] = 0; + if (strlen(line) == 1 && line[0] == '.') + { + /* don't leak the line buffer or the file handle on this early return */ + free(line); + fclose(r); + return 1; + } + /* if it reaches here, it must be a list of model files */ + (*model_list) = 1; + num_models++; + } + while (getline(&line, &len, r) != -1) + if (line[0] != '\n') + num_models++; + free(line); + fclose(r); + } + return num_models; +} + +ccv_dpm_mixture_model_t** read_models(const char* directory, int num_models, int model_list) +{ + ccv_dpm_mixture_model_t** models = (ccv_dpm_mixture_model_t**)ccmalloc(sizeof(ccv_dpm_mixture_model_t*) * num_models); + if (num_models > 1 || model_list) + { + int i; + FILE* r = fopen(directory, "rt"); + if(r) + { + size_t len = 1024; + char* line = (char*)malloc(len); + ssize_t read; + for (i = 0; i < num_models; i++) + { + if ((read = getline(&line, &len, r)) != -1) + { + if (line[0] != '\n') + { + while(read > 1 && isspace(line[read - 1])) + read--; + line[read] = 0; + models[i] = ccv_dpm_read_mixture_model(line); + } + else + i--; + } + } + free(line); + fclose(r); + } + } + else + models[0] = ccv_dpm_read_mixture_model(directory); + + return models; +} +
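+/* editor's note: count_models/read_models let dpmdetect accept either a single
+ * mixture model file or a plain-text list of model paths. Judging from the check in
+ * count_models, a model file announces itself with a first line consisting of the
+ * magic "."; anything else is treated as a list with one model path per line, e.g.
+ * (hypothetical paths):
+ *   car.m
+ *   pedestrian.m
+ * Blank lines are skipped, and trailing whitespace is trimmed from each path. */
 int main(int argc, char** argv) { assert(argc >= 3); - int i, j; + int i, j, num_models, model_list = 0; ccv_enable_default_cache(); ccv_dense_matrix_t* image = 0; ccv_read(argv[1], &image, CCV_IO_ANY_FILE); - ccv_dpm_mixture_model_t* model = ccv_dpm_read_mixture_model(argv[2]); + num_models = count_models(argv[2], &model_list); + ccv_dpm_mixture_model_t** models = read_models(argv[2], num_models, model_list); if (image != 0) { unsigned int elapsed_time = get_current_time(); - ccv_array_t* seq = ccv_dpm_detect_objects(image, &model, 1, ccv_dpm_default_params); + ccv_array_t* seq = ccv_dpm_detect_objects(image, models, num_models, ccv_dpm_default_params); elapsed_time = get_current_time() - elapsed_time; if (seq) { for (i = 0; i < seq->rnum; i++) { ccv_root_comp_t* comp = (ccv_root_comp_t*)ccv_array_get(seq, i); - printf("%d %d %d %d %f %d\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence, comp->pnum); + printf("%d %d %d %d %f %d\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence, comp->pnum); for (j = 0; j < comp->pnum; j++) - printf("| %d %d %d %d %f\n", comp->part[j].rect.x, 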
comp->part[j].rect.y, comp->part[j].rect.width, comp->part[j].rect.height, comp->part[j].confidence); + printf("| %d %d %d %d %f\n", comp->part[j].rect.x, comp->part[j].rect.y, comp->part[j].rect.width, comp->part[j].rect.height, comp->part[j].classification.confidence); } printf("total : %d in time %dms\n", seq->rnum, elapsed_time); ccv_array_free(seq); @@ -54,15 +123,15 @@ int main(int argc, char** argv) image = 0; ccv_read(file, &image, CCV_IO_GRAY | CCV_IO_ANY_FILE); assert(image != 0); - ccv_array_t* seq = ccv_dpm_detect_objects(image, &model, 1, ccv_dpm_default_params); + ccv_array_t* seq = ccv_dpm_detect_objects(image, models, num_models, ccv_dpm_default_params); if (seq != 0) { for (i = 0; i < seq->rnum; i++) { ccv_root_comp_t* comp = (ccv_root_comp_t*)ccv_array_get(seq, i); - printf("%s %d %d %d %d %f %d\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence, comp->pnum); + printf("%s %d %d %d %d %f %d\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence, comp->pnum); for (j = 0; j < comp->pnum; j++) - printf("| %d %d %d %d %f\n", comp->part[j].rect.x, comp->part[j].rect.y, comp->part[j].rect.width, comp->part[j].rect.height, comp->part[j].confidence); + printf("| %d %d %d %d %f\n", comp->part[j].rect.x, comp->part[j].rect.y, comp->part[j].rect.width, comp->part[j].rect.height, comp->part[j].classification.confidence); } ccv_array_free(seq); } @@ -73,6 +142,8 @@ int main(int argc, char** argv) } } ccv_drain_cache(); - ccv_dpm_mixture_model_free(model); + for (i = 0; i < num_models; i++) + ccv_dpm_mixture_model_free(models[i]); + ccfree(models); return 0; } diff --git a/bin/icfdetect.c b/bin/icfdetect.c index 2787ad64a..103c74183 100644 --- a/bin/icfdetect.c +++ b/bin/icfdetect.c @@ -25,7 +25,7 @@ int main(int argc, char** argv) for (i = 0; i < seq->rnum; i++) { ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence); } printf("total : %d in time %dms\n", seq->rnum, elapsed_time); ccv_array_free(seq); @@ -51,7 +51,7 @@ int main(int argc, char** argv) for (i = 0; i < seq->rnum; i++) { ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - printf("%s %d %d %d %d %f\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + printf("%s %d %d %d %d %f\n", file, comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence); } ccv_array_free(seq); ccv_matrix_free(image); diff --git a/bin/image-net.c b/bin/image-net.c index 83b6cc0e4..231f94a61 100644 --- a/bin/image-net.c +++ b/bin/image-net.c @@ -1,4 +1,5 @@ #include "ccv.h" +#include "ccv_internal.h" #include #include @@ -17,7 +18,7 @@ void exit_with_help(void) " --max-epoch : how many epoch are needed for stochastic gradient descent (an epoch corresponds to go through the full train list) [DEFAULT TO 100]\n" " --iterations : how many iterations are needed for stochastic gradient descent (an iteration corresponds to go through a mini batch) [DEFAULT TO 5000]\n\n" ); - exit(-1); + exit(0); } int main(int argc, char** argv) @@ -41,8 +42,11 @@ int main(int argc, char** argv) char* base_dir = 0; ccv_convnet_train_param_t train_params = { .max_epoch = 100, - .mini_batch = 256, - .iterations = 5000, + .mini_batch = 128, + .iterations = 20000, + 
.dual_device = 0, + .symmetric = 1, + .color_gain = 0.001, }; int i, c; while (getopt_long_only(argc, argv, "", image_net_options, &c) != -1) @@ -93,7 +97,8 @@ int main(int argc, char** argv) ccv_file_info_t file_info = { .filename = filename, }; - ccv_categorized_t categorized = ccv_categorized(c, 0, &file_info); + // ImageNet's category classes start from 1, so subtract 1 to get a 0-based index + ccv_categorized_t categorized = ccv_categorized(c - 1, 0, &file_info); ccv_array_push(categorizeds, &categorized); } fclose(r0); @@ -110,7 +115,8 @@ int main(int argc, char** argv) ccv_file_info_t file_info = { .filename = filename, }; - ccv_categorized_t categorized = ccv_categorized(c, 0, &file_info); + // ImageNet's category classes start from 1, so subtract 1 to get a 0-based index + ccv_categorized_t categorized = ccv_categorized(c - 1, 0, &file_info); ccv_array_push(tests, &categorized); } fclose(r1); @@ -126,51 +132,55 @@ int main(int argc, char** argv) .rows = 225, .cols = 225, .channels = 3, + .partition = 1, }, }, .output = { .convolutional = { .count = 96, - .strides = 4, + .strides = 2, .border = 1, - .rows = 11, - .cols = 11, + .rows = 7, + .cols = 7, .channels = 3, + .partition = 2, }, }, }, { - .type = CCV_CONVNET_MAX_POOL, + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, .input = { .matrix = { - .rows = 55, - .cols = 55, + .rows = 111, + .cols = 111, .channels = 96, + .partition = 2, }, }, .output = { - .pool = { - .strides = 2, - .size = 3, - .border = 0, + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, }, }, }, { - .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .type = CCV_CONVNET_MAX_POOL, .input = { .matrix = { - .rows = 27, - .cols = 27, + .rows = 111, + .cols = 111, .channels = 96, + .partition = 2, }, }, .output = { - .rnorm = { - .size = 5, - .kappa = 2, - .alpha = 1e-4, - .beta = 0.75, + .pool = { + .strides = 2, + .size = 3, + .border = 0, }, }, }, @@ -181,54 +191,58 @@ int main(int argc, char** argv) .sigma = 0.01, .input = { .matrix = { - .rows = 27, - .cols = 27, + .rows = 55, + .cols = 55, .channels = 96, + .partition = 2, }, }, .output = { .convolutional = { .count = 256, - .strides = 1, - .border = 2, + .strides = 2, + .border = 1, .rows = 5, .cols = 5, .channels = 96, + .partition = 2, }, }, }, { - .type = CCV_CONVNET_MAX_POOL, + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, .input = { .matrix = { .rows = 27, .cols = 27, .channels = 256, + .partition = 2, }, }, .output = { - .pool = { - .strides = 2, - .size = 3, - .border = 0, + .rnorm = { + .size = 5, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, }, }, }, { - .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .type = CCV_CONVNET_MAX_POOL, .input = { .matrix = { - .rows = 13, - .cols = 13, + .rows = 27, + .cols = 27, .channels = 256, + .partition = 2, }, }, .output = { - .rnorm = { - .size = 5, - .kappa = 2, - .alpha = 1e-4, - .beta = 0.75, + .pool = { + .strides = 2, + .size = 3, + .border = 0, }, }, }, @@ -242,6 +256,7 @@ int main(int argc, char** argv) .rows = 13, .cols = 13, .channels = 256, + .partition = 1, }, }, .output = { @@ -252,6 +267,7 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 256, + .partition = 2, }, }, }, @@ -265,6 +281,7 @@ int main(int argc, char** argv) .rows = 13, .cols = 13, .channels = 384, + .partition = 2, }, }, .output = { @@ -275,6 +292,7 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 384, + .partition = 2, }, }, }, @@ -288,6 +306,7 @@ int main(int argc, char** argv) .rows = 13, .cols = 13, .channels = 384, + .partition = 2, }, }, .output = { @@ -298,6 
+317,7 @@ int main(int argc, char** argv) .rows = 3, .cols = 3, .channels = 384, + .partition = 2, }, }, }, @@ -308,6 +328,7 @@ int main(int argc, char** argv) .rows = 13, .cols = 13, .channels = 256, + .partition = 2, }, }, .output = { @@ -328,6 +349,7 @@ int main(int argc, char** argv) .rows = 6, .cols = 6, .channels = 256, + .partition = 1, }, .node = { .count = 6 * 6 * 256, @@ -335,7 +357,8 @@ int main(int argc, char** argv) }, .output = { .full_connect = { - .count = 4096, + .relu = 1, + .count = 2048, }, }, }, @@ -346,17 +369,19 @@ int main(int argc, char** argv) .sigma = 0.01, .input = { .matrix = { - .rows = 4096, + .rows = 2048, .cols = 1, .channels = 1, + .partition = 1, }, .node = { - .count = 4096, + .count = 2048, }, }, .output = { .full_connect = { - .count = 4096, + .relu = 1, + .count = 2048, }, }, }, @@ -367,98 +392,39 @@ int main(int argc, char** argv) .sigma = 0.01, .input = { .matrix = { - .rows = 4096, + .rows = 2048, .cols = 1, .channels = 1, + .partition = 1, }, .node = { - .count = 4096, + .count = 2048, }, }, .output = { .full_connect = { + .relu = 0, .count = 1000, }, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(1, params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); + ccv_convnet_t* convnet = ccv_convnet_new(1, ccv_size(257, 257), params, sizeof(params) / sizeof(ccv_convnet_layer_param_t)); ccv_convnet_verify(convnet, 1000); ccv_convnet_layer_train_param_t layer_params[13]; memset(layer_params, 0, sizeof(layer_params)); for (i = 0; i < 13; i++) { layer_params[i].w.decay = 0.0005; - layer_params[i].w.learn_rate = 0.00000001; + layer_params[i].w.learn_rate = 0.01; layer_params[i].w.momentum = 0.9; layer_params[i].bias.decay = 0; - layer_params[i].bias.learn_rate = 0.00000001; + layer_params[i].bias.learn_rate = 0.01; layer_params[i].bias.momentum = 0.9; } layer_params[10].dor = 0.5; layer_params[11].dor = 0.5; train_params.layer_params = layer_params; - train_params.size = ccv_size(257, 257); - /* - ccv_size_t size = ccv_size(257, 257); - for (i = 0; i < categorizeds->rnum; i++) - { - ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); - ccv_dense_matrix_t* image = 0; - ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); - if (image) - { - ccv_dense_matrix_t* norm = 0; - if (image->rows > size.height && image->cols > size.width) - ccv_resample(image, &norm, 0, ccv_max(size.height, (int)(image->rows * (float)size.height / image->cols + 0.5)), ccv_max(size.width, (int)(image->cols * (float)size.width / image->rows + 0.5)), CCV_INTER_AREA); - else if (image->rows < size.height || image->cols < size.width) - ccv_resample(image, &norm, 0, ccv_max(size.height, (int)(image->rows * (float)size.height / image->cols + 0.5)), ccv_max(size.width, (int)(image->cols * (float)size.width / image->rows + 0.5)), CCV_INTER_CUBIC); - else - norm = image; - if (norm != image) - ccv_matrix_free(image); - char filename[1024]; - snprintf(filename, 1024, "%s.resize.png", categorized->file.filename); - ccv_write(norm, filename, 0, CCV_IO_PNG_FILE, 0); - ccv_dense_matrix_t* patch = 0; - int x = (norm->cols - size.width) / 2; - int y = (norm->rows - size.height) / 2; - ccv_slice(norm, (ccv_matrix_t**)&patch, CCV_64F, y, x, size.width, size.height); - ccv_matrix_free(norm); - ccv_matrix_free(patch); - FLUSH("done %s, %d / %d", filename, i + 1, categorizeds->rnum); - } else { - printf("\ncannot handle %s\n", categorized->file.filename); - } - } - printf("\n"); - for (i = 0; i < tests->rnum; i++) - { - 
ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(tests, i); - ccv_dense_matrix_t* image = 0; - ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); - if (image) - { - ccv_dense_matrix_t* norm = 0; - if (image->rows > size.height && image->cols > size.width) - ccv_resample(image, &norm, 0, ccv_max(size.height, (int)(image->rows * (float)size.height / image->cols + 0.5)), ccv_max(size.width, (int)(image->cols * (float)size.width / image->rows + 0.5)), CCV_INTER_AREA); - else if (image->rows < size.height || image->cols < size.width) - ccv_resample(image, &norm, 0, ccv_max(size.height, (int)(image->rows * (float)size.height / image->cols + 0.5)), ccv_max(size.width, (int)(image->cols * (float)size.width / image->rows + 0.5)), CCV_INTER_CUBIC); - else - norm = image; - if (norm != image) - ccv_matrix_free(image); - char filename[1024]; - snprintf(filename, 1024, "%s.resize.png", categorized->file.filename); - ccv_write(norm, filename, 0, CCV_IO_PNG_FILE, 0); - ccv_matrix_free(norm); - FLUSH("done %s, %d / %d", filename, i + 1, tests->rnum); - } else { - printf("\ncannot handle %s\n", categorized->file.filename); - } - } - printf("\n"); - */ ccv_convnet_supervised_train(convnet, categorizeds, tests, working_dir, train_params); ccv_convnet_free(convnet); ccv_disable_cache(); diff --git a/bin/makefile b/bin/makefile index b4fd7427a..f25af1128 100644 --- a/bin/makefile +++ b/bin/makefile @@ -3,18 +3,14 @@ include ../lib/config.mk #CC += -faddress-sanitizer -fno-omit-frame-pointer LDFLAGS := -L"../lib" -lccv $(LDFLAGS) CFLAGS := -O3 -Wall -I"../lib" $(CFLAGS) -NVFLAGS := -O3 -I"../lib" $(NVFLAGS) -TARGETS = bbffmt msermatch siftmatch bbfcreate bbfdetect swtcreate swtdetect dpmcreate dpmdetect convert tld icfcreate icfdetect icfoptimize cifar-10 image-net +TARGETS = bbffmt msermatch siftmatch bbfcreate bbfdetect swtcreate swtdetect dpmcreate dpmdetect convert tld icfcreate icfdetect icfoptimize cifar-10 image-net cnnclassify -all: libccv.a $(TARGETS) cwc-bench +all: libccv.a $(TARGETS) clean: ${MAKE} clean -C ../lib ; rm -f *.o $(TARGETS) -cwc-bench: %: %.o cwc-bench-runtime.o libccv.a - $(CC) -o $@ cwc-bench-runtime.o $< $(LDFLAGS) - $(TARGETS): %: %.o libccv.a $(CC) -o $@ $< $(LDFLAGS) @@ -23,6 +19,3 @@ libccv.a: %.o: %.c ../lib/ccv.h $(CC) $< -o $@ -c $(CFLAGS) - -%.o: %.cu ../lib/ccv.h ../lib/cuda/*.h ../lib/cuda/*.cu - $(NVCC) $< -o $@ -c $(NVFLAGS) diff --git a/bin/tld.c b/bin/tld.c index 698b2f301..23921f981 100644 --- a/bin/tld.c +++ b/bin/tld.c @@ -119,12 +119,12 @@ int main(int argc, char** argv) ccv_write(image, filename, 0, CCV_IO_PNG_FILE, 0); ccv_matrix_free(image); if (tld->found) - printf("%d,%d,%d,%d,%f\n", newbox.rect.x, newbox.rect.y, newbox.rect.width + newbox.rect.x - 1, newbox.rect.height + newbox.rect.y - 1, newbox.confidence); + printf("%d,%d,%d,%d,%f\n", newbox.rect.x, newbox.rect.y, newbox.rect.width + newbox.rect.x - 1, newbox.rect.height + newbox.rect.y - 1, newbox.classification.confidence); else printf("NaN,NaN,NaN,NaN,NaN\n"); */ if (tld->found) - printf("%05d: %d %d %d %d %f\n", tld->count, newbox.rect.x, newbox.rect.y, newbox.rect.width, newbox.rect.height, newbox.confidence); + printf("%05d: %d %d %d %d %f\n", tld->count, newbox.rect.x, newbox.rect.y, newbox.rect.width, newbox.rect.height, newbox.classification.confidence); else printf("%05d: --------------\n", tld->count); x = y; diff --git a/doc/convnet.md b/doc/convnet.md new file mode 100644 index 000000000..ed294cc41 --- /dev/null +++ b/doc/convnet.md @@ -0,0 
+1,279 @@
+ConvNet: Deep Convolutional Networks
+====================================
+
+What's ConvNet?
+---------------
+
+A convolutional neural network is a specific artificial neural network topology that
+is inspired by the biological visual cortex and was tailored for computer vision tasks
+by Yann LeCun in the early 1990s. See http://deeplearning.net/tutorial/lenet.html for
+an introduction.
+
+The convolutional neural network implemented in ccv is based on Alex Krizhevsky's
+ground-breaking work presented in:
+
+ImageNet Classification with Deep Convolutional Neural Networks, Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton, NIPS 2012
+
+The parameters are modified based on Matthew D. Zeiler's work presented in:
+
+Visualizing and Understanding Convolutional Networks, Matthew D. Zeiler, and Rob Fergus, arXiv:1311.2901 (Nov 2013)
+
+The multi-GPU implementation was heavily influenced by:
+
+One Weird Trick for Parallelizing Convolutional Neural Networks, Alex Krizhevsky, ICLR 2014
+
+How does it work?
+-----------------
+
+Long story short, with advances in GPGPU programming, we can have very large neural networks
+(with over 50 million parameters) trained on millions of images. It turns out that once you
+have both, plus a bag of tricks (dropout, pooling etc.), the resulting neural networks can
+achieve good image classification results.
+
+    ./cnnclassify ../samples/dex.png ../samples/image-net-2010.sqlite3 | ./cnndraw.rb ../samples/image-net-2010.words ../samples/dex.png output.png
+
+Check output.png: the neural network suggests a few possible relevant classes in the top
+left chart.
+
+What about the performance?
+---------------------------
+
+ConvNet at this very large scale is not extremely fast. There are a few ConvNet
+implementations that focus on speed, such as [Caffe from Berkeley](http://caffe.berkeleyvision.org/)
+or [OverFeat from NYU](http://cilvr.nyu.edu/doku.php?id=software:overfeat:start). Although not
+explicitly optimized for speed (ccv chooses correctness over speed in this preliminary implementation),
+the ConvNet implementation in ccv is, speed-wise, in line with other implementations.
+
+The performance analysis is therefore done on the ImageNet dataset, with the network topology
+following the exact specification detailed in the paper.
+
+Accuracy-wise:
+
+The test is performed on the ILSVRC 2010 test dataset and the ILSVRC 2012 validation dataset.
+
+For the ILSVRC 2010 dataset, training stopped improving at around 60 epochs; at that point, the
+central patch from the test set obtained a 36.56% top-1 missing rate (lower is better) and the
+training set obtained a 32.2% top-1 missing rate. In Alex's paper, they reported a 37.5% top-1
+missing rate when averaging 10 patches, and a 39% top-1 missing rate when using the central
+patch of the test set.
+
+For the ILSVRC 2012 dataset, training stopped improving at around 70 epochs; at that point, the
+central patch from the validation set obtained a 41.4% top-1 missing rate (lower is better) and
+the training set obtained a 37.8% top-1 missing rate. In Alex's paper, they reported a 40.5%
+top-1 missing rate when averaging 10 patches. In Matt's paper, they reported a 38.4% top-1
+missing rate when using 1 convnet as configured in Fig.3 and averaging 10 patches.
+
+Assuming you have the ILSVRC 2012 validation set files ordered in image-net-2012-val.txt, run
+
+    ./cnnclassify image-net-2012-val.txt ../samples/image-net-2012.sqlite3 > image-net-2012-classify.txt
+
+For the complete validation set, this command takes half an hour on a GPU; if you don't have a
+GPU enabled, it will take about half a day to run on a CPU.
+
+Assuming you have the ILSVRC 2012 validation ground truth data in LSVRC2012_val_ground_truth.txt,
+
+    ./cnnvldtr.rb LSVRC2012_val_ground_truth.txt image-net-2012-classify.txt
+
+will report the top-1 missing rate as well as the top-5 missing rate.
+
+For the 32-bit floating-point image-net-2012.sqlite3 on GPU, the top-1 missing rate is 38.17%, 2.33%
+better than Alex's result with 1 convnet, and 0.23% better than Matt's result with 1 convnet configured
+as in Fig.3. The top-5 missing rate is 16.22%, 1.98% better than Alex's and 0.28% better than Matt's.
+For the half-precision image-net-2012.sqlite3 (the one included in ./samples/), the top-1 missing rate
+is 38.18% and the top-5 missing rate is 16.17%.
+
+See http://www.image-net.org/challenges/LSVRC/2013/results.php#cls for the current state of the art;
+ccv's implementation is still about 5% behind Clarifai (Matt's commercial implementation, later claimed
+to be 10.7%: http://www.clarifai.com/) and 2% behind OverFeat on the top-5 missing rate.
+
+For the 32-bit floating-point image-net-2012.sqlite3 on CPU, the top-1 missing rate is 38.51%, and the
+top-5 missing rate is 16.57%.
+
+For the 32-bit floating-point image-net-2010.sqlite3 on GPU, the top-1 missing rate is 33.91%, and the
+top-5 missing rate is 14.08%.
+
+You can download the 32-bit floating-point versions with ./samples/download-image-net.sh
+
+Speed-wise:
+
+The experiment was conducted on a computer with a Core i7 3770, an NVIDIA TITAN graphics card at stock
+frequency, and a Samsung MZ-7TE500BW 500GiB SSD, using clang, libdispatch, libatlas and the GNU
+Scientific Library.
+
+The CPU version of the forward pass (from RGB image input to the classification result) takes about
+700ms per image. This is achieved with multi-threaded convolutional kernel computation. Decaf (the
+CPU counterpart of Caffe) reported a forward pass of around 0.5s per image on unspecified hardware over
+10 patches (the same as ccv's cnnclassify implementation). I cannot get a sensible number out of OverFeat
+on my machine (it reports about 1.4s for the forward pass, which makes little sense). Their reported
+number is 1s per image with an unspecified configuration on unspecified hardware (I suspect that their
+unspecified configuration does much more than the 10-patch averaging that ccv or Decaf does).
+
+For AlexNet 12, the GPU version does a forward pass + backward error propagation for a batch size of 128
+in about 0.664s. Thus, training the ImageNet convolutional network takes about 186 hours for 100 epochs.
+Caffe reported their forward pass + backward error propagation for a batch size of 256 in about 1.3s
+on an NVIDIA TITAN. In the paper, Alex reported 90 epochs within 6 days on two GeForce 580s. In
+"Multi-GPU Training of ConvNets" (Omry Yadan, Keith Adams, Yaniv Taigman, and Marc'Aurelio Ranzato,
+arXiv:1312.5853), Omry mentioned that they did 100 epochs of AlexNet in 10.5 days on 1 GPU, which
+suggests my time is in line with these implementations.
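+
+To put the 186-hour figure in perspective, a rough back-of-the-envelope check (assuming the
+ILSVRC 2012 training set of roughly 1.28 million images; these are estimates, not measurements):
+
+    1,280,000 images / 128 images per mini-batch ~ 10,000 mini-batches per epoch
+    10,000 mini-batches x 0.664s per mini-batch ~ 6,640s ~ 1.84 hours per epoch
+    1.84 hours x 100 epochs ~ 184 hours
+
+which lands within a few hours of the quoted 186.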
+
+For MattNet, the GPU version does a forward pass + backward error propagation for a batch size of 128
+in about 0.845s.
+
+For AlexNet 14 (One Weird Trick), the reported time on one GPU with 90 epochs is 98.05 hours. ccv's
+implementation of AlexNet 14 does a forward pass + backward error propagation for a batch size of 128
+in about 0.55s; thus, 90 epochs will take 137.6 hours.
+
+As a preliminary implementation, I haven't spent much time optimizing these operations in ccv, if any
+at all. For example, [cuda-convnet](http://code.google.com/p/cuda-convnet/) implements its
+functionality in about 10,000 lines of code and Caffe in about 14,000 lines; as of this release, ccv
+does it in about 4,300 lines of code. Going forward, the low-hanging optimization opportunities
+include using SIMD instructions, doing FFT in densely convolved layers, etc.
+
+How to train my own image classifier?
+-------------------------------------
+
+First, you need to figure out your network topology. For all intents and purposes, I will walk you
+through how to train with the ImageNet LSVRC 2010 data.
+
+You need three things: the actual ImageNet dataset (and metadata), a CUDA GPU with no less than 6GiB
+of on-board memory, and a sufficiently large SSD to hold the ImageNet dataset (otherwise loading data
+from your rotational disk will take more time than the actual computation).
+
+I downloaded the ImageNet dataset from this torrent:
+
+Assuming you've downloaded / bought all these and installed them on your computer, get a hot cup of
+tea; it will take a while to get all the pieces in place before the training starts.
+
+The ImageNet metadata for the 2010 challenge can be downloaded from
+http://www.image-net.org/challenges/LSVRC/2010/download-public
+
+Unfortunately, the metadata is stored in Matlab's proprietary format, so there is some conversion work
+to be done. Here I will demonstrate how to use Octave to do this. Installing Octave on a Linux-like
+system is easy; for me on Ubuntu, it is one line:
+
+    sudo apt-get install octave
+
+Assuming you've downloaded devkit-1.0 from the above link and found the meta.mat file somewhere in
+that tarball, launch the Octave interactive environment and run:
+
+    file = fopen('meta.txt', 'w+')
+    for i = 1:1000
+        fprintf(file, "%d %s %d\n", synsets(i).ILSVRC2010_ID, synsets(i).WNID, synsets(i).num_train_images)
+    endfor
+    fclose(file)
+
+The newly created meta.txt file will give us the class id, the WordNet id, and the number of training
+images available for each class.
+
+The ImageNet data downloaded from the torrent puts the training images into directories named by the
+WordNet ids.
+
+    find /train/ -name "*.JPEG" > train-file.txt
+
+I use this script to generate the format that ccv understands: https://gist.github.com/liuliu/8393461
+
+The test dataset is ordered numerically, thus,
+
+    find /test/ -name "*.JPEG" > test-file.txt
+
+will generate a file list corresponding to ILSVRC2010_test_ground_truth.txt for class ids.
+
+This script: https://gist.github.com/liuliu/8393516 will generate the plain text that ccv understands
+for tests.
+
+These images first need to be pre-processed to the correct size for training.
+
+I partially replaced ./bin/image-net.c with this snippet: https://gist.github.com/liuliu/8906523 to
+generate files suffixed with ".resize.png". Compile and run:
+
+    ./image-net --train-list ~/Fast/imageNet/train-file.txt --test-list ~/Fast/imageNet/test-file.txt --base-dir ~/Fast/imageNet --working-dir image-net.sqlite3
+
+The resize will take about 3 hours, and after that, train.txt and test.txt are generated from
+train-file.txt and test-file.txt by suffixing .resize.png on every line.
+
+Now, everything is ready. Assuming you have a TITAN GPU as I do, it takes 9 days. Following Alex's
+procedure, the learn_rate is decreased three times; for the specific image-net.sqlite3 you see in
+./samples, I started with a learn_rate of 0.01, decreased it to 0.001 at the 30th epoch, to 0.0001
+at the 60th epoch, and then to 0.00001 at the 80th epoch.
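+
+For reference, the learn_rate lives in the per-layer training parameters; this is how ./bin/image-net.c
+in this release configures them (the mid-training decreases are, presumably, applied by editing this
+value and resuming from the saved working directory state):
+
+    ccv_convnet_layer_train_param_t layer_params[13];
+    memset(layer_params, 0, sizeof(layer_params));
+    int i;
+    for (i = 0; i < 13; i++)
+    {
+        layer_params[i].w.decay = 0.0005;
+        layer_params[i].w.learn_rate = 0.01; // the starting learn_rate discussed above
+        layer_params[i].w.momentum = 0.9;
+        layer_params[i].bias.decay = 0;
+        layer_params[i].bias.learn_rate = 0.01;
+        layer_params[i].bias.momentum = 0.9;
+    }
+    layer_params[10].dor = 0.5; // dropout on the two 2048-way full connect layers
+    layer_params[11].dor = 0.5;
+    train_params.layer_params = layer_params;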
+
+The generated image-net.sqlite3 file is about 600MiB in size, because it contains data needed for
+training and resuming. You can open this file with the sqlite3 command-line tool (it is a vanilla
+sqlite database file) and do:
+
+    drop table function_state;
+    drop table momentum_data;
+    vacuum;
+
+The file size will shrink to about 200MiB. You can achieve a further reduction in file size by
+rewriting it into half precision, with ccv_convnet_write and write_param.half_precision = 1. The
+resulting image-net.sqlite3 is exactly what I included in ./samples.
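+
+A minimal sketch of the half-precision rewrite, using ccv_convnet_read and ccv_convnet_write as
+declared in lib/ccv.h (the file names are placeholders):
+
+    #include "ccv.h"
+
+    int main(void)
+    {
+        // read the 32-bit floating-point model back in (0: no CUDA acceleration required)
+        ccv_convnet_t* convnet = ccv_convnet_read(0, "image-net.sqlite3");
+        ccv_convnet_write_param_t write_param = {
+            .half_precision = 1, // store coefficients as 16-bit floats
+        };
+        ccv_convnet_write(convnet, "image-net-half.sqlite3", write_param);
+        ccv_convnet_free(convnet);
+        return 0;
+    }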
+
+Can I use the ImageNet pre-trained data model?
+----------------------------------------------
+
+ccv is released under the FreeBSD 3-clause license, and the pre-trained data models ./samples/image-net-2010.sqlite3
+and ./samples/image-net-2012.sqlite3 are released under the Creative Commons Attribution 4.0 International License.
+You can use them and modify them practically anywhere and anyhow, with proper attribution. As far as I can tell,
+these are the first pre-trained data models released under a commercial-friendly license (Caffe itself is released
+under the FreeBSD license but its pre-trained data model is "research only", and OverFeat is released under a
+custom research-only license).
+
+Differences between ccv's implementation, Caffe's, Alex's and Matt's
+--------------------------------------------------------------------
+
+Although the network topology of ccv's implementation follows Matt's closely, the reported results
+diverged significantly enough for me to document the differences in implementation details.
+
+Network Topology:
+
+ccv's local response normalization layer follows the convolutional layer, and the pooling layer comes
+after the local response normalization. This is briefly mentioned in Alex's paper, but in Caffe, the
+local response normalization layer follows the pooling layer.
+
+The input dimension to ccv's implemented network is 225x225; in Caffe, it is 227x227. Alex's paper as
+well as Matt's mentioned that their input size is 224x224. 225x225 implies a 1-pixel padding around
+the input image such that, with a 7x7 filter and a stride of 2, a 111x111 output will be generated.
+However, the output of the first convolutional layer in Matt's paper is 110x110.
+
+Data Preparation:
+
+Caffe's implementation resizes the image to 256x256 without retaining the aspect ratio. Alex's
+implementation resizes the image such that the minimal dimension is 256 while retaining the aspect
+ratio (at least as the paper implied) and crops the image to 256x256. ccv's implementation resizes
+the image such that the minimal dimension is 257 while retaining the aspect ratio (downsampling with
+CCV_INTER_AREA interpolation and upsampling with CCV_INTER_CUBIC interpolation if needed). ccv's
+implementation obtains the mean image from center-cropped 257x257 images.
+
+Data Augmentation:
+
+Caffe's implementation randomly crops the image from 256x256 to 227x227. Alex's implementation
+randomly crops the image from 256x256 to 224x224 and then applies color augmentation with a Gaussian
+random coefficient sampled with sigma == 0.1. ccv's implementation randomly crops the image from the
+aspect-retained sizes to 257x257, subtracts the mean image and then randomly crops it to 225x225;
+color augmentation is applied with a Gaussian random coefficient sampled with sigma == 0.001. All
+three implementations use horizontal mirroring as a data augmentation technique.
+
+Averaged Classification:
+
+Caffe averages the softmax output of 10 patches from the test image by first resizing the image to
+256x256 without retaining the aspect ratio; the first 5 patches of size 227x227 are cropped from the
+top left, top right, center, bottom left and bottom right of the resized test image, and the second
+5 patches are the horizontal mirrors of the first 5 patches.
+
+Alex's implementation averages the softmax output of 10 patches from the test image by first resizing
+the image such that the minimal dimension is 256 while retaining the aspect ratio, and then
+center-cropping to 256x256. The 10 patches of size 224x224 are sampled from the 256x256 crop the same
+way as Caffe does.
+
+ccv's GPU implementation averages the softmax output of 30 patches from the test image by first
+resizing the image such that the minimal dimension is 257. Then it makes 3 crops from the top left,
+center, and bottom right so that each cropped image is 257x257. The cropped images subtract the mean
+image, and each is then cropped from the top left, top right, center, bottom left and bottom right
+into 225x225. This generates 15 patches, and each one of them has its horizontally-mirrored
+counterpart.
+
+ccv's CPU implementation, for efficiency considerations, averages the softmax output of 10 patches
+from the test image by first resizing the image such that the minimal dimension is 257. The mean image
+is upsampled to the same size with CCV_INTER_CUBIC and then subtracted from the resized image. The top
+left, top right, center, bottom left and bottom right patches of 225x225 are extracted and
+horizontally mirrored to generate the 10 patches.
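+
+Putting the pieces together, below is a minimal classification sketch against the API declared in
+lib/ccv.h. It mirrors what ./bin/cnnclassify does; the assumption that the returned rank array holds
+ccv_classification_t entries follows the new ccv_comp_t layout, and the model path is a placeholder:
+
+    #include "ccv.h"
+    #include <stdio.h>
+
+    int main(int argc, char** argv)
+    {
+        if (argc < 2)
+            return -1;
+        ccv_enable_default_cache();
+        // 0: use the CPU code path; pass 1 for the CUDA-accelerated one
+        ccv_convnet_t* convnet = ccv_convnet_read(0, "../samples/image-net-2012.sqlite3");
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(argv[1], &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
+        ccv_dense_matrix_t* input = 0;
+        // resize the image so it matches the convnet's expected input dimensions
+        ccv_convnet_input_formation(convnet, image, &input);
+        ccv_matrix_free(image);
+        ccv_array_t* rank = 0;
+        // symmetric = 1 averages horizontally-mirrored patches; report the top 5 classes
+        ccv_convnet_classify(convnet, &input, 1, &rank, 5, 1);
+        int i;
+        for (i = 0; i < rank->rnum; i++)
+        {
+            ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(rank, i);
+            printf("%d %f\n", classification->id + 1, classification->confidence);
+        }
+        ccv_array_free(rank);
+        ccv_matrix_free(input);
+        ccv_convnet_free(convnet);
+        ccv_disable_cache();
+        return 0;
+    }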
+ diff --git a/lib/ccv.h b/lib/ccv.h index 73742a98a..e0720da54 100644 --- a/lib/ccv.h +++ b/lib/ccv.h @@ -14,9 +14,6 @@ #include #include #include -#ifdef HAVE_SSE2 -#include -#endif #include #include @@ -524,7 +521,7 @@ void ccv_contour_free(ccv_contour_t* contour); void ccv_invert(ccv_matrix_t* a, ccv_matrix_t** b, int type); void ccv_solve(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type); -void ccv_eigen(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type); +void ccv_eigen(ccv_dense_matrix_t* a, ccv_dense_matrix_t** vector, ccv_dense_matrix_t** lambda, int type, double epsilon); typedef struct { double interp; @@ -740,17 +737,20 @@ typedef struct { #define CCV_DPM_PART_MAX (10) typedef struct { - ccv_rect_t rect; - int neighbors; int id; float confidence; +} ccv_classification_t; + +typedef struct { + ccv_rect_t rect; + int neighbors; + ccv_classification_t classification; } ccv_comp_t; typedef struct { ccv_rect_t rect; int neighbors; - int id; - float confidence; + ccv_classification_t classification; int pnum; ccv_comp_t part[CCV_DPM_PART_MAX]; } ccv_root_comp_t; @@ -1093,10 +1093,11 @@ typedef union { int strides; // padding for input int border; - // rows, cols, channels for the kernel + // rows, cols, channels and partition for the kernel int rows; int cols; int channels; + int partition; } convolutional; struct { // strides @@ -1115,6 +1116,7 @@ typedef union { float beta; } rnorm; struct { + int relu; // apply relu or not int count; } full_connect; } ccv_convnet_type_t; @@ -1124,6 +1126,7 @@ typedef struct { int rows; int cols; int channels; + int partition; } matrix; struct { int count; @@ -1151,16 +1154,17 @@ typedef struct { typedef struct { int use_cwc_accel; // use "ccv with cuda" acceleration // this is redundant, but good to enforcing what the input should look like + ccv_size_t input; int rows; int cols; int channels; // count and layer of the convnet int count; + ccv_dense_matrix_t* mean_activity; // mean activity to subtract from ccv_convnet_layer_t* layers; // the layer configuration // these can be reused and we don't need to reallocate memory ccv_dense_matrix_t** denoms; // denominators ccv_dense_matrix_t** acts; // hidden layers and output layers - ccv_dense_matrix_t** dors; // the dropout for hidden layers void* reserved; } ccv_convnet_t; @@ -1184,7 +1188,9 @@ typedef struct { int max_epoch; int mini_batch; int iterations; - ccv_size_t size; + int symmetric; + int dual_device; // for now, ccv's implementation only support up to 2 GPUs + float color_gain; // the gaussian value for color variations ccv_convnet_layer_train_param_t* layer_params; } ccv_convnet_train_param_t; @@ -1218,11 +1224,12 @@ typedef struct { int half_precision; } ccv_convnet_write_param_t; -ccv_convnet_t* __attribute__((warn_unused_result)) ccv_convnet_new(int use_cwc_accel, ccv_convnet_layer_param_t params[], int count); +ccv_convnet_t* __attribute__((warn_unused_result)) ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count); int ccv_convnet_verify(ccv_convnet_t* convnet, int output); void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params); void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch); -void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch); +void ccv_convnet_input_formation(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, 
ccv_dense_matrix_t** b); +void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch); ccv_convnet_t* __attribute__((warn_unused_result)) ccv_convnet_read(int use_cwc_accel, const char* filename); void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params); void ccv_convnet_compact(ccv_convnet_t* convnet); // remove unused resources diff --git a/lib/ccv_algebra.c b/lib/ccv_algebra.c index 6d0b58136..ab21a60f2 100644 --- a/lib/ccv_algebra.c +++ b/lib/ccv_algebra.c @@ -23,6 +23,7 @@ double ccv_normalize(ccv_matrix_t* a, ccv_matrix_t** b, int btype, int flag) ccv_declare_derived_signature(sig, da->sig != 0, ccv_sign_with_format(20, "ccv_normalize(%d)", flag), da->sig, CCV_EOF_SIGN); btype = (btype == 0) ? CCV_GET_DATA_TYPE(da->type) | CCV_C1 : CCV_GET_DATA_TYPE(btype) | CCV_C1; ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, da->rows, da->cols, CCV_ALL_DATA_TYPE | CCV_C1, btype, sig); + assert(db); ccv_object_return_if_cached(db->tag.f64, db); double sum = 0, inv; int i, j; @@ -146,7 +147,7 @@ double ccv_sum(ccv_matrix_t* mat, int flag) for (i = 0; i < dmt->rows; i++) \ { \ for (j = 0; j < dmt->cols * ch; j++) \ - sum += fabs(_for_get(m_ptr, j, 0)); \ + sum += fabs((double)(_for_get(m_ptr, j, 0))); \ m_ptr += dmt->step; \ } \ break; \ @@ -218,7 +219,7 @@ void ccv_add(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** c, int type) { ccv_dense_matrix_t* da = ccv_get_dense_matrix(a); ccv_dense_matrix_t* db = ccv_get_dense_matrix(b); - assert(da->rows == db->rows && da->cols == db->cols && CCV_GET_DATA_TYPE(da->type) == CCV_GET_DATA_TYPE(db->type) && CCV_GET_CHANNEL(da->type) == CCV_GET_CHANNEL(db->type)); + assert(da->rows == db->rows && da->cols == db->cols && CCV_GET_CHANNEL(da->type) == CCV_GET_CHANNEL(db->type)); ccv_declare_derived_signature(sig, da->sig != 0 && db->sig != 0, ccv_sign_with_literal("ccv_add"), da->sig, db->sig, CCV_EOF_SIGN); int no_8u_type = (da->type & CCV_8U) ? CCV_32S : da->type; type = (type == 0) ? CCV_GET_DATA_TYPE(no_8u_type) | CCV_GET_CHANNEL(da->type) : CCV_GET_DATA_TYPE(type) | CCV_GET_CHANNEL(da->type); @@ -228,16 +229,16 @@ void ccv_add(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** c, int type) unsigned char* aptr = da->data.u8; unsigned char* bptr = db->data.u8; unsigned char* cptr = dc->data.u8; -#define for_block(_for_get, _for_set) \ +#define for_block(_for_get_a, _for_get_b, _for_set) \ for (i = 0; i < da->rows; i++) \ { \ for (j = 0; j < da->cols * ch; j++) \ - _for_set(cptr, j, _for_get(aptr, j, 0) + _for_get(bptr, j, 0), 0); \ + _for_set(cptr, j, _for_get_a(aptr, j, 0) + _for_get_b(bptr, j, 0), 0); \ aptr += da->step; \ bptr += db->step; \ cptr += dc->step; \ } - ccv_matrix_getter(da->type, ccv_matrix_setter, dc->type, for_block); + ccv_matrix_getter_a(da->type, ccv_matrix_getter_b, db->type, ccv_matrix_setter, dc->type, for_block); #undef for_block } @@ -299,5 +300,7 @@ void ccv_gemm(ccv_matrix_t* a, ccv_matrix_t* b, double alpha, ccv_matrix_t* c, d cblas_dgemm(CblasRowMajor, (transpose & CCV_A_TRANSPOSE) ? CblasTrans : CblasNoTrans, (transpose & CCV_B_TRANSPOSE) ? CblasTrans : CblasNoTrans, dd->rows, dd->cols, (transpose & CCV_A_TRANSPOSE) ? da->rows : da->cols, alpha, da->data.f64, da->cols, db->data.f64, db->cols, beta, dd->data.f64, dd->cols); break; } +#else + assert(0 && "You need a BLAS compatible library for this function, e.g. 
libatlas."); #endif } diff --git a/lib/ccv_basic.c b/lib/ccv_basic.c index e2b6815d5..b4a6fe439 100644 --- a/lib/ccv_basic.c +++ b/lib/ccv_basic.c @@ -1,5 +1,10 @@ #include "ccv.h" #include "ccv_internal.h" +#if defined(HAVE_SSE2) +#include +#elif defined(HAVE_NEON) +#include +#endif /* sobel filter is fundamental to many other high-level algorithms, * here includes 2 special case impl (for 1x3/3x1, 3x3) and one general impl */ @@ -272,6 +277,7 @@ void ccv_gradient(ccv_dense_matrix_t* a, ccv_dense_matrix_t** theta, int ttype, int ch = CCV_GET_CHANNEL(a->type); ccv_dense_matrix_t* dtheta = *theta = ccv_dense_matrix_renew(*theta, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, tsig); ccv_dense_matrix_t* dm = *m = ccv_dense_matrix_renew(*m, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, msig); + assert(dtheta && dm); ccv_object_return_if_cached(, dtheta, dm); ccv_revive_object_if_cached(dtheta, dm); ccv_dense_matrix_t* tx = 0; @@ -283,7 +289,7 @@ void ccv_gradient(ccv_dense_matrix_t* a, ccv_dense_matrix_t** theta, int ttype, ccv_matrix_free(ty); } -void _ccv_flip_y_self(ccv_dense_matrix_t* a) +static void _ccv_flip_y_self(ccv_dense_matrix_t* a) { int i; unsigned char* buffer = (unsigned char*)alloca(a->step); @@ -299,7 +305,7 @@ void _ccv_flip_y_self(ccv_dense_matrix_t* a) } } -void _ccv_flip_x_self(ccv_dense_matrix_t* a) +static void _ccv_flip_x_self(ccv_dense_matrix_t* a) { int i, j; int len = CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type); @@ -339,7 +345,8 @@ void ccv_flip(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int btype, int type btype = CCV_GET_DATA_TYPE(a->type) | CCV_GET_CHANNEL(a->type); *b = db = ccv_dense_matrix_renew(*b, a->rows, a->cols, btype, btype, sig); ccv_object_return_if_cached(, db); - memcpy(db->data.u8, a->data.u8, a->rows * a->step); + if (a->data.u8 != db->data.u8) + memcpy(db->data.u8, a->data.u8, a->rows * a->step); } if (type & CCV_FLIP_Y) _ccv_flip_y_self(db); @@ -355,10 +362,12 @@ void ccv_blur(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, double si ccv_object_return_if_cached(, db); int fsz = ccv_max(1, (int)(4.0 * sigma + 1.0 - 1e-8)) * 2 + 1; int hfz = fsz / 2; - unsigned char* buf = (unsigned char*)alloca(sizeof(double) * ccv_max(fsz + a->rows, (fsz + a->cols) * CCV_GET_CHANNEL(a->type))); + assert(hfz > 0); + unsigned char* buf = (unsigned char*)alloca(sizeof(double) * ccv_max(hfz * 2 + a->rows, (hfz * 2 + a->cols) * CCV_GET_CHANNEL(a->type))); unsigned char* filter = (unsigned char*)alloca(sizeof(double) * fsz); double tw = 0; int i, j, k, ch = CCV_GET_CHANNEL(a->type); + assert(fsz > 0); for (i = 0; i < fsz; i++) tw += ((double*)filter)[i] = exp(-((i - hfz) * (i - hfz)) / (2.0 * sigma * sigma)); int no_8u_type = (db->type & CCV_8U) ? 
CCV_32S : db->type; @@ -370,11 +379,12 @@ void ccv_blur(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, double si } else { tw = 1.0 / tw; for (i = 0; i < fsz; i++) - ccv_set_value(db->type, filter, i, ((double*)filter)[i] * tw, 0); + ccv_set_value(no_8u_type, filter, i, ((double*)filter)[i] * tw, 0); } /* horizontal */ unsigned char* a_ptr = a->data.u8; unsigned char* b_ptr = db->data.u8; + assert(ch > 0); #define for_block(_for_type, _for_set_b, _for_get_b, _for_set_a, _for_get_a) \ for (i = 0; i < a->rows; i++) \ { \ diff --git a/lib/ccv_bbf.c b/lib/ccv_bbf.c index 1725aab5e..132d5e1f5 100644 --- a/lib/ccv_bbf.c +++ b/lib/ccv_bbf.c @@ -156,7 +156,7 @@ static int _ccv_prune_positive_data(ccv_bbf_classifier_cascade_t* cascade, unsig static int _ccv_prepare_background_data(ccv_bbf_classifier_cascade_t* cascade, char** bgfiles, int bgnum, unsigned char** negdata, int negnum) { int t, i, j, k, q; - int negperbg = negnum / bgnum + 1; + int negperbg; int negtotal = 0; int steps[] = { _ccv_width_padding(cascade->size.width), _ccv_width_padding(cascade->size.width >> 1), @@ -418,6 +418,7 @@ static ccv_bbf_feature_t _ccv_bbf_genetic_optimize(unsigned char** posdata, int gsl_rng_set(rng, dbli.li); int i, j; int pnum = ftnum * 100; + assert(pnum > 0); ccv_bbf_gene_t* gene = (ccv_bbf_gene_t*)ccmalloc(pnum * sizeof(ccv_bbf_gene_t)); int rows[] = { size.height, size.height >> 1, size.height >> 2 }; int cols[] = { size.width, size.width >> 1, size.width >> 2 }; @@ -1178,7 +1179,7 @@ static int _ccv_is_equal_same_class(const void* _r1, const void* _r2, void* data const ccv_comp_t* r2 = (const ccv_comp_t*)_r2; int distance = (int)(r1->rect.width * 0.25 + 0.5); - return r2->id == r1->id && + return r2->classification.id == r1->classification.id && r2->rect.x <= r1->rect.x + distance && r2->rect.x >= r1->rect.x - distance && r2->rect.y <= r1->rect.y + distance && @@ -1260,9 +1261,9 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca { ccv_comp_t comp; comp.rect = ccv_rect((int)((x * 4 + dx[q] * 2) * scale_x + 0.5), (int)((y * 4 + dy[q] * 2) * scale_y + 0.5), (int)(cascade->size.width * scale_x + 0.5), (int)(cascade->size.height * scale_y + 0.5)); - comp.id = t; comp.neighbors = 1; - comp.confidence = sum; + comp.classification.id = t; + comp.classification.confidence = sum; ccv_array_push(seq, &comp); } u8[0] += 4; @@ -1301,7 +1302,7 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca int idx = *(int*)ccv_array_get(idx_seq, i); if (comps[idx].neighbors == 0) - comps[idx].confidence = r1.confidence; + comps[idx].classification.confidence = r1.classification.confidence; ++comps[idx].neighbors; @@ -1309,8 +1310,8 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca comps[idx].rect.y += r1.rect.y; comps[idx].rect.width += r1.rect.width; comps[idx].rect.height += r1.rect.height; - comps[idx].id = r1.id; - comps[idx].confidence = ccv_max(comps[idx].confidence, r1.confidence); + comps[idx].classification.id = r1.classification.id; + comps[idx].classification.confidence = ccv_max(comps[idx].classification.confidence, r1.classification.confidence); } // calculate average bounding box @@ -1325,8 +1326,8 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca comp.rect.width = (comps[i].rect.width * 2 + n) / (2 * n); comp.rect.height = (comps[i].rect.height * 2 + n) / (2 * n); comp.neighbors = comps[i].neighbors; - comp.id = comps[i].id; - comp.confidence = comps[i].confidence; + 
comp.classification.id = comps[i].classification.id; + comp.classification.confidence = comps[i].classification.confidence; ccv_array_push(seq2, &comp); } } @@ -1343,7 +1344,7 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca int distance = (int)(r2.rect.width * 0.25 + 0.5); if(i != j && - r1.id == r2.id && + r1.classification.id == r2.classification.id && r1.rect.x >= r2.rect.x - distance && r1.rect.y >= r2.rect.y - distance && r1.rect.x + r1.rect.width <= r2.rect.x + r2.rect.width + distance && @@ -1383,12 +1384,12 @@ ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_ca ccv_comp_t r1 = *(ccv_comp_t*)ccv_array_get(result_seq, i); int idx = *(int*)ccv_array_get(idx_seq, i); - if (comps[idx].neighbors == 0 || comps[idx].confidence < r1.confidence) + if (comps[idx].neighbors == 0 || comps[idx].classification.confidence < r1.classification.confidence) { - comps[idx].confidence = r1.confidence; + comps[idx].classification.confidence = r1.classification.confidence; comps[idx].neighbors = 1; comps[idx].rect = r1.rect; - comps[idx].id = r1.id; + comps[idx].classification.id = r1.classification.id; } } @@ -1428,6 +1429,7 @@ ccv_bbf_classifier_cascade_t* ccv_bbf_read_classifier_cascade(const char* direct return 0; ccv_bbf_classifier_cascade_t* cascade = (ccv_bbf_classifier_cascade_t*)ccmalloc(sizeof(ccv_bbf_classifier_cascade_t)); s = fscanf(r, "%d %d %d", &cascade->count, &cascade->size.width, &cascade->size.height); + assert(s > 0); cascade->stage_classifier = (ccv_bbf_stage_classifier_t*)ccmalloc(cascade->count * sizeof(ccv_bbf_stage_classifier_t)); for (i = 0; i < cascade->count; i++) { diff --git a/lib/ccv_convnet.c b/lib/ccv_convnet.c index c32e5978e..42c50c36f 100644 --- a/lib/ccv_convnet.c +++ b/lib/ccv_convnet.c @@ -1,50 +1,28 @@ #include "ccv.h" #include "ccv_internal.h" +#if defined(HAVE_SSE2) +#include +#elif defined(HAVE_NEON) +#include +#endif #ifdef HAVE_GSL #include #include #endif +#ifdef USE_DISPATCH +#include +#endif #ifdef HAVE_CUDA #include "cuda/cwc.h" #endif #include "3rdparty/sqlite3/sqlite3.h" - -inline static void _ccv_convnet_layer_deduce_output_format(ccv_convnet_layer_t* layer, int* rows, int* cols) -{ - assert(rows != 0 && cols != 0); - switch(layer->type) - { - case CCV_CONVNET_CONVOLUTIONAL: - assert(layer->net.convolutional.rows % 2); // as of now, don't support even number of kernel size - assert(layer->net.convolutional.cols % 2); - assert((layer->input.matrix.rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows) % layer->net.convolutional.strides == 0); - assert((layer->input.matrix.cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols) % layer->net.convolutional.strides == 0); - *rows = (layer->input.matrix.rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; - *cols = (layer->input.matrix.cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; - break; - case CCV_CONVNET_FULL_CONNECT: - *rows = layer->net.full_connect.count; - *cols = 1; - break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - *rows = layer->input.matrix.rows; - *cols = layer->input.matrix.cols; - break; - case CCV_CONVNET_MAX_POOL: - case CCV_CONVNET_AVERAGE_POOL: - assert((layer->input.matrix.rows + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); - 
assert((layer->input.matrix.cols + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); - *rows = (layer->input.matrix.rows + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; - *cols = (layer->input.matrix.cols + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; - break; - } -} +#include "inl/ccv_convnet_inl.h" #ifndef CASE_TESTS -ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_convnet_layer_param_t params[], int count) +ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count) { - ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2 + sizeof(ccv_dense_matrix_t*) * (count - 1)); + ccv_convnet_t* convnet = (ccv_convnet_t*)ccmalloc(sizeof(ccv_convnet_t) + sizeof(ccv_convnet_layer_t) * count + sizeof(ccv_dense_matrix_t*) * count * 2); convnet->use_cwc_accel = use_cwc_accel; #ifdef HAVE_GSL gsl_rng_env_setup(); @@ -57,17 +35,13 @@ ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_convnet_layer_param_t para memset(convnet->acts, 0, sizeof(ccv_dense_matrix_t*) * count); convnet->denoms = (ccv_dense_matrix_t**)(convnet->acts + count); memset(convnet->denoms, 0, sizeof(ccv_dense_matrix_t*) * count); - if (count > 1) - { - convnet->dors = (ccv_dense_matrix_t**)(convnet->acts + count * 2); - memset(convnet->dors, 0, sizeof(ccv_dense_matrix_t*) * (count - 1)); - } else { - convnet->dors = 0; - } convnet->count = count; + convnet->input = input; convnet->rows = params[0].input.matrix.rows; convnet->cols = params[0].input.matrix.cols; convnet->channels = params[0].input.matrix.channels; + convnet->mean_activity = ccv_dense_matrix_new(convnet->input.height, convnet->input.width, convnet->channels | CCV_32F, 0, 0); + ccv_zero(convnet->mean_activity); ccv_convnet_layer_t* layers = convnet->layers; int i, j; for (i = 0; i < count; i++) @@ -75,10 +49,15 @@ ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_convnet_layer_param_t para layers[i].type = params[i].type; layers[i].input = params[i].input; layers[i].net = params[i].output; + layers[i].reserved = 0; switch (params[i].type) { case CCV_CONVNET_CONVOLUTIONAL: - layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels * params[i].output.convolutional.count; + assert(params[i].input.matrix.channels % params[i].input.matrix.partition == 0); + assert(params[i].output.convolutional.count % params[i].output.convolutional.partition == 0); + assert(params[i].output.convolutional.partition % params[i].input.matrix.partition == 0); + assert(params[i].output.convolutional.partition >= params[i].input.matrix.partition); + layers[i].wnum = params[i].output.convolutional.rows * params[i].output.convolutional.cols * params[i].output.convolutional.channels / params[i].input.matrix.partition * params[i].output.convolutional.count; layers[i].w = (float*)ccmalloc(sizeof(float) * (layers[i].wnum + params[i].output.convolutional.count)); layers[i].bias = layers[i].w + layers[i].wnum; #ifdef HAVE_GSL @@ -120,13 +99,21 @@ ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_convnet_layer_param_t para int ccv_convnet_verify(ccv_convnet_t* convnet, int output) { - int i, out_rows, out_cols; + int i, out_rows, out_cols, out_partition; + if (convnet->count < 1) + return -1; + // the 
last layer has to be full connect + if (convnet->layers[convnet->count - 1].type != CCV_CONVNET_FULL_CONNECT) + return -1; + // you cannot enable relu on the last layer + if (convnet->layers[convnet->count - 1].net.full_connect.relu) + return -1; for (i = 0; i < convnet->count; i++) { ccv_convnet_layer_t* layer = convnet->layers + i; if (i > 0 && (out_rows != layer->input.matrix.rows || out_cols != layer->input.matrix.cols)) return -1; - _ccv_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); } if (out_rows * out_cols != output) return -1; @@ -135,79 +122,272 @@ int ccv_convnet_verify(ccv_convnet_t* convnet, int output) #endif -static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* d, ccv_dense_matrix_t** b) +#if defined(HAVE_SSE2) || defined(HAVE_NEON) + +static void _ccv_convnet_layer_simd_alloc_reserved(ccv_convnet_layer_t* layer) { - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + if (layer->reserved) + return; + int partition = layer->input.matrix.partition; int ch = layer->net.convolutional.channels; int count = layer->net.convolutional.count; - int strides = layer->net.convolutional.strides; - int border = layer->net.convolutional.border; int kernel_rows = layer->net.convolutional.rows; int kernel_cols = layer->net.convolutional.cols; - int type = CCV_32F | count; - assert(CCV_GET_CHANNEL(a->type) == ch); - assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); - ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); - int i, j, x, y, k; -#define for_block(act_block_setup, act_block_begin, act_block_end) \ - for (k = 0; k < count; k++) \ - { \ - float* ap = a->data.f32; \ - float* bp = db->data.f32 + k; \ - float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch; \ - float bias = layer->bias[k]; \ - act_block_setup; \ + int ch_per_partition = ch / partition; + int count_per_4 = count / 4; + float* simd_w = (float*)ccmalloc(sizeof(float) * layer->wnum); + int i, j, k, c; + for (k = 0; k < count_per_4; k++) + for (i = 0; i < kernel_rows * kernel_cols; i++) + for (j = 0; j < ch_per_partition; j++) + for (c = 0; c < 4; c++) + simd_w[(k * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j) * 4 + c] = layer->w[(k * 4 + c) * kernel_rows * kernel_cols * ch_per_partition + i * ch_per_partition + j]; + layer->reserved = simd_w; +} + +#endif + +#define SIMD(x) ((float*)((x)->reserved)) + +#if defined(HAVE_SSE2) +static inline void _ccv_convnet_convolutional_forward_propagate_sse2(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) +{ + assert(SIMD(layer)); +#define main_for(block) \ + parallel_for(k, (count >> 2)) { \ + int i, j, x, y, c; \ + int p = k * 4 / count_per_partition; \ + float* ap = a->data.f32 + p * ch_per_partition; \ + float* bp = db->data.f32 + k * 4; \ + float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \ + float bias[4] __attribute__ ((__aligned__(16))); \ + memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \ + /* 4 accumulators */ \ + __m128 z4 = _mm_setzero_ps(); \ for (i = 0; i < db->rows; i++) \ { \ int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ 
int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \ - comy *= ch * kernel_cols; \ + comy *= ch_per_partition * kernel_cols; \ for (j = 0; j < db->cols; j++) \ { \ - act_block_begin; \ - float v = bias; \ - int comx = (ccv_max(j * strides - border, 0) - (j * strides - border)) * ch; \ - int maxx = kernel_cols * ch - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)) * ch; \ - float* w = layer_w + comx + comy; \ + __m128 v40 = _mm_load_ps(bias); \ + __m128 v41 = _mm_setzero_ps(); \ + __m128 v42 = _mm_setzero_ps(); \ + __m128 v43 = _mm_setzero_ps(); \ + int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \ + int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \ + float* w = layer_w + (comx * ch_per_partition + comy) * 4; \ float* apz = ap + ccv_max(j * strides - border, 0) * ch; \ /* when we have border, we simply do zero padding */ \ for (y = 0; y < maxy; y++) \ { \ + /* special casing for these cases to speed up SIMD computation */ \ for (x = 0; x < maxx; x++) \ - v += w[x] * apz[x]; \ - w += kernel_cols * ch; \ + { \ + c = 0; \ + for (; c < ch_per_partition - 3; c += 4) \ + { \ + __m128 apz4 = _mm_loadu_ps(apz + x * ch + c); \ + __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ + __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ + __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \ + __m128 w43 = _mm_loadu_ps(w + (x * ch_per_partition + c + 3) * 4); \ + __m128 apz40 = _mm_shuffle_ps(apz4, apz4, 0x00); \ + __m128 apz41 = _mm_shuffle_ps(apz4, apz4, 0x55); \ + __m128 apz42 = _mm_shuffle_ps(apz4, apz4, 0xAA); \ + __m128 apz43 = _mm_shuffle_ps(apz4, apz4, 0xFF); \ + v40 =_mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ + v41 =_mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ + v42 =_mm_add_ps(_mm_mul_ps(w42, apz42), v42); \ + v43 =_mm_add_ps(_mm_mul_ps(w43, apz43), v43); \ + } \ + block /* insert executions for tail partition */ \ + } \ + w += kernel_cols * ch_per_partition * 4; \ apz += a->cols * ch; \ } \ - bp[j * count] = ccv_max(0, v) /* ReLU */; \ - act_block_end; \ + __m128 v4 = _mm_max_ps(z4, _mm_add_ps(_mm_add_ps(v40, v41), _mm_add_ps(v42, v43))); \ + _mm_storeu_ps(bp + j * count, v4); /* ReLU */ \ } \ bp += db->cols * count; \ ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ } \ - } - if (d) + } parallel_endfor + if (ch_per_partition % 4 == 0) { -#define act_block_setup \ - int* dp = d->data.i32 + k; -#define act_block_begin \ - if (!*dp) \ - { -#define act_block_end \ - } else \ - bp[j * count] = 0; \ - dp += count; - for_block(act_block_setup, act_block_begin, act_block_end); -#undef act_block_setup -#undef act_block_begin -#undef act_block_end + main_for(); + } else if (ch_per_partition % 4 == 3) { // unroll the last for-loops +#define block \ + __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \ + __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \ + __m128 apz42 = _mm_load1_ps(apz + x * ch + c + 2); \ + __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ + __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ + __m128 w42 = _mm_loadu_ps(w + (x * ch_per_partition + c + 2) * 4); \ + v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ + v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); \ + v42 = _mm_add_ps(_mm_mul_ps(w42, apz42), v42); + main_for(block); +#undef block + } else if 
(ch_per_partition % 4 == 2) { // unroll the last for-loops +#define block \ + __m128 apz40 = _mm_load1_ps(apz + x * ch + c); \ + __m128 apz41 = _mm_load1_ps(apz + x * ch + c + 1); \ + __m128 w40 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ + __m128 w41 = _mm_loadu_ps(w + (x * ch_per_partition + c + 1) * 4); \ + v40 = _mm_add_ps(_mm_mul_ps(w40, apz40), v40); \ + v41 = _mm_add_ps(_mm_mul_ps(w41, apz41), v41); + main_for(block); +#undef block } else { - for_block(/* empty act block setup */, /* empty act block begin */, /* empty act block end */); +#define block \ + __m128 apz4 = _mm_load1_ps(apz + x * ch + c); \ + __m128 w4 = _mm_loadu_ps(w + (x * ch_per_partition + c) * 4); \ + v40 = _mm_add_ps(_mm_mul_ps(w4, apz4), v40); + main_for(block); +#undef block + } +#undef main_for +} +#elif defined(HAVE_NEON) +static inline void _ccv_convnet_convolutional_forward_propagate_neon(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) +{ + assert(SIMD(layer)); +#define main_for(block) \ + parallel_for(k, (count >> 2)) { \ + int i, j, x, y, c; \ + int p = k * 4 / count_per_partition; \ + float* ap = a->data.f32 + p * ch_per_partition; \ + float* bp = db->data.f32 + k * 4; \ + float* layer_w = SIMD(layer) + k * 4 * kernel_rows * kernel_cols * ch_per_partition; \ + float bias[4] __attribute__ ((__aligned__(16))); \ + memcpy(bias, layer->bias + k * 4, sizeof(float) * 4); \ + float32x4_t z4 = vmovq_n_f32(0); \ + for (i = 0; i < db->rows; i++) \ + { \ + int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ + int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); \ + comy *= ch_per_partition * kernel_cols; \ + for (j = 0; j < db->cols; j++) \ + { \ + float32x4_t v40 = vld1q_f32(bias); \ + float32x4_t v41 = vmovq_n_f32(0); \ + int comx = ccv_max(j * strides - border, 0) - (j * strides - border); \ + int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); \ + float* w = layer_w + (comx * ch_per_partition + comy) * 4; \ + float* apz = ap + ccv_max(j * strides - border, 0) * ch; \ + /* when we have border, we simply do zero padding */ \ + for (y = 0; y < maxy; y++) \ + { \ + for (x = 0; x < maxx; x++) \ + { \ + c = 0; \ + for (; c < ch_per_partition - 1; c += 2) \ + { \ + float32x2_t apz4 = vld1_f32(apz + x * ch + c); \ + float32x4_t apz40 = vdupq_lane_f32(apz4, 0); \ + float32x4_t apz41 = vdupq_lane_f32(apz4, 1); \ + float32x4_t w40 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \ + float32x4_t w41 = vld1q_f32(w + (x * ch_per_partition + c + 1) * 4); \ + v40 = vmlaq_f32(v40, w40, apz40); \ + v41 = vmlaq_f32(v41, w41, apz41); \ + } \ + block /* insert executions for tail partition */ \ + } \ + w += kernel_cols * ch_per_partition * 4; \ + apz += a->cols * ch; \ + } \ + float32x4_t v4 = vmaxq_f32(z4, vaddq_f32(v40, v41)); \ + vst1q_f32(bp + j * count, v4); /* ReLU */ \ + } \ + bp += db->cols * count; \ + ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ + } \ + } parallel_endfor + if (ch_per_partition % 2 == 0) + { + main_for(); + } else { // unroll the last for-loops +#define block \ + float32x4_t apz4 = vmovq_n_f32(apz[x * ch + c]); \ + float32x4_t w4 = vld1q_f32(w + (x * ch_per_partition + c) * 4); \ + v40 = vmlaq_f32(v40, w4, apz4); + 
main_for(block); +#undef block } -#undef for_block +#undef main_for +} +#else +static inline void _ccv_convnet_convolutional_forward_propagate_fallback(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* db, int rows, int cols, int ch, int count, int strides, int border, int kernel_rows, int kernel_cols, int ch_per_partition, int count_per_partition) +{ + parallel_for(k, count) { + int i, j, x, y, c; + int p = k / count_per_partition; + float* ap = a->data.f32 + p * ch_per_partition; + float* bp = db->data.f32 + k; + float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition; + float bias = layer->bias[k]; + for (i = 0; i < db->rows; i++) + { + int comy = ccv_max(i * strides - border, 0) - (i * strides - border); + int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(a->rows + border, i * strides + kernel_rows)); + comy *= ch_per_partition * kernel_cols; + for (j = 0; j < db->cols; j++) + { + float v = bias; + int comx = ccv_max(j * strides - border, 0) - (j * strides - border); + int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(a->cols + border, j * strides + kernel_cols)); + float* w = layer_w + comx * ch_per_partition + comy; + float* apz = ap + ccv_max(j * strides - border, 0) * ch; + // when we have border, we simply do zero padding + for (y = 0; y < maxy; y++) + { + for (x = 0; x < maxx; x++) + for (c = 0; c < ch_per_partition; c++) + v += w[x * ch_per_partition + c] * apz[x * ch + c]; + w += kernel_cols * ch_per_partition; + apz += a->cols * ch; + } + bp[j * count] = ccv_max(0, v); // ReLU + } + bp += db->cols * count; + ap += a->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); + } + } parallel_endfor } +#endif -static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* d, ccv_dense_matrix_t** b) +static void _ccv_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) +{ + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, a->rows, a->cols, &rows, &cols, &partition); + int ch = layer->net.convolutional.channels; + int count = layer->net.convolutional.count; + int strides = layer->net.convolutional.strides; + int border = layer->net.convolutional.border; + int kernel_rows = layer->net.convolutional.rows; + int kernel_cols = layer->net.convolutional.cols; + int type = CCV_32F | count; + assert(CCV_GET_CHANNEL(a->type) == ch); + assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); + int ch_per_partition = ch / partition; + int count_per_partition = count / partition; + assert(count_per_partition % 4 == 0); +#if defined(HAVE_SSE2) || defined(HAVE_NEON) + _ccv_convnet_layer_simd_alloc_reserved(layer); +#endif +#if defined(HAVE_SSE2) + _ccv_convnet_convolutional_forward_propagate_sse2(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); +#elif defined(HAVE_NEON) + _ccv_convnet_convolutional_forward_propagate_neon(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); +#else + _ccv_convnet_convolutional_forward_propagate_fallback(layer, a, db, rows, cols, ch, count, strides, border, kernel_rows, kernel_cols, ch_per_partition, count_per_partition); +#endif +} + +static void 
_ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) { assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, layer->net.full_connect.count, 1, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); @@ -220,38 +400,21 @@ static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* lay a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type); int i; float* bptr = db->data.f32; - if (d) - { - int j; - float* aptr = a->data.f32; - float* wptr = layer->w; - int* dptr = d->data.i32; - for (i = 0; i < db->rows; i++) - { - if (!dptr[i]) - { - float v = layer->bias[i]; - for (j = 0; j < a->rows; j++) - v += aptr[j] * wptr[j]; - wptr += a->rows; - bptr[i] = v; - } else - bptr[i] = 0; - } - } else { + for (i = 0; i < db->rows; i++) + bptr[i] = layer->bias[i]; + ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0); + ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed + if (layer->net.full_connect.relu) for (i = 0; i < db->rows; i++) - bptr[i] = layer->bias[i]; - ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0); - ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed - } + bptr[i] = ccv_max(0, bptr[i]); // relu a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch; a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type); } static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms) { - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, a->rows, a->cols, &rows, &cols, &partition); int size = layer->net.rnorm.size; float kappa = layer->net.rnorm.kappa; float alpha = layer->net.rnorm.alpha; @@ -261,34 +424,56 @@ static void _ccv_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, ccv int ch = CCV_GET_CHANNEL(a->type); int type = CCV_32F | ch; ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); - ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0); - int i, j, k, x; + int i, j, k, x, p; float* ap = a->data.f32; - float* dp = ddenoms->data.f32; float* bp = db->data.f32; - for (i = 0; i < db->rows; i++) + int ch_per_partition = ch / partition; + if (denoms) { - for (j = 0; j < db->cols; j++) - for (k = 0; k < ch; k++) - { - float v = ap[j * ch + k]; - float denom = 0; - for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch - 1); x++) - denom += ap[j * ch + x] * ap[j * ch + x]; - denom = kappa + alpha * denom; - dp[j * ch + k] = denom; - bp[j * ch + k] = v * powf(denom, -beta); - } - ap += a->cols * ch; - dp += ddenoms->cols * ch; - bp += db->cols * ch; + ccv_dense_matrix_t* ddenoms = *denoms = ccv_dense_matrix_renew(*denoms, rows, cols, type, type, 0); + float* dp = ddenoms->data.f32; + for (i = 0; i < db->rows; i++) + { + for (j = 0; j < db->cols; j++) + for (p = 0; p < partition; p++) + for (k = 0; k < ch_per_partition; k++) + { + float v = ap[j * ch + p * ch_per_partition + k]; + float denom = 0; + for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++) + denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x]; + denom = kappa + alpha * denom; + dp[j * ch + p 
* ch_per_partition + k] = denom; + bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta); + } + ap += a->cols * ch; + dp += ddenoms->cols * ch; + bp += db->cols * ch; + } + } else { + for (i = 0; i < db->rows; i++) + { + for (j = 0; j < db->cols; j++) + for (p = 0; p < partition; p++) + for (k = 0; k < ch_per_partition; k++) + { + float v = ap[j * ch + p * ch_per_partition + k]; + float denom = 0; + for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++) + denom += ap[j * ch + p * ch_per_partition + x] * ap[j * ch + p * ch_per_partition + x]; + denom = kappa + alpha * denom; + bp[j * ch + p * ch_per_partition + k] = v * powf(denom, -beta); + } + ap += a->cols * ch; + bp += db->cols * ch; + } } } static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) { - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, a->rows, a->cols, &rows, &cols, &partition); int size = layer->net.pool.size; int strides = layer->net.pool.strides; int border = layer->net.pool.border; @@ -326,8 +511,8 @@ static void _ccv_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) { - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, a->rows, a->cols, &rows, &cols, &partition); int size = layer->net.pool.size; int strides = layer->net.pool.strides; int border = layer->net.pool.border; @@ -360,6 +545,80 @@ static void _ccv_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* lay } } +static void _ccv_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, ccv_dense_matrix_t** denoms) +{ + switch(layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + _ccv_convnet_convolutional_forward_propagate(layer, a, b); + break; + case CCV_CONVNET_FULL_CONNECT: + _ccv_convnet_full_connect_forward_propagate(layer, a, b); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + _ccv_convnet_rnorm_forward_propagate(layer, a, b, denoms); + break; + case CCV_CONVNET_MAX_POOL: + _ccv_convnet_max_pool_forward_propagate(layer, a, b); + break; + case CCV_CONVNET_AVERAGE_POOL: + _ccv_convnet_average_pool_forward_propagate(layer, a, b); + break; + } +} + +static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) +{ + assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, layer->net.full_connect.count, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); + // reshape a for gemm + int i, j; + float* bptr = db->data.f32; + for (i = 0; i < db->rows; i++) + { + for (j = 0; j < db->cols; j++) + bptr[j] = layer->bias[j]; + bptr += db->cols; + } + ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0); + ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed + bptr = db->data.f32; + if (layer->net.full_connect.relu) + for (i = 0; i < db->rows; i++) + { + for (j = 0; j < db->cols; j++) + bptr[j] = ccv_max(0, bptr[j]); // relu + bptr += db->cols; + } +} + +static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type) +{ 
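/* Editor's note: an illustrative reading of the helper below, not part of the patch itself.
 * Each of the (1 + !!symmetric) * 5 rows of `a` holds the raw scores of one crop, and for
 * every row the numerically stable softmax
 *
 *   softmax(a)_j = expf(a_j - max_k a_k) / sum_l expf(a_l - max_k a_k)
 *
 * is accumulated into the single output row of `db`. Subtracting the row maximum before
 * expf() guards against float overflow and cancels between numerator and denominator, so
 * the value is unchanged; the caller later divides by the crop count, turning the
 * accumulated sum into an average over all crops. */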
+ assert(CCV_GET_CHANNEL(a->type) == CCV_C1); + assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, 1, a->cols, CCV_32F | CCV_C1, CCV_32F | CCV_C1, 0); + ccv_zero(db); + int i, j; + float* aptr = a->data.f32; + float* bptr = db->data.f32; + float* cptr = (float*)ccmalloc(sizeof(float) * a->cols); + for (i = 0; i < a->rows; i++) + { + double max = aptr[0]; + for (j = 1; j < a->cols; j++) + if (aptr[j] > max) + max = aptr[j]; + double tt = 0; + for (j = 0; j < a->cols; j++) + tt += (cptr[j] = expf(aptr[j] - max)); + tt = 1.0 / tt; + for (j = 0; j < a->cols; j++) + bptr[j] += cptr[j] * tt; + aptr += a->cols; + } + ccfree(cptr); +} + #ifndef CASE_TESTS void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch) @@ -377,74 +636,174 @@ void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dens // save the last layer of neuron cache in case that we encode to a different matrix ccv_dense_matrix_t* out_neuron = convnet->acts[convnet->count - 1]; convnet->acts[convnet->count - 1] = *b; - switch(convnet->layers->type) + _ccv_convnet_layer_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms); + for (i = 1; i < convnet->count; i++) + _ccv_convnet_layer_forward_propagate(convnet->layers + i, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i); + if (convnet->acts + convnet->count - 1 != b) { - case CCV_CONVNET_CONVOLUTIONAL: - _ccv_convnet_convolutional_forward_propagate(convnet->layers, *a, convnet->count > 1 ? convnet->dors[0] : 0, convnet->acts); - break; - case CCV_CONVNET_FULL_CONNECT: - _ccv_convnet_full_connect_forward_propagate(convnet->layers, *a, convnet->count > 1 ? convnet->dors[0] : 0, convnet->acts); - break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - _ccv_convnet_rnorm_forward_propagate(convnet->layers, *a, convnet->acts, convnet->denoms); - break; - case CCV_CONVNET_MAX_POOL: - _ccv_convnet_max_pool_forward_propagate(convnet->layers, *a, convnet->acts); - break; - case CCV_CONVNET_AVERAGE_POOL: - _ccv_convnet_average_pool_forward_propagate(convnet->layers, *a, convnet->acts); - break; + *b = convnet->acts[convnet->count - 1]; + // restore the last layer of neuron cache + convnet->acts[convnet->count - 1] = out_neuron; } - for (i = 1; i < convnet->count; i++) +#ifdef HAVE_CUDA + } +#endif +} + +// find the layer for scanning (it is the last convolutional layer) +static int _ccv_convnet_find_scan(ccv_convnet_t* convnet) +{ + int i; + ccv_convnet_layer_t* layers = convnet->layers; + for (i = convnet->count - 1; i >= 0; i--) + if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) + return i; + return -1; +} + +static int _ccv_convnet_derive_scale(ccv_convnet_t* convnet, int scan) +{ + int i, scale = 1; + for (i = scan; i >= 0; i--) { ccv_convnet_layer_t* layer = convnet->layers + i; - ccv_dense_matrix_t* d = i < convnet->count - 1 ? 
convnet->dors[i] : 0; - switch + switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - _ccv_convnet_convolutional_forward_propagate(layer, convnet->acts[i - 1], d, convnet->acts + i); - break; - case CCV_CONVNET_FULL_CONNECT: - _ccv_convnet_full_connect_forward_propagate(layer, convnet->acts[i - 1], d, convnet->acts + i); - break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - _ccv_convnet_rnorm_forward_propagate(layer, convnet->acts[i - 1], convnet->acts + i, convnet->denoms + i); + scale *= layer->net.convolutional.strides; break; case CCV_CONVNET_MAX_POOL: - _ccv_convnet_max_pool_forward_propagate(layer, convnet->acts[i - 1], convnet->acts + i); - break; case CCV_CONVNET_AVERAGE_POOL: - _ccv_convnet_average_pool_forward_propagate(layer, convnet->acts[i - 1], convnet->acts + i); + scale *= layer->net.pool.strides; break; } } - if (convnet->acts + convnet->count - 1 != b) - { - *b = convnet->acts[convnet->count - 1]; - // restore the last layer of neuron cache - convnet->acts[convnet->count - 1] = out_neuron; - } -#ifdef HAVE_CUDA - } -#endif + return scale; } -void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch) +static int _ccv_convnet_find_full_connect(ccv_convnet_t* convnet) +{ + int i; + for (i = 0; i < convnet->count; i++) + if (convnet->layers[i].type == CCV_CONVNET_FULL_CONNECT) + return i; + return -1; +} + +void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch) { #ifdef HAVE_CUDA if (convnet->use_cwc_accel) - cwc_convnet_classify(convnet, a, labels, batch); + cwc_convnet_classify(convnet, a, symmetric, ranks, tops, batch); else { #endif - assert(batch == 1); - ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1); - int i, c = 0; - ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1]; - int maxc = b->data.f32[0]; - for (i = 1; i < b->rows; i++) - if (b->data.f32[i] > maxc) - maxc = b->data.f32[i], c = i; - labels[0] = c; + int i, j, k, t; + ccv_dense_matrix_t** b = (ccv_dense_matrix_t**)alloca(sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); + int scan = _ccv_convnet_find_scan(convnet); + int scale = _ccv_convnet_derive_scale(convnet, scan); + int full_connect = _ccv_convnet_find_full_connect(convnet); + assert(scan >= 0 && scan < convnet->count); + assert(full_connect >= 0 && full_connect < convnet->count); + memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); + for (i = 0; i < batch; i++) + { + assert(CCV_GET_CHANNEL(a[i]->type) == convnet->channels); + assert(a[i]->rows == convnet->input.height || a[i]->cols == convnet->input.width); + assert(a[i]->rows >= convnet->input.height && a[i]->cols >= convnet->input.width); + // find optimal rows and cols to slice to + int rows = convnet->rows + ((a[i]->rows - convnet->rows) / scale) * scale; + int cols = convnet->cols + ((a[i]->cols - convnet->cols) / scale) * scale; + assert(rows == convnet->input.height || cols == convnet->input.width); + assert(rows <= a[i]->rows && cols <= a[i]->cols); + ccv_dense_matrix_t* slice = 0; + ccv_slice(a[i], (ccv_matrix_t**)&slice, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols); + ccv_dense_matrix_t* mean_activity = 0; + // scale mean activity up to be subtractable (from this point on, the CPU implementation is an approximation of the GPU implementation) + ccv_resample(convnet->mean_activity, &mean_activity, 0, rows, cols, CCV_INTER_CUBIC); + ccv_subtract(slice, mean_activity, (ccv_matrix_t**)b, CCV_32F); + 
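/* Editor's note: a worked example for the slice size above, with made-up numbers that are
 * not in the source. rows snaps the input down to the nearest size the scan layer consumes
 * without fractional strides: rows = net_rows + ((in_rows - net_rows) / scale) * scale,
 * where scale is the product of all strides up to the scan layer (_ccv_convnet_derive_scale).
 * Assuming net_rows = 225, in_rows = 251 and scale = 16: (251 - 225) / 16 = 1, so
 * rows = 225 + 16 = 241, and (251 - 241) / 2 = 5 rows are cropped from each border
 * before the mean activity is subtracted. */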
ccv_matrix_free(mean_activity); + ccv_matrix_free(slice); + // doing the first few layers until the first scan layer + int out_rows, out_cols, out_partition; + ccv_dense_matrix_t* c = ccv_dense_matrix_new(5 * (!!symmetric + 1), convnet->layers[full_connect].input.node.count, CCV_32F | CCV_C1, 0, 0); + for (t = 0; t <= !!symmetric; t++) + { + rows = b[0]->rows, cols = b[0]->cols; + for (j = 0; j < scan + 1; j++) + { + ccv_convnet_layer_t* layer = convnet->layers + j; + _ccv_convnet_layer_derive_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); + _ccv_convnet_layer_forward_propagate(layer, b[j], b + j + 1, 0); + assert(b[j + 1]->rows == out_rows && b[j + 1]->cols == out_cols); + if (j > 0) + ccv_matrix_free(b[j]); + rows = out_rows, cols = out_cols; + } + int offsets[5][2] = { + {0, 0}, + {cols - convnet->layers[scan + 1].input.matrix.cols, 0}, + {(cols - convnet->layers[scan + 1].input.matrix.cols) / 2, (rows - convnet->layers[scan + 1].input.matrix.rows) / 2}, + {0, rows - convnet->layers[scan + 1].input.matrix.rows}, + {cols - convnet->layers[scan + 1].input.matrix.cols, rows - convnet->layers[scan + 1].input.matrix.rows}, + }; + for (k = 0; k < 5; k++) + { + ccv_dense_matrix_t* input = 0; + ccv_convnet_layer_t* layer = convnet->layers + scan + 1; + ccv_slice(b[scan + 1], (ccv_matrix_t**)&input, CCV_32F, offsets[k][1], offsets[k][0], layer->input.matrix.rows, layer->input.matrix.cols); + // copy the last layer for full connect compute + b[full_connect] = ccv_dense_matrix_new(convnet->layers[full_connect].input.matrix.rows, convnet->layers[full_connect].input.matrix.cols, CCV_NO_DATA_ALLOC | CCV_32F | convnet->layers[full_connect].input.matrix.channels, c->data.f32 + (t * 5 + k) * convnet->layers[full_connect].input.node.count, 0); + for (j = scan + 1; j < full_connect; j++) + { + layer = convnet->layers + j; + _ccv_convnet_layer_forward_propagate(layer, j > scan + 1 ? 
b[j] : input, b + j + 1, 0); + if (j > scan + 1) + ccv_matrix_free(b[j]); + else + ccv_matrix_free(input); + } + ccv_matrix_free(b[full_connect]); + // set it to 0 + memset(b + scan + 2, 0, sizeof(ccv_dense_matrix_t*) * (full_connect - scan - 1)); + } + ccv_matrix_free(b[scan + 1]); + memset(b + 1, 0, sizeof(ccv_dense_matrix_t*) * (scan + 1)); + ccv_flip(b[0], &b[0], 0, CCV_FLIP_X); + } + ccv_matrix_free(b[0]); + // now have everything in c, do the last full connect propagate + b[full_connect] = c; + for (j = full_connect; j < convnet->count; j++) + { + ccv_convnet_layer_t* layer = convnet->layers + j; + assert(layer->type == CCV_CONVNET_FULL_CONNECT); + _ccv_convnet_full_connect_forward_propagate_parallel(layer, b[j], b + j + 1); + ccv_matrix_free(b[j]); + } + ccv_dense_matrix_t* softmax = 0; + _ccv_convnet_compute_softmax_parallel(b[convnet->count], &softmax, 0); + ccv_matrix_free(b[convnet->count]); + ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0); + float* r = softmax->data.f32; + assert(tops <= softmax->cols); + for (j = 0; j < tops; j++) + { + float max_val = -1; + int max_idx = -1; + for (k = 0; k < softmax->cols; k++) + if (r[k] >= 0 && r[k] > max_val) + max_val = r[k], max_idx = k; + assert(max_idx >= 0); + r[max_idx] = -1; + ccv_classification_t classification = { + .id = max_idx, + .confidence = max_val / ((!!symmetric + 1) * 5), + }; + ccv_array_push(ranks[i], &classification); + } + ccv_matrix_free(softmax); + memset(b, 0, sizeof(ccv_dense_matrix_t*) * (convnet->count + 1)); + } #ifdef HAVE_CUDA } #endif @@ -454,34 +813,14 @@ void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* l #ifdef HAVE_GSL -static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type) -{ - int ch = CCV_GET_CHANNEL(a->type); - assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); - ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0); - int i; - float* aptr = a->data.f32; - float* bptr = db->data.f32; - double max = aptr[0]; - for (i = 1; i < a->rows * a->cols * ch; i++) - if (aptr[i] > max) - max = aptr[i]; - double tt = 0; - for (i = 0; i < a->rows * a->cols * ch; i++) - tt += (bptr[i] = expf(aptr[i] - max)); - tt = 1.0 / tt; - for (i = 0; i < a->rows * a->cols * ch; i++) - bptr[i] *= tt; -} - // compute back propagated gradient & weight update delta -static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* d, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) +static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) { - // a is the input gradient (for back prop), d is the dropout, + // a is the input gradient (for back prop). 
// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error) // note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition); int ch = layer->net.convolutional.channels; int count = layer->net.convolutional.count; int strides = layer->net.convolutional.strides; @@ -495,190 +834,129 @@ static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* l a->rows = rows, a->cols = cols, a->type = (a->type - a_ch) | count; assert(CCV_GET_CHANNEL(m->type) == ch); assert(CCV_GET_DATA_TYPE(m->type) == CCV_32F); - int i, j, x, y, k; + int count_per_partition = count / partition; + int ch_per_partition = ch / partition; // update weight gradient -#define for_block_w(act_block_setup, act_block_begin, act_block_end) \ - for (k = 0; k < count; k++) \ - { \ - float* mp = m->data.f32; \ - float* ap = a->data.f32 + k; \ - float* np = n->data.f32 + k; \ - float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch; \ - float bias = 0; \ - act_block_setup; \ - for (i = 0; i < rows; i++) \ - { \ - int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ - int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows)); \ - comy *= ch * kernel_cols; \ - for (j = 0; j < cols; j++) \ - { \ - act_block_begin; \ - if (np[j * count] > 0) \ - { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ \ - float v = ap[j * count]; \ - bias += v; \ - int comx = (ccv_max(j * strides - border, 0) - (j * strides - border)) * ch; \ - int maxx = kernel_cols * ch - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols)) * ch; \ - float* w = update_w + comx + comy; \ - float* mpz = mp + ccv_max(j * strides - border, 0) * ch; \ - /* when we have border, we simply do zero padding */ \ - for (y = 0; y < maxy; y++) \ - { \ - for (x = 0; x < maxx; x++) \ - w[x] += v * mpz[x]; \ - w += kernel_cols * ch; \ - mpz += m->cols * ch; \ - } \ - } \ - act_block_end; \ - } \ - ap += a->cols * count; \ - np += n->cols * count; \ - mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ - } \ - update_params->bias[k] = bias; \ - } - ccv_dense_matrix_t* db = 0; + parallel_for(k, count) { + int i, j, x, y, c; + int p = k / count_per_partition; + float* mp = m->data.f32 + p * ch_per_partition; + float* ap = a->data.f32 + k; + float* np = n->data.f32 + k; + float* update_w = update_params->w + k * kernel_rows * kernel_cols * ch_per_partition; + float bias = 0; + for (i = 0; i < rows; i++) + { + int comy = ccv_max(i * strides - border, 0) - (i * strides - border); + int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(m->rows + border, i * strides + kernel_rows)); + comy *= ch_per_partition * kernel_cols; + for (j = 0; j < cols; j++) + { + if (np[j * count] > 0) + { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ + float v = ap[j * count]; + bias += v; + int comx = ccv_max(j * strides - border, 0) - (j * strides - border); + int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(m->cols + border, j * strides + kernel_cols)); + float* w = update_w + 
comx * ch_per_partition + comy; + float* mpz = mp + ccv_max(j * strides - border, 0) * ch; + /* when we have border, we simply do zero padding */ + for (y = 0; y < maxy; y++) + { + for (x = 0; x < maxx; x++) + for (c = 0; c < ch_per_partition; c++) + w[x * ch_per_partition + c] += v * mpz[x * ch + c]; + w += kernel_cols * ch_per_partition; + mpz += m->cols * ch; + } + } + } + ap += a->cols * count; + np += n->cols * count; + mp += m->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); + } + update_params->bias[k] += bias; + } parallel_endfor if (b) { - db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, m->rows, m->cols, CCV_32F | CCV_GET_CHANNEL(m->type), CCV_32F | CCV_GET_CHANNEL(m->type), 0); // clear it up before propagate result ccv_zero(db); - } -#define for_block_b(act_block_setup, act_block_begin, act_block_end) \ - for (k = 0; k < count; k++) \ - { \ - float* bp = db->data.f32; \ - float* ap = a->data.f32 + k; \ - float* np = n->data.f32 + k; \ - float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch; \ - act_block_setup; \ - for (i = 0; i < rows; i++) \ - { \ - int comy = ccv_max(i * strides - border, 0) - (i * strides - border); \ - int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows)); \ - comy *= ch * kernel_cols; \ - for (j = 0; j < cols; j++) \ - { \ - act_block_begin; \ - if (np[j * count] > 0) \ - { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ \ - float v = ap[j * count]; \ - int comx = (ccv_max(j * strides - border, 0) - (j * strides - border)) * ch; \ - int maxx = kernel_cols * ch - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols)) * ch; \ - float* w = layer_w + comx + comy; \ - float* bpz = bp + ccv_max(j * strides - border, 0) * ch; \ - /* when we have border, we simply do zero padding */ \ - for (y = 0; y < maxy; y++) \ - { \ - for (x = 0; x < maxx; x++) \ - bpz[x] += v * w[x]; \ - w += kernel_cols * ch; \ - bpz += db->cols * ch; \ - } \ - } \ - act_block_end; \ - } \ - ap += a->cols * count; \ - np += n->cols * count; \ - bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); \ - } \ - } - if (d) - { -#define act_block_setup \ - int* dp = d->data.i32 + k; -#define act_block_begin \ - if (!*dp) \ + int k; + for (k = 0; k < count; k++) { -#define act_block_end \ - } \ - dp += count; - for_block_w(act_block_setup, act_block_begin, act_block_end); - if (db) - for_block_b(act_block_setup, act_block_begin, act_block_end); -#undef act_block_setup -#undef act_block_begin -#undef act_block_end - } else { - for_block_w(/* empty act block setup */, /* empty act block begin */, /* empty act block end */); - if (db) - for_block_b(/* empty act block setup */, /* empty act block begin */, /* empty act block end */); + int i, j, x, y, c; + int p = k / count_per_partition; + float* bp = db->data.f32 + p * ch_per_partition; + float* ap = a->data.f32 + k; + float* np = n->data.f32 + k; + float* layer_w = layer->w + k * kernel_rows * kernel_cols * ch_per_partition; + for (i = 0; i < rows; i++) + { + int comy = ccv_max(i * strides - border, 0) - (i * strides - border); + int maxy = kernel_rows - comy - (i * strides + kernel_rows - ccv_min(db->rows + border, i * strides + kernel_rows)); + comy *= ch_per_partition * 
kernel_cols; + for (j = 0; j < cols; j++) + { + if (np[j * count] > 0) + { /* when np is bigger than 0, relu continues to update the weight, otherwise it stops */ + float v = ap[j * count]; + int comx = ccv_max(j * strides - border, 0) - (j * strides - border); + int maxx = kernel_cols - comx - (j * strides + kernel_cols - ccv_min(db->cols + border, j * strides + kernel_cols)); + float* w = layer_w + comx * ch_per_partition + comy; + float* bpz = bp + ccv_max(j * strides - border, 0) * ch; + /* when we have border, we simply do zero padding */ + for (y = 0; y < maxy; y++) + { + for (x = 0; x < maxx; x++) + for (c = 0; c < ch_per_partition; c++) + bpz[x * ch + c] += v * w[x * ch_per_partition + c]; + w += kernel_cols * ch_per_partition; + bpz += db->cols * ch; + } + } + } + ap += a->cols * count; + np += n->cols * count; + bp += db->cols * ch * (ccv_max((i + 1) * strides - border, 0) - ccv_max(i * strides - border, 0)); + } + } } -#undef for_block_w -#undef for_block_b a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch; } -static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* d, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) +static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params) { - // a is the input gradient (for back prop), d is the dropout, + // a is the input gradient (for back prop), y is the output (for forward prop) // x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error) - // note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it ccv_dense_matrix_t* db = 0; if (b) db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0); int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type); x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1; x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type); + int i; + if (layer->net.full_connect.relu) + for (i = 0; i < y->rows; i++) + if (y->data.f32[i] <= 0) + a->data.f32[i] = 0; ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0); ccv_dense_matrix_t* dw = &w; - if (d) + // compute bias gradient + ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0); + ccv_dense_matrix_t* dbias = &bias; + ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0); + // compute weight gradient + ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0); + w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0); + // propagate error + if (db) { - int* dptr = d->data.i32; - float* aptr = a->data.f32; - float* bptr = update_params->bias; - int i, j; - // bias gradient - for (i = 0; i < a->rows; i++) - if (dptr[i]) - bptr[i] += aptr[i]; - // weight gradient - float* dwptr = update_params->w; - for (i = 0; i < a->rows; i++) - { - if (dptr[i]) - { - float* xptr = x->data.f32; - for (j = 0; j < x->rows; j++) - dwptr[j] += aptr[i] * xptr[j]; - } - dwptr += x->rows; - } - // propagate error - if (db) - { - ccv_zero(db); - float* wptr = layer->w; - for (i = 0; i < a->rows; i++) - { - if (dptr[i]) - { - float* bptr = db->data.f32; - for (j = 0; j < 
db->rows; j++) - bptr[j] += wptr[j] * aptr[i]; - } - wptr += x->rows; - } - } - } else { - // compute bias gradient - ccv_dense_matrix_t bias = ccv_dense_matrix(a->rows, 1, CCV_32F | CCV_C1, update_params->bias, 0); - ccv_dense_matrix_t* dbias = &bias; - ccv_add(a, dbias, (ccv_matrix_t**)&dbias, 0); - // compute weight gradient - ccv_gemm(a, x, 1, dw, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&dw, 0); - w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, layer->w, 0); - // propagate error - if (db) - { - db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1; - db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type); - ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0); - db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch; - db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type); - } + db->rows = x->rows, db->cols = x->cols, db->type = (db->type - x_ch) | CCV_C1; + db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type); + ccv_gemm(&w, a, 1, 0, 0, CCV_A_TRANSPOSE, (ccv_matrix_t**)&db, 0); + db->rows = x_rows, db->cols = x_cols, db->type = (db->type - CCV_GET_CHANNEL(db->type)) | x_ch; + db->step = db->cols * CCV_GET_DATA_TYPE_SIZE(db->type) * CCV_GET_CHANNEL(db->type); } x->rows = x_rows, x->cols = x_cols, x->type = (x->type - CCV_GET_CHANNEL(x->type)) | x_ch; x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type) * CCV_GET_CHANNEL(x->type); @@ -686,8 +964,8 @@ static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* la static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* n, ccv_dense_matrix_t* m, ccv_dense_matrix_t* denoms, ccv_dense_matrix_t** b) { - int rows, cols; - _ccv_convnet_layer_deduce_output_format(layer, &rows, &cols); + int rows, cols, partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &rows, &cols, &partition); int size = layer->net.rnorm.size; float alpha = layer->net.rnorm.alpha; float beta = layer->net.rnorm.beta; @@ -696,22 +974,24 @@ static void _ccv_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, cc int ch = CCV_GET_CHANNEL(a->type); int type = CCV_32F | ch; ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, type, type, 0); - int i, j, k, x; + int i, j, k, x, p; float* ap = a->data.f32; float* np = n->data.f32; float* mp = m->data.f32; float* dp = denoms->data.f32; float* bp = db->data.f32; + int ch_per_partition = ch / partition; for (i = 0; i < db->rows; i++) { for (j = 0; j < db->cols; j++) - for (k = 0; k < ch; k++) - { - float nom = 0; - for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch - 1); x++) - nom += -2 * alpha * beta * ap[j * ch + x] * np[j * ch + x] / dp[j * ch + x]; - bp[j * ch + k] = mp[j * ch + k] * nom + ap[j * ch + k] * powf(dp[j * ch + k], -beta); - } + for (p = 0; p < partition; p++) + for (k = 0; k < ch_per_partition; k++) + { + float nom = 0; + for (x = ccv_max(k - way, 0); x <= ccv_min(k + way, ch_per_partition - 1); x++) + nom += -2 * alpha * beta * ap[j * ch + x + p * ch_per_partition] * np[j * ch + x + p * ch_per_partition] / dp[j * ch + x + p * ch_per_partition]; + bp[j * ch + k + p * ch_per_partition] = mp[j * ch + k + p * ch_per_partition] * nom + ap[j * ch + k + p * ch_per_partition] * powf(dp[j * ch + k + p * ch_per_partition], -beta); + } ap += a->cols * ch; np += n->cols * ch; mp += m->cols * ch; @@ -812,17 +1092,17 @@ static void 
_ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix int i; ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1; assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has too be a full connect one to generate softmax result - _ccv_convnet_full_connect_backward_propagate(layer, dloss, 0, convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1); + _ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1); for (i = convnet->count - 2; i >= 0; i--) { layer = convnet->layers + i; switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - _ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], convnet->dors[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i); + _ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i); break; case CCV_CONVNET_FULL_CONNECT: - _ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->dors[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i); + _ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i); break; case CCV_CONVNET_LOCAL_RESPONSE_NORM: _ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? 
update_params->acts + i - 1 : 0); @@ -837,9 +1117,10 @@ static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix } } -static void _ccv_convnet_update(ccv_convnet_t* convnet, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params) +static void _ccv_convnet_update(ccv_convnet_t* convnet, int batch, ccv_convnet_t* momentum, ccv_convnet_t* update_params, ccv_convnet_layer_train_param_t* layer_params) { int i, j; + float learn_rate; for (i = 0; i < convnet->count; i++) switch (update_params->layers[i].type) { @@ -848,17 +1129,19 @@ static void _ccv_convnet_update(ccv_convnet_t* convnet, ccv_convnet_t* momentum, float* w = convnet->layers[i].w; float* vw = momentum->layers[i].w; float* dw = update_params->layers[i].w; + learn_rate = layer_params[i].w.learn_rate / batch; for (j = 0; j < convnet->layers[i].wnum; j++) { - vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + layer_params[i].w.learn_rate * dw[j]; + vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j]; w[j] += vw[j]; } float* bias = convnet->layers[i].bias; float* vbias = momentum->layers[i].bias; float* dbias = update_params->layers[i].bias; + learn_rate = layer_params[i].bias.learn_rate / batch; for (j = 0; j < convnet->layers[i].net.convolutional.count; j++) { - vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + layer_params[i].bias.learn_rate * dbias[j]; + vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j]; bias[j] += vbias[j]; } break; @@ -868,17 +1151,19 @@ static void _ccv_convnet_update(ccv_convnet_t* convnet, ccv_convnet_t* momentum, float* w = convnet->layers[i].w; float* vw = momentum->layers[i].w; float* dw = update_params->layers[i].w; + learn_rate = layer_params[i].w.learn_rate / batch; for (j = 0; j < convnet->layers[i].wnum; j++) { - vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + layer_params[i].w.learn_rate * dw[j]; + vw[j] = layer_params[i].w.momentum * vw[j] - layer_params[i].w.decay * layer_params[i].w.learn_rate * w[j] + learn_rate * dw[j]; w[j] += vw[j]; } float* bias = convnet->layers[i].bias; float* vbias = momentum->layers[i].bias; float* dbias = update_params->layers[i].bias; + learn_rate = layer_params[i].bias.learn_rate / batch; for (j = 0; j < convnet->layers[i].net.full_connect.count; j++) { - vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + layer_params[i].bias.learn_rate * dbias[j]; + vbias[j] = layer_params[i].bias.momentum * vbias[j] - layer_params[i].bias.decay * layer_params[i].bias.learn_rate * bias[j] + learn_rate * dbias[j]; bias[j] += vbias[j]; } break; @@ -912,26 +1197,29 @@ static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet) update_params->acts = (ccv_dense_matrix_t**)(update_params->layers + convnet->count); memset(update_params->acts, 0, sizeof(ccv_dense_matrix_t*) * convnet->count); update_params->denoms = 0; - update_params->dors = 0; + update_params->input = convnet->input; update_params->rows = convnet->rows; update_params->cols = convnet->cols; update_params->count = convnet->count; update_params->channels = convnet->channels; + update_params->mean_activity = 0; 
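/* Editor's note: a sketch of the update rule applied in _ccv_convnet_update above, not
 * part of the patch. With dw accumulated over a mini-batch of `batch` examples, every
 * parameter follows momentum SGD with weight decay:
 *
 *   v <- momentum * v - decay * learn_rate * w + (learn_rate / batch) * dw
 *   w <- w + v
 *
 * Only the gradient term is divided by `batch`, which averages the summed per-example
 * gradients so learn_rate keeps the same meaning for any mini-batch size. The
 * update_params structure built below mirrors the layer layout so dw and dbias can be
 * accumulated in place between updates. */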
int i; for (i = 0; i < convnet->count; i++) { update_params->layers[i].type = convnet->layers[i].type; + update_params->layers[i].input = convnet->layers[i].input; update_params->layers[i].net = convnet->layers[i].net; update_params->layers[i].wnum = convnet->layers[i].wnum; + update_params->layers[i].reserved = 0; switch (update_params->layers[i].type) { case CCV_CONVNET_CONVOLUTIONAL: - update_params->layers[i].w = (float*)cccalloc(sizeof(float), update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count); + update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.convolutional.count, sizeof(float)); update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum; break; case CCV_CONVNET_FULL_CONNECT: assert(update_params->layers[i].wnum % update_params->layers[i].net.full_connect.count == 0); - update_params->layers[i].w = (float*)cccalloc(sizeof(float), update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count); + update_params->layers[i].w = (float*)cccalloc(update_params->layers[i].wnum + update_params->layers[i].net.full_connect.count, sizeof(float)); update_params->layers[i].bias = update_params->layers[i].w + update_params->layers[i].wnum; break; case CCV_CONVNET_LOCAL_RESPONSE_NORM: @@ -945,6 +1233,39 @@ static ccv_convnet_t* _ccv_convnet_update_new(ccv_convnet_t* convnet) return update_params; } +static void _ccv_convnet_compute_softmax(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type) +{ + int ch = CCV_GET_CHANNEL(a->type); + assert(CCV_GET_DATA_TYPE(a->type) == CCV_32F); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, a->rows, a->cols, CCV_32F | ch, CCV_32F | ch, 0); + int i; + float* aptr = a->data.f32; + float* bptr = db->data.f32; + double max = aptr[0]; + for (i = 1; i < a->rows * a->cols * ch; i++) + if (aptr[i] > max) + max = aptr[i]; + double tt = 0; + for (i = 0; i < a->rows * a->cols * ch; i++) + tt += (bptr[i] = expf(aptr[i] - max)); + tt = 1.0 / tt; + for (i = 0; i < a->rows * a->cols * ch; i++) + bptr[i] *= tt; +} + +static void _ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch) +{ + assert(batch == 1); + ccv_convnet_encode(convnet, a, convnet->acts + convnet->count - 1, 1); + int i, c = 0; + ccv_dense_matrix_t* b = convnet->acts[convnet->count - 1]; + float maxc = b->data.f32[0]; + for (i = 1; i < b->rows; i++) + if (b->data.f32[i] > maxc) + maxc = b->data.f32[i], c = i; + labels[0] = c; +} + #endif #ifndef CASE_TESTS @@ -991,8 +1312,10 @@ void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz { FLUSH(" - at epoch %03d / %d => stochastic gradient descent at %d / %d", t + 1, params.max_epoch, (i + 1) / params.mini_batch, aligned_rnum / params.mini_batch); // update weights - _ccv_convnet_update(convnet, momentum, update_params, params.layer_params); + _ccv_convnet_update(convnet, params.mini_batch, momentum, update_params, params.layer_params); _ccv_convnet_update_zero(update_params); + // compact the convnet to avoid any stale temporary resources + ccv_convnet_compact(convnet); } } int miss = 0; @@ -1001,7 +1324,7 @@ void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz FLUSH(" - at epoch %03d / %d => going through %d / %d for tests", t + 1, params.max_epoch, i + 1, tests->rnum); ccv_categorized_t* test = (ccv_categorized_t*)ccv_array_get(tests, i); int c = 0; - ccv_convnet_classify(convnet, &test->matrix, &c, 1); + 
_ccv_convnet_classify(convnet, &test->matrix, &c, 1); if (c != test->c) ++miss; } @@ -1044,14 +1367,12 @@ void ccv_convnet_compact(ccv_convnet_t* convnet) ccv_matrix_free(convnet->denoms[i]); convnet->denoms[i] = 0; } - } - if (convnet->dors) - for (i = 0; i < convnet->count - 1; i++) + if (SIMD(convnet->layers + i)) { - if (convnet->dors[i]) - ccv_matrix_free(convnet->dors[i]); - convnet->dors[i] = 0; + ccfree(convnet->layers[i].reserved); + convnet->layers[i].reserved = 0; } + } } void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params) @@ -1062,22 +1383,24 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet const char layer_create_table_qs[] = "CREATE TABLE IF NOT EXISTS layer_params " "(layer INTEGER PRIMARY KEY ASC, type INTEGER, " - "input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_node_count INTEGER, " - "output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, " - "output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL);" + "input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, " + "output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, " + "output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);" + "CREATE TABLE IF NOT EXISTS convnet_params " + "(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);" "CREATE TABLE IF NOT EXISTS layer_data " "(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB, half_precision INTEGER);"; assert(SQLITE_OK == sqlite3_exec(db, layer_create_table_qs, 0, 0, 0)); const char layer_params_insert_qs[] = "REPLACE INTO layer_params " "(layer, type, " - "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_node_count, " - "output_rows, output_cols, output_channels, output_count, output_strides, output_border, " - "output_size, output_kappa, output_alpha, output_beta) VALUES " + "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " + "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " + "output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES " "($layer, $type, " // 1 - "$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_node_count, " // 5 - "$output_rows, $output_cols, $output_channels, $output_count, $output_strides, $output_border, " // 11 - "$output_size, $output_kappa, $output_alpha, $output_beta);"; // 14 + "$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6 + "$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13 + "$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18 sqlite3_stmt* layer_params_insert_stmt = 0; assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0)); const char layer_data_insert_qs[] = @@ -1095,31 +1418,34 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet sqlite3_bind_int(layer_params_insert_stmt, 3, 
layer->input.matrix.rows); sqlite3_bind_int(layer_params_insert_stmt, 4, layer->input.matrix.cols); sqlite3_bind_int(layer_params_insert_stmt, 5, layer->input.matrix.channels); - sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.node.count); + sqlite3_bind_int(layer_params_insert_stmt, 6, layer->input.matrix.partition); + sqlite3_bind_int(layer_params_insert_stmt, 7, layer->input.node.count); switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - sqlite3_bind_int(layer_params_insert_stmt, 7, layer->net.convolutional.rows); - sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.cols); - sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.channels); - sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.count); - sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.strides); - sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.border); + sqlite3_bind_int(layer_params_insert_stmt, 8, layer->net.convolutional.rows); + sqlite3_bind_int(layer_params_insert_stmt, 9, layer->net.convolutional.cols); + sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.convolutional.channels); + sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.convolutional.partition); + sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.convolutional.count); + sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.convolutional.strides); + sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.convolutional.border); break; case CCV_CONVNET_FULL_CONNECT: - sqlite3_bind_int(layer_params_insert_stmt, 10, layer->net.full_connect.count); + sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count); + sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu); break; case CCV_CONVNET_MAX_POOL: case CCV_CONVNET_AVERAGE_POOL: - sqlite3_bind_int(layer_params_insert_stmt, 11, layer->net.pool.strides); - sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.pool.border); - sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.size); + sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.pool.strides); + sqlite3_bind_int(layer_params_insert_stmt, 14, layer->net.pool.border); + sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.pool.size); break; case CCV_CONVNET_LOCAL_RESPONSE_NORM: - sqlite3_bind_int(layer_params_insert_stmt, 13, layer->net.rnorm.size); - sqlite3_bind_double(layer_params_insert_stmt, 14, layer->net.rnorm.kappa); - sqlite3_bind_double(layer_params_insert_stmt, 15, layer->net.rnorm.alpha); - sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.beta); + sqlite3_bind_int(layer_params_insert_stmt, 15, layer->net.rnorm.size); + sqlite3_bind_double(layer_params_insert_stmt, 16, layer->net.rnorm.kappa); + sqlite3_bind_double(layer_params_insert_stmt, 17, layer->net.rnorm.alpha); + sqlite3_bind_double(layer_params_insert_stmt, 18, layer->net.rnorm.beta); break; } assert(SQLITE_DONE == sqlite3_step(layer_params_insert_stmt)); @@ -1147,8 +1473,26 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet sqlite3_clear_bindings(layer_data_insert_stmt); } } + // insert convnet related params + const char convnet_params_insert_qs[] = + "REPLACE INTO convnet_params " + "(convnet, mean_activity, input_height, input_width) VALUES (0, $mean_activity, $input_height, $input_width);"; + sqlite3_stmt* convnet_params_insert_stmt = 0; + assert(SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_insert_qs, 
sizeof(convnet_params_insert_qs), &convnet_params_insert_stmt, 0)); + assert(convnet->mean_activity->rows == convnet->input.height); + assert(convnet->mean_activity->cols == convnet->input.width); + assert(CCV_GET_CHANNEL(convnet->mean_activity->type) == convnet->channels); + assert(CCV_GET_DATA_TYPE(convnet->mean_activity->type) == CCV_32F); + sqlite3_bind_blob(convnet_params_insert_stmt, 1, convnet->mean_activity->data.f32, sizeof(float) * convnet->input.height * convnet->input.width * convnet->channels, SQLITE_STATIC); + sqlite3_bind_int(convnet_params_insert_stmt, 2, convnet->input.height); + sqlite3_bind_int(convnet_params_insert_stmt, 3, convnet->input.width); + assert(SQLITE_DONE == sqlite3_step(convnet_params_insert_stmt)); + sqlite3_reset(convnet_params_insert_stmt); + sqlite3_clear_bindings(convnet_params_insert_stmt); + sqlite3_finalize(layer_params_insert_stmt); sqlite3_finalize(layer_data_insert_stmt); + sqlite3_finalize(convnet_params_insert_stmt); sqlite3_close(db); } } @@ -1163,9 +1507,9 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename) // load layer params const char layer_params_qs[] = "SELECT type, " // 1 - "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_node_count, " // 5 - "output_rows, output_cols, output_channels, output_count, output_strides, output_border, " // 11 - "output_size, output_kappa, output_alpha, output_beta FROM layer_params ORDER BY layer ASC;"; // 14 + "input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6 + "output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13 + "output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18 if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0)) { ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0); @@ -1176,45 +1520,64 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename) layer_param.input.matrix.rows = sqlite3_column_int(layer_params_stmt, 1); layer_param.input.matrix.cols = sqlite3_column_int(layer_params_stmt, 2); layer_param.input.matrix.channels = sqlite3_column_int(layer_params_stmt, 3); - layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 4); + layer_param.input.matrix.partition = sqlite3_column_int(layer_params_stmt, 4); + layer_param.input.node.count = sqlite3_column_int(layer_params_stmt, 5); layer_param.bias = layer_param.sigma = 0; // this is irrelevant to read convnet switch (layer_param.type) { case CCV_CONVNET_CONVOLUTIONAL: - layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 5); - layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 6); - layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 7); - layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 8); - layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 9); - layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 10); + layer_param.output.convolutional.rows = sqlite3_column_int(layer_params_stmt, 6); + layer_param.output.convolutional.cols = sqlite3_column_int(layer_params_stmt, 7); + layer_param.output.convolutional.channels = sqlite3_column_int(layer_params_stmt, 8); + layer_param.output.convolutional.partition = sqlite3_column_int(layer_params_stmt, 9); + 
layer_param.output.convolutional.count = sqlite3_column_int(layer_params_stmt, 10); + layer_param.output.convolutional.strides = sqlite3_column_int(layer_params_stmt, 11); + layer_param.output.convolutional.border = sqlite3_column_int(layer_params_stmt, 12); break; case CCV_CONVNET_FULL_CONNECT: - layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 8); + layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10); + layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17); break; case CCV_CONVNET_MAX_POOL: case CCV_CONVNET_AVERAGE_POOL: - layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 9); - layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 10); - layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 11); + layer_param.output.pool.strides = sqlite3_column_int(layer_params_stmt, 11); + layer_param.output.pool.border = sqlite3_column_int(layer_params_stmt, 12); + layer_param.output.pool.size = sqlite3_column_int(layer_params_stmt, 13); break; case CCV_CONVNET_LOCAL_RESPONSE_NORM: - layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 11); - layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 12); - layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 13); - layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 14); + layer_param.output.rnorm.size = sqlite3_column_int(layer_params_stmt, 13); + layer_param.output.rnorm.kappa = sqlite3_column_double(layer_params_stmt, 14); + layer_param.output.rnorm.alpha = sqlite3_column_double(layer_params_stmt, 15); + layer_param.output.rnorm.beta = sqlite3_column_double(layer_params_stmt, 16); break; } ccv_array_push(layer_params, &layer_param); } sqlite3_finalize(layer_params_stmt); - convnet = ccv_convnet_new(use_cwc_accel, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum); + sqlite3_stmt* convnet_params_input_stmt = 0; + // load convnet params for input + const char convnet_params_input_qs[] = + "SELECT input_height, input_width FROM convnet_params WHERE convnet = 0;"; + ccv_size_t input = ccv_size(0, 0); + if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_input_qs, sizeof(convnet_params_input_qs), &convnet_params_input_stmt, 0)) + { + if (sqlite3_step(convnet_params_input_stmt) == SQLITE_ROW) + { + input.height = sqlite3_column_int(convnet_params_input_stmt, 0); + input.width = sqlite3_column_int(convnet_params_input_stmt, 1); + } + sqlite3_finalize(convnet_params_input_stmt); + } + assert(input.height != 0 && input.width != 0); + convnet = ccv_convnet_new(use_cwc_accel, input, (ccv_convnet_layer_param_t*)ccv_array_get(layer_params, 0), layer_params->rnum); + ccv_array_free(layer_params); // load layer data sqlite3_stmt* layer_data_stmt = 0; const char layer_data_qs[] = "SELECT layer, weight, bias, half_precision FROM layer_data;"; if (SQLITE_OK == sqlite3_prepare_v2(db, layer_data_qs, sizeof(layer_data_qs), &layer_data_stmt, 0)) { - while(sqlite3_step(layer_data_stmt) == SQLITE_ROW) + while (sqlite3_step(layer_data_stmt) == SQLITE_ROW) { ccv_convnet_layer_t* layer = convnet->layers + sqlite3_column_int(layer_data_stmt, 0); int half_precision = sqlite3_column_int(layer_data_stmt, 3); @@ -1252,6 +1615,20 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename) } sqlite3_finalize(layer_data_stmt); } + sqlite3_stmt* convnet_params_mean_activity_stmt = 0; + // load convnet params for mean activity 
+ const char convnet_params_mean_activity_qs[] = + "SELECT mean_activity FROM convnet_params WHERE convnet = 0;"; + if (SQLITE_OK == sqlite3_prepare_v2(db, convnet_params_mean_activity_qs, sizeof(convnet_params_mean_activity_qs), &convnet_params_mean_activity_stmt, 0)) + { + if (sqlite3_step(convnet_params_mean_activity_stmt) == SQLITE_ROW) + { + int elems = sqlite3_column_bytes(convnet_params_mean_activity_stmt, 0) / sizeof(float); + if (elems == convnet->input.height * convnet->input.width * convnet->channels) + memcpy(convnet->mean_activity->data.f32, sqlite3_column_blob(convnet_params_mean_activity_stmt, 0), sizeof(float) * elems); + } + sqlite3_finalize(convnet_params_mean_activity_stmt); + } } sqlite3_close(db); return convnet; @@ -1259,6 +1636,16 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename) return 0; } +void ccv_convnet_input_formation(ccv_convnet_t* convnet, ccv_dense_matrix_t* a, ccv_dense_matrix_t** b) +{ + if (a->rows > convnet->input.height && a->cols > convnet->input.width) + ccv_resample(a, b, CCV_32F, ccv_max(convnet->input.height, (int)(a->rows * (float)convnet->input.height / a->cols + 0.5)), ccv_max(convnet->input.width, (int)(a->cols * (float)convnet->input.width / a->rows + 0.5)), CCV_INTER_AREA); + else if (a->rows < convnet->input.height || a->cols < convnet->input.width) + ccv_resample(a, b, CCV_32F, ccv_max(convnet->input.height, (int)(a->rows * (float)convnet->input.height / a->cols + 0.5)), ccv_max(convnet->input.width, (int)(a->cols * (float)convnet->input.width / a->rows + 0.5)), CCV_INTER_CUBIC); + else + ccv_shift(a, (ccv_matrix_t**)b, CCV_32F, 0, 0); // converting to 32f +} + void ccv_convnet_free(ccv_convnet_t* convnet) { ccv_convnet_compact(convnet); @@ -1266,6 +1653,8 @@ void ccv_convnet_free(ccv_convnet_t* convnet) for (i = 0; i < convnet->count; i++) if (convnet->layers[i].w) ccfree(convnet->layers[i].w); + if (convnet->mean_activity) + ccv_matrix_free(convnet->mean_activity); ccfree(convnet); } diff --git a/lib/ccv_dpm.c b/lib/ccv_dpm.c index 2a1b68bdd..1335455a7 100644 --- a/lib/ccv_dpm.c +++ b/lib/ccv_dpm.c @@ -2031,7 +2031,7 @@ static int _ccv_is_equal_same_class(const void* _r1, const void* _r2, void* data const ccv_root_comp_t* r2 = (const ccv_root_comp_t*)_r2; int distance = (int)(ccv_min(r1->rect.width, r1->rect.height) * 0.25 + 0.5); - return r2->id == r1->id && + return r2->classification.id == r1->classification.id && r2->rect.x <= r1->rect.x + distance && r2->rect.x >= r1->rect.x - distance && r2->rect.y <= r1->rect.y + distance && @@ -2053,11 +2053,11 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model ccv_dense_matrix_t** pyr = (ccv_dense_matrix_t**)alloca((scale_upto + next * 2) * sizeof(ccv_dense_matrix_t*)); _ccv_dpm_feature_pyramid(a, pyr, scale_upto, params.interval); ccv_array_t* idx_seq; - ccv_array_t* seq = ccv_array_new(sizeof(ccv_root_comp_t), 64, 0); - ccv_array_t* seq2 = ccv_array_new(sizeof(ccv_root_comp_t), 64, 0); ccv_array_t* result_seq = ccv_array_new(sizeof(ccv_root_comp_t), 64, 0); for (c = 0; c < count; c++) { + ccv_array_t* seq = ccv_array_new(sizeof(ccv_root_comp_t), 64, 0); + ccv_array_t* seq2 = ccv_array_new(sizeof(ccv_root_comp_t), 64, 0); ccv_dpm_mixture_model_t* model = _model[c]; double scale_x = 1.0; double scale_y = 1.0; @@ -2085,9 +2085,9 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model if (f_ptr[x] + root->beta > params.threshold) { ccv_root_comp_t comp; - comp.id = c + 1; comp.neighbors = 1; - comp.confidence = 
f_ptr[x] + root->beta; + comp.classification.id = c + 1; + comp.classification.confidence = f_ptr[x] + root->beta; comp.pnum = root->count; float drift_x = root->alpha[0], drift_y = root->alpha[1], @@ -2095,8 +2095,8 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model for (k = 0; k < root->count; k++) { ccv_dpm_part_classifier_t* part = root->part + k; - comp.part[k].id = c; comp.part[k].neighbors = 1; + comp.part[k].classification.id = c; int pww = (part->w->cols - 1) / 2, pwh = (part->w->rows - 1) / 2; int offy = part->y + pwh - rwh * 2; int offx = part->x + pww - rww * 2; @@ -2110,7 +2110,7 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model ry = iy - ry; rx = ix - rx; comp.part[k].rect = ccv_rect((int)((rx - pww) * CCV_DPM_WINDOW_SIZE / 2 * scale_x + 0.5), (int)((ry - pwh) * CCV_DPM_WINDOW_SIZE / 2 * scale_y + 0.5), (int)(part->w->cols * CCV_DPM_WINDOW_SIZE / 2 * scale_x + 0.5), (int)(part->w->rows * CCV_DPM_WINDOW_SIZE / 2 * scale_y + 0.5)); - comp.part[k].confidence = -ccv_get_dense_matrix_cell_value_by(CCV_32F | CCV_C1, part_feature[k], iy, ix, 0); + comp.part[k].classification.confidence = -ccv_get_dense_matrix_cell_value_by(CCV_32F | CCV_C1, part_feature[k], iy, ix, 0); } comp.rect = ccv_rect((int)((x + drift_x) * CCV_DPM_WINDOW_SIZE * scale_x - rww * CCV_DPM_WINDOW_SIZE * scale_x * (1.0 + drift_scale) + 0.5), (int)((y + drift_y) * CCV_DPM_WINDOW_SIZE * scale_y - rwh * CCV_DPM_WINDOW_SIZE * scale_y * (1.0 + drift_scale) + 0.5), (int)(root->root.w->cols * CCV_DPM_WINDOW_SIZE * scale_x * (1.0 + drift_scale) + 0.5), (int)(root->root.w->rows * CCV_DPM_WINDOW_SIZE * scale_y * (1.0 + drift_scale) + 0.5)); ccv_array_push(seq, &comp); @@ -2150,12 +2150,12 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model ccv_root_comp_t r1 = *(ccv_root_comp_t*)ccv_array_get(seq, i); int idx = *(int*)ccv_array_get(idx_seq, i); - comps[idx].id = r1.id; + comps[idx].classification.id = r1.classification.id; comps[idx].pnum = r1.pnum; - if (r1.confidence > comps[idx].confidence || comps[idx].neighbors == 0) + if (r1.classification.confidence > comps[idx].classification.confidence || comps[idx].neighbors == 0) { comps[idx].rect = r1.rect; - comps[idx].confidence = r1.confidence; + comps[idx].classification.confidence = r1.classification.confidence; memcpy(comps[idx].part, r1.part, sizeof(ccv_comp_t) * CCV_DPM_PART_MAX); } @@ -2179,15 +2179,15 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model { ccv_root_comp_t r1 = *(ccv_root_comp_t*)ccv_array_get(seq2, j); if (i != j && - abs(r1.id) == r2->id && + abs(r1.classification.id) == r2->classification.id && r1.rect.x >= r2->rect.x - distance && r1.rect.y >= r2->rect.y - distance && r1.rect.x + r1.rect.width <= r2->rect.x + r2->rect.width + distance && r1.rect.y + r1.rect.height <= r2->rect.y + r2->rect.height + distance && // if r1 (the smaller one) is better, mute r2 - (r2->confidence <= r1.confidence && r2->neighbors < r1.neighbors)) + (r2->classification.confidence <= r1.classification.confidence && r2->neighbors < r1.neighbors)) { - r2->id = -r2->id; + r2->classification.id = -r2->classification.id; break; } } @@ -2197,7 +2197,7 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model for (i = 0; i < seq2->rnum; i++) { ccv_root_comp_t r1 = *(ccv_root_comp_t*)ccv_array_get(seq2, i); - if (r1.id > 0) + if (r1.classification.id > 0) { int flag = 1; @@ -2207,12 +2207,12 @@ ccv_array_t* 
ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model int distance = (int)(ccv_min(r2.rect.width, r2.rect.height) * 0.25 + 0.5); if (i != j && - r1.id == abs(r2.id) && + r1.classification.id == abs(r2.classification.id) && r1.rect.x >= r2.rect.x - distance && r1.rect.y >= r2.rect.y - distance && r1.rect.x + r1.rect.width <= r2.rect.x + r2.rect.width + distance && r1.rect.y + r1.rect.height <= r2.rect.y + r2.rect.height + distance && - (r2.confidence > r1.confidence || r2.neighbors >= r1.neighbors)) + (r2.classification.confidence > r1.classification.confidence || r2.neighbors >= r1.neighbors)) { flag = 0; break; @@ -2226,14 +2226,13 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model ccv_array_free(idx_seq); ccfree(comps); } + ccv_array_free(seq); + ccv_array_free(seq2); } for (i = 0; i < scale_upto + next * 2; i++) ccv_matrix_free(pyr[i]); - ccv_array_free(seq); - ccv_array_free(seq2); - ccv_array_t* result_seq2; /* the following code from OpenCV's haar feature implementation */ if (params.flags & CCV_DPM_NO_NESTED) @@ -2251,12 +2250,12 @@ ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model ccv_root_comp_t r1 = *(ccv_root_comp_t*)ccv_array_get(result_seq, i); int idx = *(int*)ccv_array_get(idx_seq, i); - if (comps[idx].neighbors == 0 || comps[idx].confidence < r1.confidence) + if (comps[idx].neighbors == 0 || comps[idx].classification.confidence < r1.classification.confidence) { - comps[idx].confidence = r1.confidence; + comps[idx].classification.confidence = r1.classification.confidence; comps[idx].neighbors = 1; comps[idx].rect = r1.rect; - comps[idx].id = r1.id; + comps[idx].classification.id = r1.classification.id; comps[idx].pnum = r1.pnum; memcpy(comps[idx].part, r1.part, sizeof(ccv_comp_t) * CCV_DPM_PART_MAX); } diff --git a/lib/ccv_icf.c b/lib/ccv_icf.c index 58bb452b8..5b54f92c3 100644 --- a/lib/ccv_icf.c +++ b/lib/ccv_icf.c @@ -16,32 +16,270 @@ const ccv_icf_param_t ccv_icf_default_params = { .interval = 8, }; -// cube root approximation using bit hack for 32-bit float -// provides a very crude approximation -static inline float cbrt_5_f32(float f) -{ - unsigned int* p = (unsigned int*)(&f); - *p = *p / 3 + 709921077; - return f; -} - -// iterative cube root approximation using Halley's method (float) -static inline float cbrta_halley_f32(const float a, const float R) -{ - const float a3 = a * a * a; - const float b = a * (a3 + R + R) / (a3 + a3 + R); - return b; -} - -// Code based on -// http://metamerist.com/cbrt/cbrt.htm -// cube root approximation using 2 iterations of Halley's method (float) -// this is expected to be ~2.5x times faster than std::pow(x, 3) +// this uses a look up table for cubic root computation because rgb to luv only requires data within range of 0~1 static inline float fast_cube_root(const float d) { - float a = cbrt_5_f32(d); - a = cbrta_halley_f32(a, d); - return cbrta_halley_f32(a, d); + static const float cube_root[2048] = { + 0.000000e+00, 7.875788e-02, 9.922871e-02, 1.135885e-01, 1.250203e-01, 1.346741e-01, 1.431126e-01, 1.506584e-01, + 1.575158e-01, 1.638230e-01, 1.696787e-01, 1.751560e-01, 1.803105e-01, 1.851861e-01, 1.898177e-01, 1.942336e-01, + 1.984574e-01, 2.025087e-01, 2.064040e-01, 2.101577e-01, 2.137818e-01, 2.172870e-01, 2.206827e-01, 2.239769e-01, + 2.271770e-01, 2.302894e-01, 2.333199e-01, 2.362736e-01, 2.391553e-01, 2.419692e-01, 2.447191e-01, 2.474085e-01, + 2.500407e-01, 2.526186e-01, 2.551450e-01, 2.576222e-01, 2.600528e-01, 2.624387e-01, 2.647821e-01, 
2.670846e-01, + 2.693482e-01, 2.715743e-01, 2.737645e-01, 2.759202e-01, 2.780428e-01, 2.801334e-01, 2.821933e-01, 2.842235e-01, + 2.862251e-01, 2.881992e-01, 2.901465e-01, 2.920681e-01, 2.939647e-01, 2.958371e-01, 2.976862e-01, 2.995125e-01, + 3.013168e-01, 3.030998e-01, 3.048621e-01, 3.066041e-01, 3.083267e-01, 3.100302e-01, 3.117152e-01, 3.133821e-01, + 3.150315e-01, 3.166639e-01, 3.182795e-01, 3.198789e-01, 3.214625e-01, 3.230307e-01, 3.245837e-01, 3.261220e-01, + 3.276460e-01, 3.291559e-01, 3.306521e-01, 3.321348e-01, 3.336045e-01, 3.350613e-01, 3.365056e-01, 3.379375e-01, + 3.393574e-01, 3.407656e-01, 3.421622e-01, 3.435475e-01, 3.449216e-01, 3.462850e-01, 3.476377e-01, 3.489799e-01, + 3.503119e-01, 3.516339e-01, 3.529460e-01, 3.542483e-01, 3.555412e-01, 3.568248e-01, 3.580992e-01, 3.593646e-01, + 3.606211e-01, 3.618689e-01, 3.631082e-01, 3.643391e-01, 3.655617e-01, 3.667762e-01, 3.679827e-01, 3.691814e-01, + 3.703723e-01, 3.715556e-01, 3.727314e-01, 3.738999e-01, 3.750610e-01, 3.762151e-01, 3.773621e-01, 3.785022e-01, + 3.796354e-01, 3.807619e-01, 3.818818e-01, 3.829952e-01, 3.841021e-01, 3.852027e-01, 3.862970e-01, 3.873852e-01, + 3.884673e-01, 3.895434e-01, 3.906136e-01, 3.916779e-01, 3.927365e-01, 3.937894e-01, 3.948367e-01, 3.958785e-01, + 3.969149e-01, 3.979458e-01, 3.989714e-01, 3.999918e-01, 4.010071e-01, 4.020171e-01, 4.030222e-01, 4.040223e-01, + 4.050174e-01, 4.060076e-01, 4.069931e-01, 4.079738e-01, 4.089499e-01, 4.099212e-01, 4.108880e-01, 4.118503e-01, + 4.128081e-01, 4.137615e-01, 4.147105e-01, 4.156551e-01, 4.165955e-01, 4.175317e-01, 4.184637e-01, 4.193916e-01, + 4.203153e-01, 4.212351e-01, 4.221508e-01, 4.230626e-01, 4.239704e-01, 4.248744e-01, 4.257746e-01, 4.266710e-01, + 4.275636e-01, 4.284525e-01, 4.293377e-01, 4.302193e-01, 4.310973e-01, 4.319718e-01, 4.328427e-01, 4.337101e-01, + 4.345741e-01, 4.354346e-01, 4.362918e-01, 4.371456e-01, 4.379961e-01, 4.388433e-01, 4.396872e-01, 4.405279e-01, + 4.413654e-01, 4.421997e-01, 4.430309e-01, 4.438590e-01, 4.446840e-01, 4.455060e-01, 4.463249e-01, 4.471409e-01, + 4.479539e-01, 4.487639e-01, 4.495711e-01, 4.503753e-01, 4.511767e-01, 4.519752e-01, 4.527710e-01, 4.535639e-01, + 4.543541e-01, 4.551415e-01, 4.559263e-01, 4.567083e-01, 4.574877e-01, 4.582644e-01, 4.590385e-01, 4.598100e-01, + 4.605789e-01, 4.613453e-01, 4.621091e-01, 4.628704e-01, 4.636292e-01, 4.643855e-01, 4.651394e-01, 4.658908e-01, + 4.666398e-01, 4.673865e-01, 4.681307e-01, 4.688726e-01, 4.696122e-01, 4.703494e-01, 4.710843e-01, 4.718169e-01, + 4.725473e-01, 4.732754e-01, 4.740013e-01, 4.747250e-01, 4.754464e-01, 4.761657e-01, 4.768828e-01, 4.775978e-01, + 4.783106e-01, 4.790214e-01, 4.797300e-01, 4.804365e-01, 4.811410e-01, 4.818434e-01, 4.825437e-01, 4.832420e-01, + 4.839384e-01, 4.846327e-01, 4.853250e-01, 4.860154e-01, 4.867038e-01, 4.873902e-01, 4.880748e-01, 4.887574e-01, + 4.894381e-01, 4.901170e-01, 4.907939e-01, 4.914690e-01, 4.921423e-01, 4.928137e-01, 4.934832e-01, 4.941510e-01, + 4.948170e-01, 4.954812e-01, 4.961436e-01, 4.968042e-01, 4.974631e-01, 4.981203e-01, 4.987757e-01, 4.994294e-01, + 5.000814e-01, 5.007317e-01, 5.013803e-01, 5.020273e-01, 5.026726e-01, 5.033162e-01, 5.039582e-01, 5.045985e-01, + 5.052372e-01, 5.058743e-01, 5.065099e-01, 5.071438e-01, 5.077761e-01, 5.084069e-01, 5.090362e-01, 5.096638e-01, + 5.102900e-01, 5.109145e-01, 5.115376e-01, 5.121592e-01, 5.127792e-01, 5.133978e-01, 5.140148e-01, 5.146304e-01, + 5.152445e-01, 5.158572e-01, 5.164684e-01, 5.170782e-01, 5.176865e-01, 5.182934e-01, 5.188988e-01, 5.195029e-01, + 
5.201056e-01, 5.207069e-01, 5.213068e-01, 5.219053e-01, 5.225024e-01, 5.230982e-01, 5.236927e-01, 5.242857e-01, + 5.248775e-01, 5.254679e-01, 5.260570e-01, 5.266448e-01, 5.272312e-01, 5.278164e-01, 5.284002e-01, 5.289828e-01, + 5.295641e-01, 5.301442e-01, 5.307229e-01, 5.313004e-01, 5.318767e-01, 5.324517e-01, 5.330254e-01, 5.335979e-01, + 5.341693e-01, 5.347394e-01, 5.353082e-01, 5.358759e-01, 5.364423e-01, 5.370076e-01, 5.375717e-01, 5.381346e-01, + 5.386963e-01, 5.392569e-01, 5.398163e-01, 5.403746e-01, 5.409316e-01, 5.414876e-01, 5.420424e-01, 5.425960e-01, + 5.431486e-01, 5.437000e-01, 5.442503e-01, 5.447995e-01, 5.453476e-01, 5.458946e-01, 5.464405e-01, 5.469853e-01, + 5.475290e-01, 5.480717e-01, 5.486133e-01, 5.491537e-01, 5.496932e-01, 5.502316e-01, 5.507689e-01, 5.513052e-01, + 5.518404e-01, 5.523747e-01, 5.529078e-01, 5.534400e-01, 5.539711e-01, 5.545012e-01, 5.550303e-01, 5.555584e-01, + 5.560855e-01, 5.566117e-01, 5.571368e-01, 5.576609e-01, 5.581840e-01, 5.587062e-01, 5.592273e-01, 5.597475e-01, + 5.602668e-01, 5.607851e-01, 5.613024e-01, 5.618188e-01, 5.623342e-01, 5.628487e-01, 5.633622e-01, 5.638748e-01, + 5.643865e-01, 5.648973e-01, 5.654072e-01, 5.659161e-01, 5.664241e-01, 5.669311e-01, 5.674374e-01, 5.679426e-01, + 5.684470e-01, 5.689505e-01, 5.694531e-01, 5.699549e-01, 5.704557e-01, 5.709556e-01, 5.714548e-01, 5.719529e-01, + 5.724503e-01, 5.729468e-01, 5.734424e-01, 5.739372e-01, 5.744311e-01, 5.749242e-01, 5.754164e-01, 5.759078e-01, + 5.763984e-01, 5.768881e-01, 5.773770e-01, 5.778650e-01, 5.783523e-01, 5.788387e-01, 5.793243e-01, 5.798091e-01, + 5.802931e-01, 5.807762e-01, 5.812586e-01, 5.817402e-01, 5.822210e-01, 5.827010e-01, 5.831801e-01, 5.836585e-01, + 5.841362e-01, 5.846130e-01, 5.850891e-01, 5.855644e-01, 5.860389e-01, 5.865127e-01, 5.869856e-01, 5.874579e-01, + 5.879294e-01, 5.884001e-01, 5.888700e-01, 5.893393e-01, 5.898077e-01, 5.902755e-01, 5.907425e-01, 5.912087e-01, + 5.916742e-01, 5.921390e-01, 5.926031e-01, 5.930664e-01, 5.935290e-01, 5.939909e-01, 5.944521e-01, 5.949125e-01, + 5.953723e-01, 5.958313e-01, 5.962896e-01, 5.967473e-01, 5.972042e-01, 5.976604e-01, 5.981160e-01, 5.985708e-01, + 5.990250e-01, 5.994784e-01, 5.999312e-01, 6.003833e-01, 6.008347e-01, 6.012855e-01, 6.017355e-01, 6.021850e-01, + 6.026337e-01, 6.030817e-01, 6.035291e-01, 6.039758e-01, 6.044219e-01, 6.048673e-01, 6.053120e-01, 6.057562e-01, + 6.061996e-01, 6.066424e-01, 6.070846e-01, 6.075261e-01, 6.079670e-01, 6.084072e-01, 6.088468e-01, 6.092858e-01, + 6.097241e-01, 6.101618e-01, 6.105989e-01, 6.110353e-01, 6.114712e-01, 6.119064e-01, 6.123410e-01, 6.127750e-01, + 6.132084e-01, 6.136411e-01, 6.140732e-01, 6.145048e-01, 6.149357e-01, 6.153660e-01, 6.157957e-01, 6.162249e-01, + 6.166534e-01, 6.170813e-01, 6.175086e-01, 6.179354e-01, 6.183616e-01, 6.187872e-01, 6.192122e-01, 6.196365e-01, + 6.200604e-01, 6.204836e-01, 6.209063e-01, 6.213284e-01, 6.217499e-01, 6.221709e-01, 6.225913e-01, 6.230111e-01, + 6.234304e-01, 6.238490e-01, 6.242672e-01, 6.246848e-01, 6.251017e-01, 6.255182e-01, 6.259341e-01, 6.263494e-01, + 6.267643e-01, 6.271785e-01, 6.275922e-01, 6.280054e-01, 6.284180e-01, 6.288301e-01, 6.292416e-01, 6.296526e-01, + 6.300631e-01, 6.304730e-01, 6.308824e-01, 6.312913e-01, 6.316996e-01, 6.321074e-01, 6.325147e-01, 6.329215e-01, + 6.333277e-01, 6.337335e-01, 6.341386e-01, 6.345433e-01, 6.349475e-01, 6.353511e-01, 6.357543e-01, 6.361569e-01, + 6.365590e-01, 6.369606e-01, 6.373618e-01, 6.377624e-01, 6.381625e-01, 6.385621e-01, 6.389612e-01, 6.393598e-01, + 6.397579e-01, 
6.401555e-01, 6.405526e-01, 6.409492e-01, 6.413454e-01, 6.417410e-01, 6.421362e-01, 6.425309e-01, + 6.429250e-01, 6.433188e-01, 6.437120e-01, 6.441047e-01, 6.444970e-01, 6.448888e-01, 6.452801e-01, 6.456710e-01, + 6.460613e-01, 6.464512e-01, 6.468406e-01, 6.472296e-01, 6.476181e-01, 6.480061e-01, 6.483937e-01, 6.487808e-01, + 6.491674e-01, 6.495536e-01, 6.499393e-01, 6.503246e-01, 6.507094e-01, 6.510937e-01, 6.514776e-01, 6.518611e-01, + 6.522441e-01, 6.526266e-01, 6.530087e-01, 6.533904e-01, 6.537716e-01, 6.541524e-01, 6.545327e-01, 6.549126e-01, + 6.552920e-01, 6.556710e-01, 6.560495e-01, 6.564277e-01, 6.568054e-01, 6.571826e-01, 6.575595e-01, 6.579359e-01, + 6.583118e-01, 6.586874e-01, 6.590625e-01, 6.594372e-01, 6.598114e-01, 6.601852e-01, 6.605586e-01, 6.609316e-01, + 6.613042e-01, 6.616763e-01, 6.620481e-01, 6.624194e-01, 6.627903e-01, 6.631607e-01, 6.635308e-01, 6.639005e-01, + 6.642697e-01, 6.646385e-01, 6.650070e-01, 6.653750e-01, 6.657426e-01, 6.661098e-01, 6.664766e-01, 6.668430e-01, + 6.672090e-01, 6.675746e-01, 6.679398e-01, 6.683046e-01, 6.686690e-01, 6.690330e-01, 6.693966e-01, 6.697598e-01, + 6.701226e-01, 6.704850e-01, 6.708471e-01, 6.712087e-01, 6.715700e-01, 6.719308e-01, 6.722913e-01, 6.726514e-01, + 6.730111e-01, 6.733705e-01, 6.737294e-01, 6.740879e-01, 6.744461e-01, 6.748039e-01, 6.751614e-01, 6.755184e-01, + 6.758750e-01, 6.762313e-01, 6.765872e-01, 6.769428e-01, 6.772979e-01, 6.776527e-01, 6.780071e-01, 6.783612e-01, + 6.787149e-01, 6.790682e-01, 6.794212e-01, 6.797737e-01, 6.801260e-01, 6.804778e-01, 6.808293e-01, 6.811804e-01, + 6.815312e-01, 6.818815e-01, 6.822316e-01, 6.825813e-01, 6.829306e-01, 6.832796e-01, 6.836282e-01, 6.839765e-01, + 6.843244e-01, 6.846719e-01, 6.850191e-01, 6.853660e-01, 6.857125e-01, 6.860586e-01, 6.864043e-01, 6.867498e-01, + 6.870949e-01, 6.874397e-01, 6.877841e-01, 6.881282e-01, 6.884719e-01, 6.888152e-01, 6.891583e-01, 6.895010e-01, + 6.898433e-01, 6.901854e-01, 6.905270e-01, 6.908684e-01, 6.912094e-01, 6.915500e-01, 6.918904e-01, 6.922303e-01, + 6.925700e-01, 6.929094e-01, 6.932484e-01, 6.935870e-01, 6.939254e-01, 6.942633e-01, 6.946011e-01, 6.949384e-01, + 6.952754e-01, 6.956121e-01, 6.959485e-01, 6.962845e-01, 6.966202e-01, 6.969556e-01, 6.972907e-01, 6.976255e-01, + 6.979599e-01, 6.982940e-01, 6.986278e-01, 6.989613e-01, 6.992944e-01, 6.996273e-01, 6.999598e-01, 7.002920e-01, + 7.006239e-01, 7.009555e-01, 7.012867e-01, 7.016177e-01, 7.019483e-01, 7.022786e-01, 7.026086e-01, 7.029384e-01, + 7.032678e-01, 7.035969e-01, 7.039256e-01, 7.042542e-01, 7.045823e-01, 7.049102e-01, 7.052377e-01, 7.055650e-01, + 7.058919e-01, 7.062186e-01, 7.065449e-01, 7.068710e-01, 7.071967e-01, 7.075222e-01, 7.078474e-01, 7.081722e-01, + 7.084967e-01, 7.088210e-01, 7.091449e-01, 7.094686e-01, 7.097920e-01, 7.101150e-01, 7.104378e-01, 7.107603e-01, + 7.110825e-01, 7.114044e-01, 7.117260e-01, 7.120473e-01, 7.123684e-01, 7.126891e-01, 7.130095e-01, 7.133297e-01, + 7.136496e-01, 7.139692e-01, 7.142885e-01, 7.146075e-01, 7.149262e-01, 7.152447e-01, 7.155629e-01, 7.158808e-01, + 7.161984e-01, 7.165157e-01, 7.168328e-01, 7.171495e-01, 7.174660e-01, 7.177821e-01, 7.180981e-01, 7.184138e-01, + 7.187291e-01, 7.190442e-01, 7.193590e-01, 7.196736e-01, 7.199879e-01, 7.203019e-01, 7.206156e-01, 7.209290e-01, + 7.212422e-01, 7.215551e-01, 7.218677e-01, 7.221801e-01, 7.224922e-01, 7.228040e-01, 7.231156e-01, 7.234268e-01, + 7.237378e-01, 7.240486e-01, 7.243591e-01, 7.246693e-01, 7.249793e-01, 7.252890e-01, 7.255983e-01, 7.259076e-01, + 7.262164e-01, 7.265251e-01, 
7.268335e-01, 7.271415e-01, 7.274494e-01, 7.277570e-01, 7.280643e-01, 7.283714e-01, + 7.286782e-01, 7.289847e-01, 7.292911e-01, 7.295971e-01, 7.299029e-01, 7.302084e-01, 7.305137e-01, 7.308187e-01, + 7.311234e-01, 7.314279e-01, 7.317322e-01, 7.320362e-01, 7.323400e-01, 7.326434e-01, 7.329467e-01, 7.332497e-01, + 7.335525e-01, 7.338549e-01, 7.341572e-01, 7.344592e-01, 7.347609e-01, 7.350624e-01, 7.353637e-01, 7.356647e-01, + 7.359655e-01, 7.362660e-01, 7.365662e-01, 7.368662e-01, 7.371660e-01, 7.374656e-01, 7.377649e-01, 7.380639e-01, + 7.383628e-01, 7.386613e-01, 7.389597e-01, 7.392578e-01, 7.395556e-01, 7.398532e-01, 7.401506e-01, 7.404477e-01, + 7.407446e-01, 7.410412e-01, 7.413377e-01, 7.416338e-01, 7.419298e-01, 7.422255e-01, 7.425209e-01, 7.428162e-01, + 7.431112e-01, 7.434059e-01, 7.437005e-01, 7.439948e-01, 7.442889e-01, 7.445827e-01, 7.448763e-01, 7.451697e-01, + 7.454628e-01, 7.457558e-01, 7.460485e-01, 7.463409e-01, 7.466331e-01, 7.469251e-01, 7.472169e-01, 7.475084e-01, + 7.477998e-01, 7.480908e-01, 7.483817e-01, 7.486723e-01, 7.489627e-01, 7.492529e-01, 7.495428e-01, 7.498326e-01, + 7.501221e-01, 7.504114e-01, 7.507005e-01, 7.509893e-01, 7.512779e-01, 7.515663e-01, 7.518545e-01, 7.521424e-01, + 7.524302e-01, 7.527177e-01, 7.530050e-01, 7.532921e-01, 7.535789e-01, 7.538656e-01, 7.541520e-01, 7.544382e-01, + 7.547241e-01, 7.550099e-01, 7.552955e-01, 7.555808e-01, 7.558660e-01, 7.561509e-01, 7.564356e-01, 7.567201e-01, + 7.570043e-01, 7.572884e-01, 7.575722e-01, 7.578558e-01, 7.581393e-01, 7.584225e-01, 7.587055e-01, 7.589883e-01, + 7.592708e-01, 7.595532e-01, 7.598354e-01, 7.601173e-01, 7.603990e-01, 7.606806e-01, 7.609619e-01, 7.612430e-01, + 7.615239e-01, 7.618046e-01, 7.620851e-01, 7.623653e-01, 7.626454e-01, 7.629253e-01, 7.632049e-01, 7.634844e-01, + 7.637637e-01, 7.640427e-01, 7.643216e-01, 7.646002e-01, 7.648786e-01, 7.651569e-01, 7.654349e-01, 7.657127e-01, + 7.659904e-01, 7.662678e-01, 7.665451e-01, 7.668221e-01, 7.670989e-01, 7.673756e-01, 7.676520e-01, 7.679282e-01, + 7.682042e-01, 7.684801e-01, 7.687557e-01, 7.690312e-01, 7.693064e-01, 7.695814e-01, 7.698563e-01, 7.701310e-01, + 7.704054e-01, 7.706797e-01, 7.709538e-01, 7.712276e-01, 7.715013e-01, 7.717748e-01, 7.720481e-01, 7.723212e-01, + 7.725941e-01, 7.728668e-01, 7.731394e-01, 7.734116e-01, 7.736838e-01, 7.739558e-01, 7.742275e-01, 7.744991e-01, + 7.747704e-01, 7.750416e-01, 7.753126e-01, 7.755834e-01, 7.758540e-01, 7.761245e-01, 7.763947e-01, 7.766647e-01, + 7.769346e-01, 7.772043e-01, 7.774737e-01, 7.777431e-01, 7.780122e-01, 7.782811e-01, 7.785498e-01, 7.788184e-01, + 7.790868e-01, 7.793550e-01, 7.796230e-01, 7.798908e-01, 7.801584e-01, 7.804259e-01, 7.806932e-01, 7.809603e-01, + 7.812271e-01, 7.814939e-01, 7.817604e-01, 7.820268e-01, 7.822930e-01, 7.825589e-01, 7.828248e-01, 7.830904e-01, + 7.833558e-01, 7.836211e-01, 7.838862e-01, 7.841511e-01, 7.844158e-01, 7.846804e-01, 7.849448e-01, 7.852090e-01, + 7.854730e-01, 7.857369e-01, 7.860005e-01, 7.862641e-01, 7.865273e-01, 7.867905e-01, 7.870535e-01, 7.873163e-01, + 7.875788e-01, 7.878413e-01, 7.881036e-01, 7.883657e-01, 7.886276e-01, 7.888893e-01, 7.891509e-01, 7.894123e-01, + 7.896735e-01, 7.899345e-01, 7.901954e-01, 7.904561e-01, 7.907166e-01, 7.909770e-01, 7.912372e-01, 7.914972e-01, + 7.917571e-01, 7.920167e-01, 7.922763e-01, 7.925356e-01, 7.927948e-01, 7.930537e-01, 7.933126e-01, 7.935712e-01, + 7.938297e-01, 7.940881e-01, 7.943462e-01, 7.946042e-01, 7.948620e-01, 7.951197e-01, 7.953772e-01, 7.956345e-01, + 7.958916e-01, 7.961487e-01, 7.964054e-01, 
7.966621e-01, 7.969186e-01, 7.971749e-01, 7.974311e-01, 7.976871e-01, + 7.979429e-01, 7.981986e-01, 7.984541e-01, 7.987095e-01, 7.989646e-01, 7.992196e-01, 7.994745e-01, 7.997292e-01, + 7.999837e-01, 8.002381e-01, 8.004923e-01, 8.007463e-01, 8.010002e-01, 8.012539e-01, 8.015075e-01, 8.017609e-01, + 8.020141e-01, 8.022672e-01, 8.025202e-01, 8.027729e-01, 8.030255e-01, 8.032780e-01, 8.035302e-01, 8.037823e-01, + 8.040344e-01, 8.042861e-01, 8.045378e-01, 8.047893e-01, 8.050406e-01, 8.052918e-01, 8.055428e-01, 8.057937e-01, + 8.060444e-01, 8.062950e-01, 8.065454e-01, 8.067956e-01, 8.070457e-01, 8.072957e-01, 8.075454e-01, 8.077950e-01, + 8.080446e-01, 8.082938e-01, 8.085430e-01, 8.087921e-01, 8.090409e-01, 8.092896e-01, 8.095381e-01, 8.097866e-01, + 8.100348e-01, 8.102829e-01, 8.105308e-01, 8.107786e-01, 8.110263e-01, 8.112738e-01, 8.115211e-01, 8.117683e-01, + 8.120154e-01, 8.122622e-01, 8.125089e-01, 8.127556e-01, 8.130020e-01, 8.132483e-01, 8.134944e-01, 8.137404e-01, + 8.139862e-01, 8.142319e-01, 8.144775e-01, 8.147229e-01, 8.149682e-01, 8.152133e-01, 8.154582e-01, 8.157030e-01, + 8.159477e-01, 8.161922e-01, 8.164365e-01, 8.166808e-01, 8.169249e-01, 8.171688e-01, 8.174126e-01, 8.176562e-01, + 8.178997e-01, 8.181431e-01, 8.183863e-01, 8.186293e-01, 8.188722e-01, 8.191150e-01, 8.193576e-01, 8.196001e-01, + 8.198425e-01, 8.200847e-01, 8.203267e-01, 8.205686e-01, 8.208104e-01, 8.210521e-01, 8.212935e-01, 8.215349e-01, + 8.217760e-01, 8.220171e-01, 8.222581e-01, 8.224988e-01, 8.227395e-01, 8.229799e-01, 8.232203e-01, 8.234605e-01, + 8.237006e-01, 8.239405e-01, 8.241804e-01, 8.244200e-01, 8.246595e-01, 8.248989e-01, 8.251381e-01, 8.253772e-01, + 8.256162e-01, 8.258550e-01, 8.260937e-01, 8.263323e-01, 8.265706e-01, 8.268089e-01, 8.270471e-01, 8.272851e-01, + 8.275229e-01, 8.277607e-01, 8.279983e-01, 8.282357e-01, 8.284730e-01, 8.287102e-01, 8.289472e-01, 8.291842e-01, + 8.294209e-01, 8.296576e-01, 8.298941e-01, 8.301305e-01, 8.303667e-01, 8.306028e-01, 8.308387e-01, 8.310746e-01, + 8.313103e-01, 8.315458e-01, 8.317813e-01, 8.320166e-01, 8.322517e-01, 8.324867e-01, 8.327217e-01, 8.329564e-01, + 8.331911e-01, 8.334256e-01, 8.336599e-01, 8.338942e-01, 8.341283e-01, 8.343623e-01, 8.345962e-01, 8.348299e-01, + 8.350635e-01, 8.352969e-01, 8.355302e-01, 8.357634e-01, 8.359964e-01, 8.362294e-01, 8.364622e-01, 8.366948e-01, + 8.369274e-01, 8.371598e-01, 8.373921e-01, 8.376243e-01, 8.378563e-01, 8.380882e-01, 8.383200e-01, 8.385516e-01, + 8.387831e-01, 8.390145e-01, 8.392458e-01, 8.394769e-01, 8.397079e-01, 8.399388e-01, 8.401695e-01, 8.404002e-01, + 8.406307e-01, 8.408611e-01, 8.410913e-01, 8.413214e-01, 8.415514e-01, 8.417813e-01, 8.420110e-01, 8.422406e-01, + 8.424702e-01, 8.426995e-01, 8.429288e-01, 8.431579e-01, 8.433869e-01, 8.436158e-01, 8.438445e-01, 8.440731e-01, + 8.443016e-01, 8.445300e-01, 8.447582e-01, 8.449863e-01, 8.452144e-01, 8.454422e-01, 8.456700e-01, 8.458977e-01, + 8.461251e-01, 8.463526e-01, 8.465798e-01, 8.468069e-01, 8.470340e-01, 8.472609e-01, 8.474877e-01, 8.477143e-01, + 8.479409e-01, 8.481673e-01, 8.483936e-01, 8.486198e-01, 8.488458e-01, 8.490717e-01, 8.492976e-01, 8.495233e-01, + 8.497488e-01, 8.499743e-01, 8.501996e-01, 8.504249e-01, 8.506500e-01, 8.508750e-01, 8.510998e-01, 8.513246e-01, + 8.515491e-01, 8.517737e-01, 8.519981e-01, 8.522223e-01, 8.524465e-01, 8.526706e-01, 8.528944e-01, 8.531182e-01, + 8.533419e-01, 8.535655e-01, 8.537889e-01, 8.540123e-01, 8.542355e-01, 8.544586e-01, 8.546816e-01, 8.549044e-01, + 8.551272e-01, 8.553498e-01, 8.555723e-01, 8.557947e-01, 
8.560170e-01, 8.562392e-01, 8.564612e-01, 8.566832e-01, + 8.569050e-01, 8.571267e-01, 8.573483e-01, 8.575698e-01, 8.577912e-01, 8.580124e-01, 8.582336e-01, 8.584546e-01, + 8.586755e-01, 8.588963e-01, 8.591169e-01, 8.593375e-01, 8.595580e-01, 8.597783e-01, 8.599985e-01, 8.602186e-01, + 8.604387e-01, 8.606585e-01, 8.608783e-01, 8.610980e-01, 8.613176e-01, 8.615370e-01, 8.617563e-01, 8.619756e-01, + 8.621947e-01, 8.624136e-01, 8.626326e-01, 8.628513e-01, 8.630700e-01, 8.632885e-01, 8.635070e-01, 8.637253e-01, + 8.639436e-01, 8.641617e-01, 8.643796e-01, 8.645976e-01, 8.648154e-01, 8.650330e-01, 8.652506e-01, 8.654680e-01, + 8.656853e-01, 8.659026e-01, 8.661197e-01, 8.663368e-01, 8.665537e-01, 8.667705e-01, 8.669872e-01, 8.672037e-01, + 8.674202e-01, 8.676366e-01, 8.678529e-01, 8.680690e-01, 8.682851e-01, 8.685010e-01, 8.687168e-01, 8.689325e-01, + 8.691481e-01, 8.693637e-01, 8.695791e-01, 8.697944e-01, 8.700095e-01, 8.702246e-01, 8.704396e-01, 8.706545e-01, + 8.708693e-01, 8.710839e-01, 8.712984e-01, 8.715129e-01, 8.717272e-01, 8.719414e-01, 8.721556e-01, 8.723696e-01, + 8.725836e-01, 8.727974e-01, 8.730111e-01, 8.732247e-01, 8.734382e-01, 8.736516e-01, 8.738649e-01, 8.740780e-01, + 8.742912e-01, 8.745041e-01, 8.747170e-01, 8.749298e-01, 8.751425e-01, 8.753550e-01, 8.755675e-01, 8.757799e-01, + 8.759921e-01, 8.762043e-01, 8.764163e-01, 8.766283e-01, 8.768401e-01, 8.770519e-01, 8.772635e-01, 8.774751e-01, + 8.776865e-01, 8.778979e-01, 8.781091e-01, 8.783202e-01, 8.785312e-01, 8.787422e-01, 8.789530e-01, 8.791637e-01, + 8.793744e-01, 8.795849e-01, 8.797953e-01, 8.800057e-01, 8.802159e-01, 8.804260e-01, 8.806360e-01, 8.808460e-01, + 8.810558e-01, 8.812655e-01, 8.814751e-01, 8.816847e-01, 8.818941e-01, 8.821034e-01, 8.823127e-01, 8.825217e-01, + 8.827308e-01, 8.829397e-01, 8.831486e-01, 8.833573e-01, 8.835659e-01, 8.837745e-01, 8.839829e-01, 8.841912e-01, + 8.843995e-01, 8.846076e-01, 8.848156e-01, 8.850236e-01, 8.852314e-01, 8.854392e-01, 8.856469e-01, 8.858544e-01, + 8.860618e-01, 8.862692e-01, 8.864765e-01, 8.866837e-01, 8.868908e-01, 8.870977e-01, 8.873046e-01, 8.875114e-01, + 8.877181e-01, 8.879247e-01, 8.881311e-01, 8.883376e-01, 8.885438e-01, 8.887501e-01, 8.889562e-01, 8.891622e-01, + 8.893681e-01, 8.895739e-01, 8.897797e-01, 8.899853e-01, 8.901908e-01, 8.903963e-01, 8.906016e-01, 8.908069e-01, + 8.910121e-01, 8.912171e-01, 8.914221e-01, 8.916270e-01, 8.918318e-01, 8.920364e-01, 8.922410e-01, 8.924455e-01, + 8.926499e-01, 8.928543e-01, 8.930585e-01, 8.932626e-01, 8.934667e-01, 8.936706e-01, 8.938744e-01, 8.940782e-01, + 8.942819e-01, 8.944854e-01, 8.946889e-01, 8.948923e-01, 8.950956e-01, 8.952988e-01, 8.955019e-01, 8.957049e-01, + 8.959078e-01, 8.961107e-01, 8.963134e-01, 8.965160e-01, 8.967186e-01, 8.969210e-01, 8.971235e-01, 8.973257e-01, + 8.975279e-01, 8.977300e-01, 8.979320e-01, 8.981339e-01, 8.983358e-01, 8.985375e-01, 8.987392e-01, 8.989407e-01, + 8.991421e-01, 8.993436e-01, 8.995448e-01, 8.997460e-01, 8.999471e-01, 9.001482e-01, 9.003491e-01, 9.005499e-01, + 9.007506e-01, 9.009513e-01, 9.011519e-01, 9.013523e-01, 9.015527e-01, 9.017531e-01, 9.019532e-01, 9.021534e-01, + 9.023534e-01, 9.025534e-01, 9.027532e-01, 9.029530e-01, 9.031526e-01, 9.033523e-01, 9.035518e-01, 9.037512e-01, + 9.039505e-01, 9.041498e-01, 9.043489e-01, 9.045479e-01, 9.047469e-01, 9.049459e-01, 9.051446e-01, 9.053434e-01, + 9.055420e-01, 9.057405e-01, 9.059390e-01, 9.061373e-01, 9.063356e-01, 9.065338e-01, 9.067319e-01, 9.069299e-01, + 9.071279e-01, 9.073257e-01, 9.075235e-01, 9.077212e-01, 9.079187e-01, 
9.081162e-01, 9.083136e-01, 9.085110e-01, + 9.087082e-01, 9.089054e-01, 9.091024e-01, 9.092994e-01, 9.094964e-01, 9.096932e-01, 9.098899e-01, 9.100866e-01, + 9.102831e-01, 9.104796e-01, 9.106760e-01, 9.108723e-01, 9.110685e-01, 9.112647e-01, 9.114607e-01, 9.116567e-01, + 9.118526e-01, 9.120483e-01, 9.122441e-01, 9.124397e-01, 9.126353e-01, 9.128307e-01, 9.130261e-01, 9.132214e-01, + 9.134166e-01, 9.136118e-01, 9.138068e-01, 9.140018e-01, 9.141967e-01, 9.143915e-01, 9.145862e-01, 9.147808e-01, + 9.149753e-01, 9.151698e-01, 9.153642e-01, 9.155585e-01, 9.157528e-01, 9.159469e-01, 9.161409e-01, 9.163349e-01, + 9.165288e-01, 9.167226e-01, 9.169164e-01, 9.171100e-01, 9.173036e-01, 9.174970e-01, 9.176905e-01, 9.178838e-01, + 9.180770e-01, 9.182702e-01, 9.184632e-01, 9.186562e-01, 9.188492e-01, 9.190420e-01, 9.192348e-01, 9.194274e-01, + 9.196200e-01, 9.198125e-01, 9.200049e-01, 9.201973e-01, 9.203895e-01, 9.205818e-01, 9.207739e-01, 9.209659e-01, + 9.211578e-01, 9.213497e-01, 9.215415e-01, 9.217332e-01, 9.219248e-01, 9.221163e-01, 9.223078e-01, 9.224992e-01, + 9.226905e-01, 9.228818e-01, 9.230729e-01, 9.232640e-01, 9.234550e-01, 9.236459e-01, 9.238367e-01, 9.240275e-01, + 9.242182e-01, 9.244088e-01, 9.245993e-01, 9.247897e-01, 9.249801e-01, 9.251704e-01, 9.253606e-01, 9.255507e-01, + 9.257408e-01, 9.259307e-01, 9.261206e-01, 9.263105e-01, 9.265002e-01, 9.266899e-01, 9.268795e-01, 9.270689e-01, + 9.272584e-01, 9.274477e-01, 9.276370e-01, 9.278262e-01, 9.280154e-01, 9.282044e-01, 9.283934e-01, 9.285822e-01, + 9.287710e-01, 9.289598e-01, 9.291484e-01, 9.293370e-01, 9.295255e-01, 9.297140e-01, 9.299023e-01, 9.300906e-01, + 9.302788e-01, 9.304669e-01, 9.306549e-01, 9.308429e-01, 9.310308e-01, 9.312186e-01, 9.314064e-01, 9.315941e-01, + 9.317816e-01, 9.319692e-01, 9.321566e-01, 9.323440e-01, 9.325313e-01, 9.327185e-01, 9.329057e-01, 9.330927e-01, + 9.332797e-01, 9.334666e-01, 9.336535e-01, 9.338402e-01, 9.340270e-01, 9.342135e-01, 9.344001e-01, 9.345866e-01, + 9.347730e-01, 9.349593e-01, 9.351455e-01, 9.353317e-01, 9.355178e-01, 9.357038e-01, 9.358898e-01, 9.360756e-01, + 9.362615e-01, 9.364472e-01, 9.366328e-01, 9.368184e-01, 9.370039e-01, 9.371893e-01, 9.373747e-01, 9.375600e-01, + 9.377452e-01, 9.379303e-01, 9.381154e-01, 9.383004e-01, 9.384854e-01, 9.386702e-01, 9.388550e-01, 9.390397e-01, + 9.392243e-01, 9.394089e-01, 9.395934e-01, 9.397778e-01, 9.399621e-01, 9.401464e-01, 9.403306e-01, 9.405147e-01, + 9.406988e-01, 9.408827e-01, 9.410667e-01, 9.412505e-01, 9.414343e-01, 9.416180e-01, 9.418016e-01, 9.419851e-01, + 9.421686e-01, 9.423520e-01, 9.425353e-01, 9.427186e-01, 9.429018e-01, 9.430850e-01, 9.432680e-01, 9.434510e-01, + 9.436339e-01, 9.438167e-01, 9.439995e-01, 9.441822e-01, 9.443648e-01, 9.445474e-01, 9.447299e-01, 9.449123e-01, + 9.450946e-01, 9.452769e-01, 9.454591e-01, 9.456412e-01, 9.458233e-01, 9.460053e-01, 9.461872e-01, 9.463691e-01, + 9.465508e-01, 9.467326e-01, 9.469142e-01, 9.470958e-01, 9.472773e-01, 9.474587e-01, 9.476401e-01, 9.478214e-01, + 9.480026e-01, 9.481838e-01, 9.483649e-01, 9.485459e-01, 9.487268e-01, 9.489077e-01, 9.490886e-01, 9.492693e-01, + 9.494500e-01, 9.496306e-01, 9.498111e-01, 9.499916e-01, 9.501719e-01, 9.503523e-01, 9.505326e-01, 9.507128e-01, + 9.508929e-01, 9.510729e-01, 9.512529e-01, 9.514329e-01, 9.516127e-01, 9.517925e-01, 9.519722e-01, 9.521519e-01, + 9.523315e-01, 9.525110e-01, 9.526904e-01, 9.528698e-01, 9.530491e-01, 9.532284e-01, 9.534075e-01, 9.535866e-01, + 9.537657e-01, 9.539447e-01, 9.541236e-01, 9.543024e-01, 9.544812e-01, 9.546599e-01, 
9.548386e-01, 9.550171e-01, + 9.551957e-01, 9.553741e-01, 9.555525e-01, 9.557307e-01, 9.559090e-01, 9.560872e-01, 9.562653e-01, 9.564433e-01, + 9.566213e-01, 9.567992e-01, 9.569771e-01, 9.571549e-01, 9.573326e-01, 9.575102e-01, 9.576878e-01, 9.578653e-01, + 9.580427e-01, 9.582201e-01, 9.583974e-01, 9.585747e-01, 9.587519e-01, 9.589290e-01, 9.591061e-01, 9.592831e-01, + 9.594600e-01, 9.596368e-01, 9.598137e-01, 9.599904e-01, 9.601671e-01, 9.603436e-01, 9.605201e-01, 9.606966e-01, + 9.608730e-01, 9.610494e-01, 9.612256e-01, 9.614019e-01, 9.615780e-01, 9.617541e-01, 9.619301e-01, 9.621060e-01, + 9.622819e-01, 9.624578e-01, 9.626336e-01, 9.628092e-01, 9.629849e-01, 9.631604e-01, 9.633359e-01, 9.635113e-01, + 9.636867e-01, 9.638621e-01, 9.640373e-01, 9.642125e-01, 9.643876e-01, 9.645627e-01, 9.647377e-01, 9.649126e-01, + 9.650874e-01, 9.652622e-01, 9.654370e-01, 9.656116e-01, 9.657863e-01, 9.659608e-01, 9.661353e-01, 9.663097e-01, + 9.664841e-01, 9.666584e-01, 9.668326e-01, 9.670068e-01, 9.671809e-01, 9.673550e-01, 9.675289e-01, 9.677029e-01, + 9.678767e-01, 9.680505e-01, 9.682242e-01, 9.683979e-01, 9.685715e-01, 9.687451e-01, 9.689186e-01, 9.690920e-01, + 9.692653e-01, 9.694387e-01, 9.696119e-01, 9.697851e-01, 9.699582e-01, 9.701312e-01, 9.703043e-01, 9.704772e-01, + 9.706500e-01, 9.708228e-01, 9.709955e-01, 9.711683e-01, 9.713409e-01, 9.715135e-01, 9.716859e-01, 9.718584e-01, + 9.720308e-01, 9.722031e-01, 9.723753e-01, 9.725475e-01, 9.727197e-01, 9.728917e-01, 9.730637e-01, 9.732357e-01, + 9.734076e-01, 9.735794e-01, 9.737512e-01, 9.739228e-01, 9.740945e-01, 9.742661e-01, 9.744377e-01, 9.746091e-01, + 9.747805e-01, 9.749519e-01, 9.751231e-01, 9.752944e-01, 9.754655e-01, 9.756366e-01, 9.758077e-01, 9.759787e-01, + 9.761496e-01, 9.763204e-01, 9.764913e-01, 9.766620e-01, 9.768327e-01, 9.770033e-01, 9.771739e-01, 9.773444e-01, + 9.775148e-01, 9.776852e-01, 9.778556e-01, 9.780258e-01, 9.781960e-01, 9.783661e-01, 9.785362e-01, 9.787063e-01, + 9.788762e-01, 9.790462e-01, 9.792160e-01, 9.793859e-01, 9.795555e-01, 9.797252e-01, 9.798949e-01, 9.800645e-01, + 9.802339e-01, 9.804034e-01, 9.805728e-01, 9.807421e-01, 9.809114e-01, 9.810806e-01, 9.812497e-01, 9.814188e-01, + 9.815878e-01, 9.817568e-01, 9.819257e-01, 9.820946e-01, 9.822634e-01, 9.824321e-01, 9.826008e-01, 9.827695e-01, + 9.829381e-01, 9.831066e-01, 9.832750e-01, 9.834434e-01, 9.836118e-01, 9.837800e-01, 9.839482e-01, 9.841164e-01, + 9.842845e-01, 9.844526e-01, 9.846206e-01, 9.847885e-01, 9.849564e-01, 9.851242e-01, 9.852920e-01, 9.854597e-01, + 9.856274e-01, 9.857950e-01, 9.859625e-01, 9.861299e-01, 9.862974e-01, 9.864647e-01, 9.866320e-01, 9.867993e-01, + 9.869665e-01, 9.871337e-01, 9.873008e-01, 9.874678e-01, 9.876347e-01, 9.878017e-01, 9.879685e-01, 9.881353e-01, + 9.883021e-01, 9.884688e-01, 9.886354e-01, 9.888020e-01, 9.889685e-01, 9.891350e-01, 9.893014e-01, 9.894677e-01, + 9.896340e-01, 9.898003e-01, 9.899665e-01, 9.901326e-01, 9.902986e-01, 9.904646e-01, 9.906306e-01, 9.907965e-01, + 9.909624e-01, 9.911281e-01, 9.912939e-01, 9.914596e-01, 9.916252e-01, 9.917908e-01, 9.919563e-01, 9.921218e-01, + 9.922872e-01, 9.924526e-01, 9.926178e-01, 9.927831e-01, 9.929483e-01, 9.931134e-01, 9.932785e-01, 9.934435e-01, + 9.936085e-01, 9.937734e-01, 9.939383e-01, 9.941031e-01, 9.942678e-01, 9.944325e-01, 9.945971e-01, 9.947617e-01, + 9.949263e-01, 9.950907e-01, 9.952552e-01, 9.954196e-01, 9.955838e-01, 9.957481e-01, 9.959123e-01, 9.960765e-01, + 9.962406e-01, 9.964046e-01, 9.965686e-01, 9.967325e-01, 9.968964e-01, 9.970602e-01, 9.972240e-01, 
9.973878e-01, + 9.975514e-01, 9.977150e-01, 9.978786e-01, 9.980421e-01, 9.982055e-01, 9.983689e-01, 9.985323e-01, 9.986956e-01, + 9.988588e-01, 9.990220e-01, 9.991851e-01, 9.993482e-01, 9.995112e-01, 9.996742e-01, 9.998372e-01, 1.000000e+00, + }; + int i = (int)(d * 2047); + assert(i >= 0 && i < 2048); + return cube_root[i]; } static inline void _ccv_rgb_to_luv(const float r, const float g, const float b, float* pl, float* pu, float* pv) @@ -453,7 +691,8 @@ static void _ccv_icf_read_classifier_cascade_state(const char* directory, ccv_ic state->example_state[i].rate = rate; } fclose(r); - } + } else + state->example_state = 0; snprintf(filename, 1024, "%s/precomputed", directory); r = fopen(filename, "rb"); if (r) @@ -462,7 +701,8 @@ static void _ccv_icf_read_classifier_cascade_state(const char* directory, ccv_ic state->precomputed = (uint8_t*)ccmalloc(sizeof(uint8_t) * state->params.feature_size * step); fread(state->precomputed, 1, step * state->params.feature_size, r); fclose(r); - } + } else + state->precomputed = 0; snprintf(filename, 1024, "%s/cascade", directory); state->classifier = ccv_icf_read_classifier_cascade(filename); if (!state->classifier) @@ -521,7 +761,7 @@ static uint8_t* _ccv_icf_precompute_features(ccv_icf_feature_t* features, int fe size_t step = (3 * (positives->rnum + negatives->rnum) + 3) & -4; uint8_t* precomputed = (uint8_t*)ccmalloc(sizeof(uint8_t) * feature_size * step); ccv_icf_value_index_t* sortkv = (ccv_icf_value_index_t*)ccmalloc(sizeof(ccv_icf_value_index_t) * (positives->rnum + negatives->rnum)); - printf(" - precompute features using %luM memory temporarily\n", (sizeof(float) * (positives->rnum + negatives->rnum) * feature_size + sizeof(uint8_t) * feature_size * step) / (1024 * 1024)); + printf(" - precompute features using %uM memory temporarily\n", (uint32_t)((sizeof(float) * (positives->rnum + negatives->rnum) * feature_size + sizeof(uint8_t) * feature_size * step) / (1024 * 1024))); float* featval = (float*)ccmalloc(sizeof(float) * feature_size * (positives->rnum + negatives->rnum)); ccv_disable_cache(); // clean up cache so we have enough space to run it #ifdef USE_DISPATCH @@ -584,7 +824,7 @@ static uint8_t* _ccv_icf_precompute_features(ccv_icf_feature_t* features, int fe } ccfree(featval); ccfree(sortkv); - printf("\n - features are precomputed on examples and will occupy %luM memory\n", (uint64_t)(feature_size * step) / (1024 * 1024)); + printf("\n - features are precomputed on examples and will occupy %uM memory\n", (uint32_t)((feature_size * step) / (1024 * 1024))); return precomputed; } @@ -641,12 +881,7 @@ static ccv_icf_decision_tree_cache_t _ccv_icf_find_first_feature(ccv_icf_feature aweigh0 += example_state[i].weight, example_state[i].correct = 1; // assuming negative examples we get right size_t step = (3 * (positives->rnum + negatives->rnum) + 3) & -4; ccv_icf_first_feature_find_t* feature_find = (ccv_icf_first_feature_find_t*)ccmalloc(sizeof(ccv_icf_first_feature_find_t) * feature_size); -#ifdef USE_DISPATCH - dispatch_apply(feature_size, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^(size_t i) { -#else - for (i = 0; i < feature_size; i++) - { -#endif + parallel_for(i, feature_size) { ccv_icf_first_feature_find_t min_find = { .error_rate = 1.0, .error_index = 0, @@ -683,11 +918,7 @@ static ccv_icf_decision_tree_cache_t _ccv_icf_find_first_feature(ccv_icf_feature } } feature_find[i] = min_find; -#ifdef USE_DISPATCH - }); -#else - } -#endif + } parallel_endfor ccv_icf_first_feature_find_t best = { .error_rate = 1.0, 
.error_index = -1, @@ -753,17 +984,11 @@ typedef struct { static double _ccv_icf_find_second_feature(ccv_icf_decision_tree_cache_t intermediate_cache, int leaf, ccv_icf_feature_t* features, int feature_size, ccv_array_t* positives, ccv_array_t* negatives, uint8_t* precomputed, ccv_icf_example_state_t* example_state, ccv_icf_feature_t* feature) { - int i; size_t step = (3 * (positives->rnum + negatives->rnum) + 3) & -4; uint8_t* lut = intermediate_cache.lut; double* aweigh = intermediate_cache.weigh + leaf * 2; ccv_icf_second_feature_find_t* feature_find = (ccv_icf_second_feature_find_t*)ccmalloc(sizeof(ccv_icf_second_feature_find_t) * feature_size); -#ifdef USE_DISPATCH - dispatch_apply(feature_size, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^(size_t i) { -#else - for (i = 0; i < feature_size; i++) - { -#endif + parallel_for(i, feature_size) { ccv_icf_second_feature_find_t min_find = { .error_rate = 1.0, .error_index = 0, @@ -808,16 +1033,13 @@ static double _ccv_icf_find_second_feature(ccv_icf_decision_tree_cache_t interme } } feature_find[i] = min_find; -#ifdef USE_DISPATCH - }); -#else - } -#endif + } parallel_endfor ccv_icf_second_feature_find_t best = { .error_rate = 1.0, .error_index = -1, .weigh = {0, 0}, }; + int i; int feature_index = 0; for (i = 0; i < feature_size; i++) if (feature_find[i].error_rate < best.error_rate) @@ -1071,7 +1293,7 @@ static void _ccv_icf_bootstrap_negatives(ccv_icf_classifier_cascade_t* cascade, continue; #endif } - FLUSH(" - bootstrap negatives %d%% (%d / %d) [%lu / %d] %s", (i + 1) * 100 / negnum, i + 1, negnum, j + 1, bgfiles->rnum, spread ? "" : "without statistic balancing"); + FLUSH(" - bootstrap negatives %d%% (%d / %d) [%u / %d] %s", (i + 1) * 100 / negnum, i + 1, negnum, (uint32_t)(j + 1), bgfiles->rnum, spread ? "" : "without statistic balancing"); #ifdef USE_DISPATCH gsl_rng* crng = gsl_rng_alloc(gsl_rng_default); gsl_rng_set(crng, gsl_rng_get(rng)); @@ -1413,11 +1635,9 @@ ccv_icf_classifier_cascade_t* ccv_icf_classifier_cascade_new(ccv_array_t* posfil for (z.bootstrap = 0; z.bootstrap <= params.bootstrap; z.bootstrap++) { z.example_state = (ccv_icf_example_state_t*)ccmalloc(sizeof(ccv_icf_example_state_t) * (z.negatives->rnum + z.positives->rnum)); + memset(z.example_state, 0, sizeof(ccv_icf_example_state_t) * (z.negatives->rnum + z.positives->rnum)); for (z.i = 0; z.i < z.positives->rnum + z.negatives->rnum; z.i++) - { z.example_state[z.i].weight = (z.i < z.positives->rnum) ? 
0.5 / z.positives->rnum : 0.5 / z.negatives->rnum; - z.example_state[z.i].rate = 0; - } z.x.example_state = 0; ccv_function_state_resume(_ccv_icf_write_classifier_cascade_state, z, dir); z.precomputed = _ccv_icf_precompute_features(z.features, params.feature_size, z.positives, z.negatives); @@ -1525,8 +1745,10 @@ ccv_icf_classifier_cascade_t* ccv_icf_classifier_cascade_new(ccv_array_t* posfil ccv_function_state_resume(_ccv_icf_write_classifier_cascade_state, z, dir); } } - ccfree(z.precomputed); - ccfree(z.example_state); + if (z.precomputed) + ccfree(z.precomputed); + if (z.example_state) + ccfree(z.example_state); ccfree(z.features); ccv_array_free(z.positives); ccv_array_free(z.negatives); @@ -1729,7 +1951,7 @@ static int _ccv_is_equal_same_class(const void* _r1, const void* _r2, void* data const ccv_comp_t* r2 = (const ccv_comp_t*)_r2; int distance = (int)(ccv_min(r1->rect.width, r1->rect.height) * 0.25 + 0.5); - return r2->id == r1->id && + return r2->classification.id == r1->classification.id && r2->rect.x <= r1->rect.x + distance && r2->rect.x >= r1->rect.x - distance && r2->rect.y <= r1->rect.y + distance && @@ -1809,9 +2031,9 @@ static void _ccv_icf_detect_objects_with_classifier_cascade(ccv_dense_matrix_t* { ccv_comp_t comp; comp.rect = ccv_rect((int)((x + 0.5) * scale * (1 << i) - 0.5), (int)((y + 0.5) * scale * (1 << i) - 0.5), (cascade->size.width - cascade->margin.left - cascade->margin.right) * scale * (1 << i), (cascade->size.height - cascade->margin.top - cascade->margin.bottom) * scale * (1 << i)); - comp.id = j + 1; comp.neighbors = 1; - comp.confidence = sum; + comp.classification.id = j + 1; + comp.classification.confidence = sum; ccv_array_push(seq[j], &comp); } } @@ -1916,9 +2138,9 @@ static void _ccv_icf_detect_objects_with_multiscale_classifier_cascade(ccv_dense { ccv_comp_t comp; comp.rect = ccv_rect((int)((x + 0.5) * scale * (1 << i)), (int)((y + 0.5) * scale * (1 << i)), (cascade->size.width - cascade->margin.left - cascade->margin.right) << i, (cascade->size.height - cascade->margin.top - cascade->margin.bottom) << i); - comp.id = j + 1; comp.neighbors = 1; - comp.confidence = sum; + comp.classification.id = j + 1; + comp.classification.confidence = sum; ccv_array_push(seq[j], &comp); } } @@ -1972,7 +2194,7 @@ ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, void* cascade, int co ccv_array_clear(seq2); // group retrieved rectangles in order to filter out noise int ncomp = ccv_array_group(seq[k], &idx_seq, _ccv_is_equal_same_class, 0); - ccv_comp_t* comps = (ccv_comp_t*)cccalloc(sizeof(ccv_comp_t), ncomp + 1); + ccv_comp_t* comps = (ccv_comp_t*)cccalloc(ncomp + 1, sizeof(ccv_comp_t)); // count number of neighbors for (i = 0; i < seq[k]->rnum; i++) @@ -1980,11 +2202,11 @@ ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, void* cascade, int co ccv_comp_t r1 = *(ccv_comp_t*)ccv_array_get(seq[k], i); int idx = *(int*)ccv_array_get(idx_seq, i); - comps[idx].id = r1.id; - if (r1.confidence > comps[idx].confidence || comps[idx].neighbors == 0) + comps[idx].classification.id = r1.classification.id; + if (r1.classification.confidence > comps[idx].classification.confidence || comps[idx].neighbors == 0) { comps[idx].rect = r1.rect; - comps[idx].confidence = r1.confidence; + comps[idx].classification.confidence = r1.classification.confidence; } ++comps[idx].neighbors; @@ -2007,15 +2229,15 @@ ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, void* cascade, int co { ccv_comp_t r1 = *(ccv_comp_t*)ccv_array_get(seq2, j); if (i != j && - abs(r1.id) == 
r2->id && + abs(r1.classification.id) == r2->classification.id && r1.rect.x >= r2->rect.x - distance && r1.rect.y >= r2->rect.y - distance && r1.rect.x + r1.rect.width <= r2->rect.x + r2->rect.width + distance && r1.rect.y + r1.rect.height <= r2->rect.y + r2->rect.height + distance && // if r1 (the smaller one) is better, mute r2 - (r2->confidence <= r1.confidence && r2->neighbors < r1.neighbors)) + (r2->classification.confidence <= r1.classification.confidence && r2->neighbors < r1.neighbors)) { - r2->id = -r2->id; + r2->classification.id = -r2->classification.id; break; } } @@ -2025,7 +2247,7 @@ ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, void* cascade, int co for (i = 0; i < seq2->rnum; i++) { ccv_comp_t r1 = *(ccv_comp_t*)ccv_array_get(seq2, i); - if (r1.id > 0) + if (r1.classification.id > 0) { int flag = 1; @@ -2035,13 +2257,13 @@ ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, void* cascade, int co int distance = (int)(ccv_min(r2.rect.width, r2.rect.height) * 0.25 + 0.5); if (i != j && - abs(r1.id) == abs(r2.id) && + abs(r1.classification.id) == abs(r2.classification.id) && r1.rect.x >= r2.rect.x - distance && r1.rect.y >= r2.rect.y - distance && r1.rect.x + r1.rect.width <= r2.rect.x + r2.rect.width + distance && r1.rect.y + r1.rect.height <= r2.rect.y + r2.rect.height + distance && // if r2 is better, we mute r1 - (r2.confidence > r1.confidence || r2.neighbors >= r1.neighbors)) + (r2.classification.confidence > r1.classification.confidence || r2.neighbors >= r1.neighbors)) { flag = 0; break; diff --git a/lib/ccv_internal.h b/lib/ccv_internal.h index 1de481e82..b1cdfcd9d 100644 --- a/lib/ccv_internal.h +++ b/lib/ccv_internal.h @@ -14,6 +14,14 @@ static int _CCV_PRINT_LOOP __attribute__ ((unused)) = 0; #define ccv_descale(x, n) (((x) + (1 << ((n) - 1))) >> (n)) #define conditional_assert(x, expr) if ((x)) { assert(expr); } +#ifdef USE_DISPATCH +#define parallel_for(x, n) dispatch_apply(n, dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^(size_t x) { +#define parallel_endfor }); +#else +#define parallel_for(x, n) { int x; for (x = 0; x < n; x++) { +#define parallel_endfor } } +#endif + /* macro printf utilities */ #define FLUSH(a, ...) \ diff --git a/lib/ccv_numeric.c b/lib/ccv_numeric.c index 7c1c47b35..48140bebf 100644 --- a/lib/ccv_numeric.c +++ b/lib/ccv_numeric.c @@ -2,6 +2,7 @@ #include "ccv_internal.h" #include #ifdef HAVE_FFTW3 +#include #include #else #include "3rdparty/kissfft/kiss_fftndr.h" @@ -16,8 +17,105 @@ void ccv_solve(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type) { } -void ccv_eigen(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** d, int type) +void ccv_eigen(ccv_dense_matrix_t* a, ccv_dense_matrix_t** vector, ccv_dense_matrix_t** lambda, int type, double epsilon) { + ccv_declare_derived_signature(vsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(vector)"), a->sig, CCV_EOF_SIGN); + ccv_declare_derived_signature(lsig, a->sig != 0, ccv_sign_with_literal("ccv_eigen(lambda)"), a->sig, CCV_EOF_SIGN); + assert(CCV_GET_CHANNEL(a->type) == 1); + type = (type == 0) ? 
CCV_GET_DATA_TYPE(a->type) | CCV_C1 : CCV_GET_DATA_TYPE(type) | CCV_C1; + // as of now, this function only support real symmetric matrix + ccv_dense_matrix_t* dvector = *vector = ccv_dense_matrix_renew(*vector, a->rows, a->cols, CCV_32F | CCV_64F | CCV_C1, type, vsig); + ccv_dense_matrix_t* dlambda = *lambda = ccv_dense_matrix_renew(*lambda, 1, a->cols, CCV_32F | CCV_64F | CCV_C1, type, lsig); + assert(CCV_GET_DATA_TYPE(dvector->type) == CCV_GET_DATA_TYPE(dlambda->type)); + ccv_object_return_if_cached(, dvector, dlambda); + double* ja = (double*)ccmalloc(sizeof(double) * a->rows * a->cols); + int i, j; + unsigned char* aptr = a->data.u8; + assert(a->rows > 0 && a->cols > 0); +#define for_block(_, _for_get) \ + for (i = 0; i < a->rows; i++) \ + { \ + for (j = 0; j < a->cols; j++) \ + ja[i * a->cols + j] = _for_get(aptr, j, 0); \ + aptr += a->step; \ + } + ccv_matrix_getter(a->type, for_block); +#undef for_block + ccv_zero(dvector); + ccv_zero(dlambda); + unsigned char* dvptr = dvector->data.u8; +#define for_block(_, _for_set) \ + for (i = 0; i < a->cols; i++) \ + _for_set(dvptr, i * a->cols + i, 1, 0); + ccv_matrix_setter(dvector->type, for_block); +#undef for_block + double accuracy = 0; + for (i = 0; i < a->rows * a->cols; i++) + accuracy += ja[i]; + accuracy = sqrt(2 * accuracy); + int p, q; + unsigned char* dlptr = dlambda->data.u8; + int flag = 1; + assert(a->rows == a->cols); +#define for_block(_, _for_set, _for_get) \ + do { \ + if (!flag) \ + accuracy = accuracy * 0.5; \ + flag = 0; \ + for (p = 0; p < a->rows; p++) \ + { \ + for (q = p + 1; q < a->cols; q++) \ + if (fabs(ja[p * a->cols + q]) > accuracy) \ + { \ + double x = -ja[p * a->cols + q]; \ + double y = (ja[q * a->cols + q] - ja[p * a->cols + p]) * 0.5; \ + double omega = (x == 0 && y == 0) ? 
1 : x / sqrt(x * x + y * y); \ + if (y < 0) \ + omega = -omega; \ + double sn = 1.0 + sqrt(1.0 - omega * omega); \ + sn = omega / sqrt(2 * sn); \ + double cn = sqrt(1.0 - sn * sn); \ + double fpp = ja[p * a->cols + p]; \ + double fpq = ja[p * a->cols + q]; \ + double fqq = ja[q * a->cols + q]; \ + ja[p * a->cols + p] = fpp * cn * cn + fqq * sn * sn + fpq * omega; \ + ja[q * a->cols + q] = fpp * sn * sn + fqq * cn * cn - fpq * omega; \ + ja[p * a->cols + q] = ja[q * a->cols + p] = 0; \ + for (i = 0; i < a->cols; i++) \ + if (i != q && i != p) \ + { \ + fpp = ja[p * a->cols + i]; \ + fqq = ja[q * a->cols + i]; \ + ja[p * a->cols + i] = fpp * cn + fqq * sn; \ + ja[q * a->cols + i] = -fpp * sn + fqq * cn; \ + } \ + for (i = 0; i < a->rows; i++) \ + if (i != q && i != p) \ + { \ + fpp = ja[i * a->cols + p]; \ + fqq = ja[i * a->cols + q]; \ + ja[i * a->cols + p] = fpp * cn + fqq * sn; \ + ja[i * a->cols + q] = -fpp * sn + fqq * cn; \ + } \ + for (i = 0; i < a->cols; i++) \ + { \ + fpp = _for_get(dvptr, p * a->cols + i, 0); \ + fqq = _for_get(dvptr, q * a->cols + i, 0); \ + _for_set(dvptr, p * a->cols + i, fpp * cn + fqq * sn, 0); \ + _for_set(dvptr, q * a->cols + i, -fpp * sn + fqq * cn, 0); \ + } \ + for (i = 0; i < a->cols; i++) \ + _for_set(dlptr, i, ja[i * a->cols + i], 0); \ + flag = 1; \ + break; \ + } \ + if (flag) \ + break; \ + } \ + } while (accuracy > epsilon); + ccv_matrix_setter_getter(dvector->type, for_block); +#undef for_block + ccfree(ja); } void ccv_minimize(ccv_dense_matrix_t* x, int length, double red, ccv_minimize_f func, ccv_minimize_param_t params, void* data) @@ -473,6 +571,8 @@ static int _ccv_get_optimal_fft_size(int size) return _ccv_optimal_fft_size[b]; } +static pthread_mutex_t fftw_plan_mutex = PTHREAD_MUTEX_INITIALIZER; + static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) { int ch = CCV_GET_CHANNEL(a->type); @@ -490,6 +590,7 @@ static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_d fftw_a = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); fftw_b = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); fftw_d = fftwf_malloc(rows * cols_2c * ch * sizeof(float)); + pthread_mutex_lock(&fftw_plan_mutex); if (ch == 1) { pf = fftwf_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE); @@ -499,10 +600,12 @@ static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_d pf = fftwf_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); pinvf = fftwf_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); } + pthread_mutex_unlock(&fftw_plan_mutex); } else { fftw_a = fftw_malloc(rows * cols_2c * ch * sizeof(double)); fftw_b = fftw_malloc(rows * cols_2c * ch * sizeof(double)); fftw_d = fftw_malloc(rows * cols_2c * ch * sizeof(double)); + pthread_mutex_lock(&fftw_plan_mutex); if (ch == 1) { p = fftw_plan_dft_r2c_2d(rows, cols, 0, 0, FFTW_ESTIMATE); @@ -512,6 +615,7 @@ static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_d p = fftw_plan_many_dft_r2c(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); pinv = fftw_plan_many_dft_c2r(2, ndim, ch, 0, 0, ch, 1, 0, 0, ch, 1, FFTW_ESTIMATE); } + pthread_mutex_unlock(&fftw_plan_mutex); } memset(fftw_b, 0, rows * cols_2c * ch * CCV_GET_DATA_TYPE_SIZE(fft_type)); @@ -631,8 +735,10 @@ static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_d ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, float, fftwf_complex); #undef 
fft_execute_dft_r2c #undef fft_execute_dft_c2r + pthread_mutex_lock(&fftw_plan_mutex); fftwf_destroy_plan(pf); fftwf_destroy_plan(pinvf); + pthread_mutex_unlock(&fftw_plan_mutex); fftwf_free(fftw_a); fftwf_free(fftw_b); fftwf_free(fftw_d); @@ -642,8 +748,10 @@ static void _ccv_filter_fftw(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_d ccv_matrix_setter(d->type, ccv_matrix_getter, a->type, for_block, double, fftw_complex); #undef fft_execute_dft_r2c #undef fft_execute_dft_c2r + pthread_mutex_lock(&fftw_plan_mutex); fftw_destroy_plan(p); fftw_destroy_plan(pinv); + pthread_mutex_unlock(&fftw_plan_mutex); fftw_free(fftw_a); fftw_free(fftw_b); fftw_free(fftw_d); @@ -840,7 +948,7 @@ static void _ccv_filter_kissfft(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, cc } #endif -void _ccv_filter_direct_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) +static void _ccv_filter_direct_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t* d, int padding_pattern) { int i, j, y, x, k; int nz = b->rows * b->cols; @@ -1008,17 +1116,18 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t _for_type_b s; \ for (;;) \ { \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols) + 1); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ s = ((SGN _for_get_a(a_ptr, j, 0) + _dxx * j * j - _dx * j) - (SGN _for_get_a(a_ptr, v[k], 0) + _dxx * v[k] * v[k] - _dx * v[k])) / (2.0 * _dxx * (j - v[k])); \ if (s > z[k]) break; \ --k; \ } \ ++k; \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols) + 1); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ v[k] = j; \ z[k] = s; \ z[k + 1] = (_for_type_b)_for_max; \ } \ + assert(z[k + 1] >= a->cols - 1); \ k = 0; \ if (mx) \ { \ @@ -1026,7 +1135,7 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t { \ while (z[k + 1] < j) \ { \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ ++k; \ } \ _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k], 0), 0); \ @@ -1037,7 +1146,10 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t for (j = 0; j < a->cols; j++) \ { \ while (z[k + 1] < j) \ + { \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ ++k; \ + } \ _for_set_b(b_ptr, j, _dx * (j - v[k]) + _dxx * (j - v[k]) * (j - v[k]) SGN _for_get_a(a_ptr, v[k], 0), 0); \ } \ } \ @@ -1074,17 +1186,18 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t _for_type_b s; \ for (;;) \ { \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols) + 1); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ s = ((_for_get_b(c_ptr, i, 0) + _dyy * i * i - _dy * i) - (_for_get_b(c_ptr, v[k], 0) + _dyy * v[k] * v[k] - _dy * v[k])) / (2.0 * _dyy * (i - v[k])); \ if (s > z[k]) break; \ --k; \ } \ ++k; \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols) + 1); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ v[k] = i; \ z[k] = s; \ z[k + 1] = (_for_type_b)_for_max; \ } \ + assert(z[k + 1] >= db->rows - 1); \ k = 0; \ if (my) \ { \ @@ -1092,7 +1205,7 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t { \ while (z[k + 1] < i) \ { \ - assert(k >= 0 && k < ccv_max(db->rows, db->cols)); \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ ++k; \ } \ _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k], 0), 0); \ 
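These ccv_distance_transform hunks tighten the bounds checks around the lower-envelope bookkeeping of the generalized distance transform (Felzenszwalb–Huttenlocher style): an envelope built over n samples can hold at most n parabolas, so the stack top k never legitimately reaches n, and in the fill pass advancing k presupposes that k + 1 still names a recorded boundary, hence the extra "- 1"; the previously brace-less else branches gain the same assert. Below is a minimal standalone sketch of the 1-D pass with those invariants spelled out. dt_1d, its plain float interface, and the unit-weight parabolas are illustrative assumptions rather than ccv API; the real macro additionally carries the _dx/_dxx linear and quadratic coefficients and a sign toggle.

#include <assert.h>
#include <float.h>
#include <stdlib.h>

/* 1-D squared-distance transform via the lower envelope of parabolas
 * y = f(q) + (p - q)^2. v[] stores parabola centers, z[] the boundaries
 * between consecutive parabolas; at most n parabolas ever coexist. */
static void dt_1d(const float* f, float* d, int n)
{
    int* v = (int*)malloc(sizeof(int) * n);
    float* z = (float*)malloc(sizeof(float) * (n + 1));
    int k = 0, q, p;
    v[0] = 0;
    z[0] = -FLT_MAX;
    z[1] = FLT_MAX;
    for (q = 1; q < n; q++)
    {
        float s;
        for (;;)
        {
            assert(k >= 0 && k < n); /* the tightened upper bound from the diff */
            s = ((f[q] + q * q) - (f[v[k]] + v[k] * v[k])) / (2.f * (q - v[k]));
            if (s > z[k])
                break;
            --k; /* parabola v[k] is dominated everywhere by the one at q: pop it */
        }
        ++k;
        assert(k >= 0 && k < n);
        v[k] = q;
        z[k] = s;
        z[k + 1] = FLT_MAX;
    }
    assert(z[k + 1] >= n - 1); /* the envelope must cover the whole row */
    k = 0;
    for (p = 0; p < n; p++)
    {
        while (z[k + 1] < p)
        {
            assert(k >= 0 && k < n - 1); /* k + 1 must stay a valid boundary */
            ++k;
        }
        d[p] = (p - v[k]) * (p - v[k]) + f[v[k]];
    }
    free(v);
    free(z);
}

Since the asserts bound k by ccv_max(db->rows, db->cols), an indexing bug now surfaces as a debug-time assertion failure rather than a silent write past the end of the scratch buffers.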
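Stepping back to the FFTW hunks earlier in this same file's diff: the new fftw_plan_mutex serializes every fftwf_plan_*/fftw_plan_* and *_destroy_plan call because FFTW's planner mutates shared global state and is documented as not thread-safe, while executing distinct plans concurrently is fine. A minimal sketch of the same pattern follows; forward_r2c and plan_mutex are illustrative stand-ins, not ccv code.

#include <fftw3.h>
#include <pthread.h>

static pthread_mutex_t plan_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Plan creation and destruction touch FFTW's shared planner state and
 * must be serialized; fftwf_execute on a private plan is reentrant. */
static void forward_r2c(float* in, fftwf_complex* out, int rows, int cols)
{
    pthread_mutex_lock(&plan_mutex);
    fftwf_plan p = fftwf_plan_dft_r2c_2d(rows, cols, in, out, FFTW_ESTIMATE);
    pthread_mutex_unlock(&plan_mutex);
    fftwf_execute(p); /* safe outside the lock */
    pthread_mutex_lock(&plan_mutex);
    fftwf_destroy_plan(p);
    pthread_mutex_unlock(&plan_mutex);
}

Newer FFTW (3.3.5+) also offers fftw_make_planner_thread_safe(), but an explicit mutex works on any version and keeps the dependency floor low.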
@@ -1103,7 +1216,10 @@ void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int t for (i = 0; i < db->rows; i++) \ { \ while (z[k + 1] < i) \ + { \ + assert(k >= 0 && k < ccv_max(db->rows, db->cols) - 1); \ ++k; \ + } \ _for_set_b(b_ptr + i * db->step, j, _dy * (i - v[k]) + _dyy * (i - v[k]) * (i - v[k]) + _for_get_b(c_ptr, v[k], 0), 0); \ } \ } \ diff --git a/lib/ccv_resample.c b/lib/ccv_resample.c index 462b5aaf1..361c143a8 100644 --- a/lib/ccv_resample.c +++ b/lib/ccv_resample.c @@ -10,6 +10,7 @@ typedef struct { static void _ccv_resample_area_8u(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b) { + assert(a->cols > 0 && b->cols > 0); ccv_int_alpha* xofs = (ccv_int_alpha*)alloca(sizeof(ccv_int_alpha) * a->cols * 2); int ch = ccv_clamp(CCV_GET_CHANNEL(a->type), 1, 4); double scale_x = (double)a->cols / b->cols; @@ -101,6 +102,7 @@ typedef struct { static void _ccv_resample_area(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b) { + assert(a->cols > 0 && b->cols > 0); ccv_area_alpha_t* xofs = (ccv_area_alpha_t*)alloca(sizeof(ccv_area_alpha_t) * a->cols * 2); int ch = CCV_GET_CHANNEL(a->type); double scale_x = (double)a->cols / b->cols; @@ -215,6 +217,7 @@ static void _ccv_resample_cubic_float_only(ccv_dense_matrix_t* a, ccv_dense_matr { assert(CCV_GET_DATA_TYPE(b->type) == CCV_32F || CCV_GET_DATA_TYPE(b->type) == CCV_64F); int i, j, k, ch = CCV_GET_CHANNEL(a->type); + assert(b->cols > 0 && b->step > 0); ccv_cubic_coeffs_t* xofs = (ccv_cubic_coeffs_t*)alloca(sizeof(ccv_cubic_coeffs_t) * b->cols); float scale_x = (float)a->cols / b->cols; for (i = 0; i < b->cols; i++) @@ -224,6 +227,9 @@ static void _ccv_resample_cubic_float_only(ccv_dense_matrix_t* a, ccv_dense_matr } float scale_y = (float)a->rows / b->rows; unsigned char* buf = (unsigned char*)alloca(b->step * 4); +#ifdef __clang_analyzer__ + memset(buf, 0, b->step * 4); +#endif unsigned char* a_ptr = a->data.u8; unsigned char* b_ptr = b->data.u8; int psi = -1, siy = 0; @@ -283,6 +289,7 @@ static void _ccv_resample_cubic_integer_only(ccv_dense_matrix_t* a, ccv_dense_ma assert(CCV_GET_DATA_TYPE(b->type) == CCV_8U || CCV_GET_DATA_TYPE(b->type) == CCV_32S || CCV_GET_DATA_TYPE(b->type) == CCV_64S); int i, j, k, ch = CCV_GET_CHANNEL(a->type); int no_8u_type = (b->type & CCV_8U) ? CCV_32S : b->type; + assert(b->cols > 0); ccv_cubic_integer_coeffs_t* xofs = (ccv_cubic_integer_coeffs_t*)alloca(sizeof(ccv_cubic_integer_coeffs_t) * b->cols); float scale_x = (float)a->cols / b->cols; for (i = 0; i < b->cols; i++) @@ -293,6 +300,9 @@ static void _ccv_resample_cubic_integer_only(ccv_dense_matrix_t* a, ccv_dense_ma float scale_y = (float)a->rows / b->rows; int bufstep = b->cols * ch * CCV_GET_DATA_TYPE_SIZE(no_8u_type); unsigned char* buf = (unsigned char*)alloca(bufstep * 4); +#ifdef __clang_analyzer__ + memset(buf, 0, bufstep * 4); +#endif unsigned char* a_ptr = a->data.u8; unsigned char* b_ptr = b->data.u8; int psi = -1, siy = 0; @@ -334,6 +344,7 @@ static void _ccv_resample_cubic_integer_only(ccv_dense_matrix_t* a, ccv_dense_ma void ccv_resample(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int btype, int rows, int cols, int type) { + assert(rows > 0 && cols > 0); ccv_declare_derived_signature(sig, a->sig != 0, ccv_sign_with_format(64, "ccv_resample(%d,%d,%d)", rows, cols, type), a->sig, CCV_EOF_SIGN); btype = (btype == 0) ? 
CCV_GET_DATA_TYPE(a->type) | CCV_GET_CHANNEL(a->type) : CCV_GET_DATA_TYPE(btype) | CCV_GET_CHANNEL(a->type); ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, rows, cols, CCV_ALL_DATA_TYPE | CCV_GET_CHANNEL(a->type), btype, sig); @@ -384,6 +395,9 @@ void ccv_sample_down(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, in tab[dx * ch + k] = ((dx >= a->cols) ? a->cols * 2 - 1 - dx : dx) * ch + k; unsigned char* buf = (unsigned char*)alloca(5 * db->cols * ch * ccv_max(CCV_GET_DATA_TYPE_SIZE(db->type), sizeof(int))); int bufstep = db->cols * ch * ccv_max(CCV_GET_DATA_TYPE_SIZE(db->type), sizeof(int)); +#ifdef __clang_analyzer__ + memset(buf, 0, 5 * bufstep); +#endif unsigned char* b_ptr = db->data.u8; /* why is src_y * 4 in computing the offset of row? * Essentially, it means sy - src_y but in a manner that doesn't result negative number. @@ -441,6 +455,7 @@ void ccv_sample_up(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int ccv_object_return_if_cached(, db); int ch = CCV_GET_CHANNEL(a->type); int cols0 = a->cols - 1 - src_x; + assert(a->cols > 0 && cols0 > 0); int y, x, sy = -1 + src_y, sx = src_x * ch, k; int* tab = (int*)alloca((a->cols + src_x + 2) * ch * sizeof(int)); for (x = 0; x < a->cols + src_x + 2; x++) @@ -448,6 +463,9 @@ void ccv_sample_up(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int tab[x * ch + k] = ((x >= a->cols) ? a->cols * 2 - 1 - x : x) * ch + k; unsigned char* buf = (unsigned char*)alloca(3 * db->cols * ch * ccv_max(CCV_GET_DATA_TYPE_SIZE(db->type), sizeof(int))); int bufstep = db->cols * ch * ccv_max(CCV_GET_DATA_TYPE_SIZE(db->type), sizeof(int)); +#ifdef __clang_analyzer__ + memset(buf, 0, 3 * bufstep); +#endif unsigned char* b_ptr = db->data.u8; /* why src_y * 2: the same argument as in ccv_sample_down */ #define for_block(_for_get_a, _for_set, _for_get, _for_set_b) \ diff --git a/lib/ccv_swt.c b/lib/ccv_swt.c index e1003add5..1ff7fe4c6 100644 --- a/lib/ccv_swt.c +++ b/lib/ccv_swt.c @@ -222,7 +222,7 @@ void ccv_swt(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_swt_pa ccv_matrix_free(dy); } -ccv_array_t* _ccv_swt_connected_component(ccv_dense_matrix_t* a, int ratio, int min_height, int max_height, int min_area) +static ccv_array_t* _ccv_swt_connected_component(ccv_dense_matrix_t* a, int ratio, int min_height, int max_height, int min_area) { int i, j, k; int* a_ptr = a->data.i32; @@ -538,7 +538,7 @@ static ccv_array_t* _ccv_swt_merge_textline(ccv_array_t* letters, ccv_swt_param_ } #define less_than(a, b, aux) ((a)->center.x < (b)->center.x) -CCV_IMPLEMENT_QSORT(_ccv_sort_letters, ccv_letter_t*, less_than) +static CCV_IMPLEMENT_QSORT(_ccv_sort_letters, ccv_letter_t*, less_than) #undef less_than static ccv_array_t* _ccv_swt_break_words(ccv_array_t* textline, ccv_swt_param_t params) @@ -550,6 +550,7 @@ static ccv_array_t* _ccv_swt_break_words(ccv_array_t* textline, ccv_swt_param_t if (t->neighbors - 1 > n) n = t->neighbors - 1; } + assert(n > 0); int* buffer = (int*)alloca(n * sizeof(int)); ccv_array_t* words = ccv_array_new(sizeof(ccv_rect_t), textline->rnum, 0); for (i = 0; i < textline->rnum; i++) @@ -657,7 +658,7 @@ ccv_array_t* ccv_swt_detect_words(ccv_dense_matrix_t* a, ccv_swt_param_t params) ccv_array_t* idx = 0; int ntl = ccv_array_group(textline, &idx, _ccv_is_same_textline, params.same_word_thresh); ccv_array_t* words; - if (params.breakdown) + if (params.breakdown && ntl > 0) { textline2 = ccv_array_new(sizeof(ccv_textline_t), ntl, 0); ccv_array_zero(textline2); @@ -721,6 +722,7 @@ ccv_array_t* 
ccv_swt_detect_words(ccv_dense_matrix_t* a, ccv_swt_param_t params) } if (params.scale_invariant && params.min_neighbors) { + assert(all_words); // de-dup logic, similar to what BBF / DPM have ccv_array_t* idx = 0; int ntl = ccv_array_group(all_words, &idx, _ccv_is_same_textline, params.same_word_thresh); diff --git a/lib/ccv_tld.c b/lib/ccv_tld.c index 2b70238b7..2a2e74e0e 100644 --- a/lib/ccv_tld.c +++ b/lib/ccv_tld.c @@ -272,7 +272,7 @@ static inline float _ccv_tld_rect_intersect(const ccv_rect_t r1, const ccv_rect_ INTERNAL_CATCH_UNIQUE_NAME(pix) = INTERNAL_CATCH_UNIQUE_NAME(ix); \ ccv_comp_t new_comp; \ new_comp.rect = ccv_rect(INTERNAL_CATCH_UNIQUE_NAME(ix), INTERNAL_CATCH_UNIQUE_NAME(iy), INTERNAL_CATCH_UNIQUE_NAME(width), INTERNAL_CATCH_UNIQUE_NAME(height)); \ - new_comp.id = INTERNAL_CATCH_UNIQUE_NAME(s); + new_comp.classification.id = INTERNAL_CATCH_UNIQUE_NAME(s); #define end_for_each_box } } end_for_each_size } static void _ccv_tld_box_percolate_down(ccv_array_t* good, int i) @@ -286,20 +286,14 @@ static void _ccv_tld_box_percolate_down(ccv_array_t* good, int i) if (left < good->rnum) { ccv_comp_t* left_comp = (ccv_comp_t*)ccv_array_get(good, left); - if (left_comp->confidence < smallest_comp->confidence) - { + if (left_comp->classification.confidence < smallest_comp->classification.confidence) smallest = left; - smallest_comp = left_comp; - } } if (right < good->rnum) { ccv_comp_t* right_comp = (ccv_comp_t*)ccv_array_get(good, right); - if (right_comp->confidence < smallest_comp->confidence) - { + if (right_comp->classification.confidence < smallest_comp->classification.confidence) smallest = right; - smallest_comp = right_comp; - } } if (smallest == i) break; @@ -320,7 +314,7 @@ static void _ccv_tld_box_percolate_up(ccv_array_t* good, int smallest) break; ccv_comp_t* parent_comp = (ccv_comp_t*)ccv_array_get(good, parent); ccv_comp_t* smallest_comp = (ccv_comp_t*)ccv_array_get(good, smallest); - if (smallest_comp->confidence < parent_comp->confidence) + if (smallest_comp->classification.confidence < parent_comp->classification.confidence) { smallest = parent; smallest_comp = parent_comp; @@ -336,7 +330,7 @@ static void _ccv_tld_box_percolate_up(ccv_array_t* good, int smallest) { ccv_comp_t* other_comp = (ccv_comp_t*)ccv_array_get(good, other); // if current one is no smaller than the other one, stop, and this requires a percolating down - if (other_comp->confidence < smallest_comp->confidence) + if (other_comp->classification.confidence < smallest_comp->classification.confidence) break; } } @@ -351,14 +345,16 @@ static ccv_comp_t _ccv_tld_generate_box_for(ccv_size_t image_size, ccv_size_t in ccv_array_t* abad = *bad = ccv_array_new(sizeof(ccv_comp_t), 64, 0); double max_overlap = -DBL_MAX; ccv_comp_t best_box = { - .id = 0, + .classification = { + .id = 0, + }, .rect = ccv_rect(0, 0, 0, 0), }; int i = 0; for_each_box(comp, input_size.width, input_size.height, params.interval, params.shift, image_size.width, image_size.height) double overlap = _ccv_tld_rect_intersect(comp.rect, box); comp.neighbors = i++; - comp.confidence = overlap; + comp.classification.confidence = overlap; if (overlap > params.include_overlap) { if (overlap > max_overlap) @@ -372,7 +368,7 @@ static ccv_comp_t _ccv_tld_generate_box_for(ccv_size_t image_size, ccv_size_t in _ccv_tld_box_percolate_up(agood, agood->rnum - 1); } else { ccv_comp_t* p = (ccv_comp_t*)ccv_array_get(agood, 0); - if (overlap > p->confidence) + if (overlap > p->classification.confidence) { *(ccv_comp_t*)ccv_array_get(agood, 0) = comp; 
_ccv_tld_box_percolate_down(agood, 0); @@ -395,7 +391,7 @@ static void _ccv_tld_ferns_feature_for(ccv_ferns_t* ferns, ccv_dense_matrix_t* a { ccv_dense_matrix_t roi = ccv_dense_matrix(box.rect.height, box.rect.width, CCV_GET_DATA_TYPE(a->type) | CCV_GET_CHANNEL(a->type), ccv_get_dense_matrix_cell(a, box.rect.y, box.rect.x, 0), 0); roi.step = a->step; - ccv_ferns_feature(ferns, &roi, box.id, fern); + ccv_ferns_feature(ferns, &roi, box.classification.id, fern); } else { float rotate_x = (deform_angle * 2 * dsfmt_genrand_close_open(dsfmt) - deform_angle) * CCV_PI / 180; float rotate_y = (deform_angle * 2 * dsfmt_genrand_close_open(dsfmt) - deform_angle) * CCV_PI / 180; @@ -435,7 +431,7 @@ static void _ccv_tld_ferns_feature_for(ccv_ferns_t* ferns, ccv_dense_matrix_t* a ccv_perspective_transform(&roi, &b, 0, m00, m01, m02, m10, m11, m12, m20, m21, m22); roi = ccv_dense_matrix(box.rect.height, box.rect.width, CCV_GET_DATA_TYPE(b->type) | CCV_GET_CHANNEL(b->type), ccv_get_dense_matrix_cell(b, padding_top, padding_left, 0), 0); roi.step = b->step; - ccv_ferns_feature(ferns, &roi, box.id, fern); + ccv_ferns_feature(ferns, &roi, box.classification.id, fern); ccv_matrix_free(b); } } @@ -840,8 +836,8 @@ static ccv_array_t* _ccv_tld_long_term_detect(ccv_tld_t* tld, ccv_dense_matrix_t _ccv_tld_box_variance(sat, sqsat, box.rect) > tld->var_thres) { _ccv_tld_ferns_feature_for(tld->ferns, ga, box, fern, 0, 0, 0, 0); - box.confidence = ccv_ferns_predict(tld->ferns, fern); - if (box.confidence > tld->ferns_thres) + box.classification.confidence = ccv_ferns_predict(tld->ferns, fern); + if (box.classification.confidence > tld->ferns_thres) { if (tld->top->rnum < tld->params.top_n) { @@ -849,7 +845,7 @@ static ccv_array_t* _ccv_tld_long_term_detect(ccv_tld_t* tld, ccv_dense_matrix_t _ccv_tld_box_percolate_up(tld->top, tld->top->rnum - 1); } else { ccv_comp_t* top_box = (ccv_comp_t*)ccv_array_get(tld->top, 0); - if (top_box->confidence < box.confidence) + if (top_box->classification.confidence < box.classification.confidence) { *(ccv_comp_t*)ccv_array_get(tld->top, 0) = box; _ccv_tld_box_percolate_down(tld->top, 0); @@ -871,7 +867,7 @@ static ccv_array_t* _ccv_tld_long_term_detect(ccv_tld_t* tld, ccv_dense_matrix_t if (c > tld->nnc_thres) { // save only the conservative confidence (50% samples) - box->confidence = _ccv_tld_sv_classify(tld, b, ccv_max((int)(tld->sv[1]->rnum * tld->params.validate_set + 0.5), 1), 0, &anyp, &anyn); + box->classification.confidence = _ccv_tld_sv_classify(tld, b, ccv_max((int)(tld->sv[1]->rnum * tld->params.validate_set + 0.5), 1), 0, &anyp, &anyn); ccv_array_push(seq, box); } ccv_matrix_free(b); @@ -914,9 +910,9 @@ ccv_comp_t ccv_tld_track_object(ccv_tld_t* tld, ccv_dense_matrix_t* a, ccv_dense int anyp = 0, anyn = 0; ccv_dense_matrix_t* c = 0; _ccv_tld_fetch_patch(tld, gb, &c, 0, result.rect); - result.confidence = _ccv_tld_sv_classify(tld, c, 0, 0, &anyp, &anyn); + result.classification.confidence = _ccv_tld_sv_classify(tld, c, 0, 0, &anyp, &anyn); ccv_matrix_free(c); - if (result.confidence > tld->nnc_verify_thres) + if (result.classification.confidence > tld->nnc_verify_thres) verified = 1; } } @@ -953,7 +949,7 @@ ccv_comp_t ccv_tld_track_object(ccv_tld_t* tld, ccv_dense_matrix_t* a, ccv_dense comps[idx].rect.y += r1.rect.y; comps[idx].rect.width += r1.rect.width; comps[idx].rect.height += r1.rect.height; - comps[idx].confidence += r1.confidence; + comps[idx].classification.confidence += r1.classification.confidence; } ccv_array_clear(dd); for(i = 0; i < ncomp; i++) @@ -965,7 
+961,7 @@ ccv_comp_t ccv_tld_track_object(ccv_tld_t* tld, ccv_dense_matrix_t* a, ccv_dense comp.rect.width = (comps[i].rect.width * 2 + n) / (2 * n); comp.rect.height = (comps[i].rect.height * 2 + n) / (2 * n); comp.neighbors = comps[i].neighbors; - comp.confidence = comps[i].confidence / n; + comp.classification.confidence = comps[i].classification.confidence / n; ccv_array_push(dd, &comp); } ccv_array_free(idx_dd); @@ -985,7 +981,7 @@ ccv_comp_t ccv_tld_track_object(ccv_tld_t* tld, ccv_dense_matrix_t* a, ccv_dense for (i = 0; i < dd->rnum; i++) { ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(dd, i); - if (_ccv_tld_rect_intersect(result.rect, comp->rect) < 0.5 && comp->confidence > result.confidence) + if (_ccv_tld_rect_intersect(result.rect, comp->rect) < 0.5 && comp->classification.confidence > result.classification.confidence) { ++confident_matches; ddcomp = comp; diff --git a/lib/ccv_util.c b/lib/ccv_util.c index 4992d538b..ae79a2695 100644 --- a/lib/ccv_util.c +++ b/lib/ccv_util.c @@ -247,11 +247,14 @@ static void _ccv_dense_vector_expand(ccv_sparse_matrix_t* mat, ccv_dense_vector_ static void _ccv_sparse_matrix_expand(ccv_sparse_matrix_t* mat) { + assert(mat->prime >= 0 && mat->prime < sizeof(_ccv_get_sparse_prime) / sizeof(int)); int length = CCV_GET_SPARSE_PRIME(mat->prime); mat->prime++; + assert(mat->prime >= 0 && mat->prime < sizeof(_ccv_get_sparse_prime) / sizeof(int)); int new_length = CCV_GET_SPARSE_PRIME(mat->prime); ccv_dense_vector_t* new_vector = (ccv_dense_vector_t*)ccmalloc(new_length * sizeof(ccv_dense_vector_t)); int i; + assert(new_length > 0); for (i = 0; i < new_length; i++) { new_vector[i].index = -1; @@ -382,10 +385,10 @@ void ccv_set_sparse_matrix_cell(ccv_sparse_matrix_t* mat, int row, int col, void (aux)[(int)(&(i2) - (array))] = td; \ CCV_SWAP(i1, i2, t); } -CCV_IMPLEMENT_QSORT_EX(_ccv_indice_uchar_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_uchar_data, unsigned char*); -CCV_IMPLEMENT_QSORT_EX(_ccv_indice_int_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_int_data, int*); -CCV_IMPLEMENT_QSORT_EX(_ccv_indice_float_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_float_data, float*); -CCV_IMPLEMENT_QSORT_EX(_ccv_indice_double_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_double_data, double*); +static CCV_IMPLEMENT_QSORT_EX(_ccv_indice_uchar_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_uchar_data, unsigned char*); +static CCV_IMPLEMENT_QSORT_EX(_ccv_indice_int_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_int_data, int*); +static CCV_IMPLEMENT_QSORT_EX(_ccv_indice_float_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_float_data, float*); +static CCV_IMPLEMENT_QSORT_EX(_ccv_indice_double_sort, int, _ccv_indice_less_than, _ccv_swap_indice_and_double_data, double*); void ccv_compress_sparse_matrix(ccv_sparse_matrix_t* mat, ccv_compressed_sparse_matrix_t** csm) { @@ -510,6 +513,7 @@ int ccv_matrix_eq(ccv_matrix_t* a, ccv_matrix_t* b) return -1; if (da->cols != db->cols) return -1; + float epsilon = (CCV_GET_DATA_TYPE(db->type) == CCV_8U || CCV_GET_DATA_TYPE(db->type) == CCV_32S || CCV_GET_DATA_TYPE(db->type) == CCV_64S) ? 
1 : 1e-4; int i, j, ch = CCV_GET_CHANNEL(da->type); unsigned char* a_ptr = da->data.u8; unsigned char* b_ptr = db->data.u8; @@ -518,7 +522,7 @@ int ccv_matrix_eq(ccv_matrix_t* a, ccv_matrix_t* b) { \ for (j = 0; j < da->cols * ch; j++) \ { \ - if (fabs(_for_get(b_ptr, j, 0) - _for_get(a_ptr, j, 0)) > 1e-4) \ + if (fabs((double)(_for_get(b_ptr, j, 0) - _for_get(a_ptr, j, 0))) > epsilon) \ return -1; \ } \ a_ptr += da->step; \ diff --git a/lib/configure b/lib/configure index 9a7a816e7..d24765c2e 100755 --- a/lib/configure +++ b/lib/configure @@ -677,6 +677,7 @@ SHELL' ac_subst_files='' ac_user_opts=' enable_option_checking +enable_neon with_cuda ' ac_precious_vars='build_alias @@ -1293,10 +1294,16 @@ if test -n "$ac_init_help"; then esac cat <<\_ACEOF +Optional Features: + --disable-option-checking ignore unrecognized --enable/--with options + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --enable-neon optimize with NEON instruction set + Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) - --with-cuda CUDA installation + --with-cuda CUDA installation [ARG=/usr/local/cuda] Some influential environment variables: CC C compiler command @@ -2108,7 +2115,29 @@ fi -# check for sse2 support, libpng, libjpeg, fftw3, liblinear, cblas, Accelerate framework, avformat, avcodec, swscale +# check for ARM NEON support +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking neon" >&5 +$as_echo_n "checking neon... " >&6; } +# Check whether --enable-neon was given. +if test "${enable_neon+set}" = set; then : + enableval=$enable_neon; neon_support=$enableval +else + neon_support="no" +fi + +if test "$neon_support" = yes; then + DEFINE_MACROS="$DEFINE_MACROS-D HAVE_NEON " + + MKCFLAGS="$MKCFLAGS-mfpu=neon " + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +# check for libpng, libjpeg, fftw3, liblinear, cblas, Accelerate framework, avformat, avcodec, swscale ac_ext=c ac_cpp='$CPP $CPPFLAGS' ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' @@ -3296,14 +3325,6 @@ fi done -ac_fn_c_check_header_mongrel "$LINENO" "xmmintrin.h" "ac_cv_header_xmmintrin_h" "$ac_includes_default" -if test "x$ac_cv_header_xmmintrin_h" = xyes; then : - DEFINE_MACROS="$DEFINE_MACROS-D HAVE_SSE2 " - MKCFLAGS="$MKCFLAGS-msse2 " - -fi - - ac_fn_c_check_header_mongrel "$LINENO" "png.h" "ac_cv_header_png_h" "$ac_includes_default" if test "x$ac_cv_header_png_h" = xyes; then : DEFINE_MACROS="$DEFINE_MACROS-D HAVE_LIBPNG " @@ -3323,7 +3344,7 @@ fi ac_fn_c_check_header_mongrel "$LINENO" "fftw3.h" "ac_cv_header_fftw3_h" "$ac_includes_default" if test "x$ac_cv_header_fftw3_h" = xyes; then : DEFINE_MACROS="$DEFINE_MACROS-D HAVE_FFTW3 " - MKLDFLAGS="$MKLDFLAGS-lfftw3 -lfftw3f " + MKLDFLAGS="$MKLDFLAGS-lfftw3 -lfftw3f -lpthread " fi @@ -3339,7 +3360,15 @@ fi ac_fn_c_check_header_mongrel "$LINENO" "cblas.h" "ac_cv_header_cblas_h" "$ac_includes_default" if test "x$ac_cv_header_cblas_h" = xyes; then : DEFINE_MACROS="$DEFINE_MACROS-D HAVE_CBLAS " - MKLDFLAGS="$MKLDFLAGS-lcblas " + MKLDFLAGS="$MKLDFLAGS-lblas " + +fi + + +ac_fn_c_check_header_mongrel "$LINENO" "tesseract/capi.h" "ac_cv_header_tesseract_capi_h" "$ac_includes_default" +if test "x$ac_cv_header_tesseract_capi_h" = xyes; then : + DEFINE_MACROS="$DEFINE_MACROS-D HAVE_TESSERACT " + MKLDFLAGS="$MKLDFLAGS-ltesseract " 
fi @@ -3381,7 +3410,59 @@ if test $CC = clang; then ac_fn_c_check_header_mongrel "$LINENO" "dispatch/dispatch.h" "ac_cv_header_dispatch_dispatch_h" "$ac_includes_default" if test "x$ac_cv_header_dispatch_dispatch_h" = xyes; then : DEFINE_MACROS="$DEFINE_MACROS-D USE_DISPATCH " - MKLDFLAGS="$MKLDFLAGS-ldispatch " + MKCFLAGS="$MKCFLAGS-fblocks " + +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dispatch_apply in -ldispatch" >&5 +$as_echo_n "checking for dispatch_apply in -ldispatch... " >&6; } +if ${ac_cv_lib_dispatch_dispatch_apply+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-ldispatch $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char dispatch_apply (); +int +main () +{ +return dispatch_apply (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_dispatch_dispatch_apply=yes +else + ac_cv_lib_dispatch_dispatch_apply=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_dispatch_dispatch_apply" >&5 +$as_echo "$ac_cv_lib_dispatch_dispatch_apply" >&6; } +if test "x$ac_cv_lib_dispatch_dispatch_apply" = xyes; then : + MKLDFLAGS="$MKLDFLAGS-ldispatch -lBlocksRuntime " + +fi + +fi +# check for SSE2 support only if we don't enable NEON explicitly +if test "$neon_support" != yes; then + ac_fn_c_check_header_mongrel "$LINENO" "xmmintrin.h" "ac_cv_header_xmmintrin_h" "$ac_includes_default" +if test "x$ac_cv_header_xmmintrin_h" = xyes; then : + DEFINE_MACROS="$DEFINE_MACROS-D HAVE_SSE2 " + MKCFLAGS="$MKCFLAGS-msse2 " fi @@ -3517,7 +3598,7 @@ fi $as_echo "$ac_cv_lib_gsl_gsl_blas_dgemm" >&6; } if test "x$ac_cv_lib_gsl_gsl_blas_dgemm" = xyes; then : DEFINE_MACROS="$DEFINE_MACROS-D HAVE_GSL " - MKLDFLAGS="$MKLDFLAGS-lgsl " + MKLDFLAGS="$MKLDFLAGS-lgsl -lgslcblas " fi @@ -3556,13 +3637,6 @@ else $as_echo "no" >&6; } fi -# check for block support, only enable dispatch when this is presented -if test $CC = clang; then : - MKCFLAGS="$MKCFLAGS-fblocks " - MKLDFLAGS="$MKLDFLAGS-lBlocksRuntime " - -fi - # for display only, concatenate CFLAGS and DEFINE_MACROS DISCFLAGS="$MKCFLAGS$DEFINE_MACROS" diff --git a/lib/configure.ac b/lib/configure.ac index 2cf20198d..2dc6315b1 100644 --- a/lib/configure.ac +++ b/lib/configure.ac @@ -11,19 +11,30 @@ AC_CHECK_PROG(CC, clang, clang) # check if nvcc exists AC_CHECK_PROG(NVCC, nvcc, nvcc) -# check for sse2 support, libpng, libjpeg, fftw3, liblinear, cblas, Accelerate framework, avformat, avcodec, swscale -AC_CHECK_HEADER(xmmintrin.h, - [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_SSE2 "]) AC_SUBST(MKCFLAGS, ["$MKCFLAGS-msse2 "])]) +# check for ARM NEON support +AC_MSG_CHECKING([neon]) +AC_ARG_ENABLE(neon, [AS_HELP_STRING([--enable-neon], [optimize with NEON instruction set])], [neon_support=$enableval], [neon_support="no"]) +if test "$neon_support" = yes; then + AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_NEON "]) + AC_SUBST(MKCFLAGS, ["$MKCFLAGS-mfpu=neon "]) + AC_MSG_RESULT(yes) +else + AC_MSG_RESULT(no) +fi + +# check for libpng, libjpeg, fftw3, liblinear, cblas, Accelerate framework, avformat, avcodec, swscale AC_CHECK_HEADER(png.h, [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_LIBPNG "]) 
AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lpng "])]) AC_CHECK_HEADER(jpeglib.h, [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_LIBJPEG "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-ljpeg "])]) AC_CHECK_HEADER(fftw3.h, - [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_FFTW3 "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lfftw3 -lfftw3f "])]) + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_FFTW3 "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lfftw3 -lfftw3f -lpthread "])]) AC_CHECK_HEADER(linear.h, [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_LIBLINEAR "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-llinear "])]) AC_CHECK_HEADER(cblas.h, - [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_CBLAS "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lcblas "])]) + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_CBLAS "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lblas "])]) +AC_CHECK_HEADER(tesseract/capi.h, + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_TESSERACT "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-ltesseract "])]) AC_CHECK_HEADER([Accelerate/Accelerate.h], [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_ACCELERATE_FRAMEWORK "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-framework Accelerate "])]) AC_CHECK_HEADER([libavcodec/avcodec.h], @@ -35,18 +46,24 @@ AC_CHECK_HEADER([libswscale/swscale.h], # only check dispatch if compiled with clang if test $CC = clang; then AC_CHECK_HEADER([dispatch/dispatch.h], - [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D USE_DISPATCH "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-ldispatch "])]) + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D USE_DISPATCH "]) AC_SUBST(MKCFLAGS, ["$MKCFLAGS-fblocks "])]) + AC_CHECK_LIB(dispatch, dispatch_apply, [AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-ldispatch -lBlocksRuntime "])]) +fi +# check for SSE2 support only we don't enable NEON explicitly +if test "$neon_support" != yes; then + AC_CHECK_HEADER(xmmintrin.h, + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_SSE2 "]) AC_SUBST(MKCFLAGS, ["$MKCFLAGS-msse2 "])]) fi # check for gsl, and I need to first check these two before I can check gsl AC_CHECK_LIB(m, cos) AC_CHECK_LIB(gslcblas, cblas_dgemm) AC_CHECK_LIB(gsl, gsl_blas_dgemm, - [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_GSL "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lgsl "])]) + [AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_GSL "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lgsl -lgslcblas "])]) # prepare for cuda AC_MSG_CHECKING([cuda]) -AC_ARG_WITH(cuda, [AS_HELP_STRING([--with-cuda], [CUDA installation])], [cuda_prefix=$withval], [cuda_prefix="/usr/local/cuda"]) +AC_ARG_WITH(cuda, [AS_HELP_STRING([--with-cuda], [CUDA installation [ARG=/usr/local/cuda]])], [cuda_prefix=$withval], [cuda_prefix="/usr/local/cuda"]) if [[ -d "$cuda_prefix" ]]; then AC_SUBST(DEFINE_MACROS, ["$DEFINE_MACROS-D HAVE_CUDA "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lcuda -lcudart -lcublas "]) @@ -62,9 +79,6 @@ else AC_MSG_RESULT(no) fi -# check for block support, only enable dispatch when this is presented -AS_IF([test $CC = clang], [AC_SUBST(MKCFLAGS, ["$MKCFLAGS-fblocks "]) AC_SUBST(MKLDFLAGS, ["$MKLDFLAGS-lBlocksRuntime "])]) - # for display only, concatenate CFLAGS and DEFINE_MACROS AC_SUBST(DISCFLAGS, ["$MKCFLAGS$DEFINE_MACROS"]) diff --git a/lib/cuda/.ycm_extra_conf.py b/lib/cuda/.ycm_extra_conf.py new file mode 100644 index 000000000..6102c0c63 --- /dev/null +++ b/lib/cuda/.ycm_extra_conf.py @@ -0,0 +1,60 @@ +import os +import ycm_core +from clang_helpers import PrepareClangFlags + +flags = [ + '-ffast-math', + '-Wall', + '-msse2', + '-D HAVE_SSE2', + '-D HAVE_LIBJPEG', + '-D HAVE_LIBPNG', + '-D HAVE_GSL', + '-D HAVE_FFTW3', + '-D 
HAVE_LIBLINEAR', + '-D HAVE_CBLAS', + '-D HAVE_AVCODEC', + '-D HAVE_AVFORMAT', + '-D HAVE_SWSCALE', + '-I', + '..' +] + +def DirectoryOfThisScript(): + return os.path.dirname(os.path.abspath(__file__)) + +def MakeRelativePathsInFlagsAbsolute(flags, working_directory): + if not working_directory: + return flags + new_flags = [] + make_next_absolute = False + path_flags = ['-isystem', '-I', '-iquote', '--sysroot='] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith('/'): + new_flag = os.path.join(working_directory, flag) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith(path_flag): + path = flag[len(path_flag):] + new_flag = path_flag + os.path.join(working_directory, path) + break + + if new_flag: + new_flags.append(new_flag) + return new_flags + +def FlagsForFile(filename): + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to) + return { + 'flags' : final_flags, + 'do_cache' : True + } diff --git a/lib/cuda/cwc.h b/lib/cuda/cwc.h index d5388d3a1..0095269f2 100644 --- a/lib/cuda/cwc.h +++ b/lib/cuda/cwc.h @@ -9,7 +9,7 @@ #include "../ccv.h" void cwc_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch); -void cwc_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch); +void cwc_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch); void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params); void cwc_convnet_compact(ccv_convnet_t* convnet); diff --git a/lib/cuda/cwc_convnet.cu b/lib/cuda/cwc_convnet.cu index 5a5f7cadc..e0c820785 100644 --- a/lib/cuda/cwc_convnet.cu +++ b/lib/cuda/cwc_convnet.cu @@ -10,6 +10,7 @@ extern "C" { #include #include #include "../3rdparty/sqlite3/sqlite3.h" +#include "../inl/ccv_convnet_inl.h" // this structure holds intermediate on-device memory representation of convnet @@ -18,16 +19,29 @@ typedef struct { struct { float* input; // input per batch int* c; // class + float* out; // confidence score float** dor; // dropout regulator, in this version I generate dor on CPU because it is lightweight and gsl has shuffle method, which is better suited for this (and faster than per-node randomization) - } host; + } host[2]; // on device struct { - cudaStream_t stream; - cublasHandle_t cublas; + // this is modeled after Alex's "One Weird Trick", there are 3 join points for me: 1). forward pass from data parallelism to model parallelism; 2). compute logistic loss; 3). 
backward pass from model parallelism to data parallelism; + cudaStream_t data_stream; // based on above description, we need 3 streams, one stream for data parallelism + cudaStream_t model_stream[2]; // two streams for model parallelism (to overlap data transfer and computation) + // based on above description, we need 6 events (3 join points): + // 0: in forward pass, when data parallelism is done, and model parallelism will start; + // 1: in forward pass, the first stream's model parallelism is done; + // 2: in forward pass, the second stream's model parallelism is done; + // 3: in backward pass, when the error propagation starts (thus, model parallelism starts); + // 4: in backward pass, the first stream's model parallelism is done; + // 5: in backward pass, the second stream's model parallelism is done; + cudaEvent_t joint[6]; + cublasHandle_t data_cublas; // the same, just cublas handle to stream + cublasHandle_t model_cublas[2]; // the same, just cublas handle to stream float* input; int* c; + float* out; float** dor; - } device; + } device[2]; } cwc_convnet_context_t; typedef struct { @@ -36,82 +50,59 @@ typedef struct { typedef struct { int batch; + int tops; + int dual_device; ccv_convnet_layer_train_param_t* layer_params; - ccv_convnet_layer_t* layers; - ccv_convnet_layer_t* configurations; - ccv_convnet_layer_t* momentums; - float** forwards; // the forward output layers - float** backwards; // the backwards output layer - float** denoms; // the denominator for rnorm layer, thus, backprop can reuse the value - float* unit; // the unit vector for a batch, ease the GEMM on full-connect layer - float* scratch; // the scratch space for temporary reuse, it will be max(wnum, input rows * cols * channels + output rows * cols * channels) + struct { + ccv_convnet_layer_t* layers; + ccv_convnet_layer_t* configurations; + ccv_convnet_layer_t* momentums; + float** forwards; // the forward output layers + float** backwards; // the backwards output layer + float** denoms; // the denominator for rnorm layer, thus, backprop can reuse the value + float** scans; // the scan layer to reformat outputs + float* unit; // the unit vector for a batch, ease the GEMM on full-connect layer + float* scratch; // the scratch space for temporary reuse, it will be max(wnum, input rows * cols * channels + output rows * cols * channels) + } device[2]; cwc_convnet_context_t contexts[2]; - cwc_convnet_stats_t device; + cwc_convnet_stats_t stats; } cwc_convnet_t; +typedef struct { + int x, y, z; +} cwc_convnet_kernel_vary_t; + typedef struct { struct { - int x, y, z; - } forward_propagate; - struct { - int x, y, z; - } backward_propagate_coeff; - struct { - int x, y, z; - } backward_propagate_error; + cwc_convnet_kernel_vary_t forward; + struct { + cwc_convnet_kernel_vary_t coefficient; + cwc_convnet_kernel_vary_t gradient; + } backward; + } convolutional; } cwc_convnet_layer_vary_t; - #define VARY(x) ((cwc_convnet_layer_vary_t*)((x)->reserved)) #define GPU(x) ((cwc_convnet_t*)((x)->reserved)) #define BATCH_PER_BLOCK (8) +#define THREAD_PER_BLOCK (16) -inline static void _cwc_convnet_layer_deduce_output_format(ccv_convnet_layer_t* layer, int* rows, int* cols) -{ - assert(rows != 0 && cols != 0); - switch(layer->type) - { - case CCV_CONVNET_CONVOLUTIONAL: - assert(layer->net.convolutional.rows % 2); // as of now, don't support even number of kernel size - assert(layer->net.convolutional.cols % 2); - assert((layer->input.matrix.rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows) % 
layer->net.convolutional.strides == 0); - assert((layer->input.matrix.cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols) % layer->net.convolutional.strides == 0); - *rows = (layer->input.matrix.rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; - *cols = (layer->input.matrix.cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; - break; - case CCV_CONVNET_FULL_CONNECT: - *rows = layer->net.full_connect.count; - *cols = 1; - break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - *rows = layer->input.matrix.rows; - *cols = layer->input.matrix.cols; - break; - case CCV_CONVNET_MAX_POOL: - case CCV_CONVNET_AVERAGE_POOL: - assert((layer->input.matrix.rows + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); - assert((layer->input.matrix.cols + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); - *rows = (layer->input.matrix.rows + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; - *cols = (layer->input.matrix.cols + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; - break; - } -} - -static int _cwc_convnet_layer_use_multi_way(ccv_convnet_layer_t* layer) +static int _cwc_convnet_layer_use_rows(ccv_convnet_layer_t* layer) { - return layer->input.matrix.channels <= 8; + return layer->input.matrix.channels <= 8 && layer->input.matrix.partition == 1; } -static void _cwc_convnet_reorder_convolutional_weights_onto_device(float* w, float* ow, int wnum, int filters, int channels) +static void _cwc_convnet_reorder_convolutional_weights_onto_device(float* w, float* ow, int wnum, int filters, int channels, int channel_partition) { - assert(wnum % (filters * channels) == 0); + int channels_per_partition = channels / channel_partition; + assert(wnum % (filters * channels_per_partition) == 0); float* iw = (float*)ccmalloc(sizeof(float) * wnum); - int count = wnum / (filters * channels); + int count = wnum / (filters * channels_per_partition); int i, j, k; - for (i = 0; i < channels; i++) + for (i = 0; i < channels_per_partition; i++) for (j = 0; j < count; j++) for (k = 0; k < filters; k++) - iw[i * count * filters + j * filters + k] = w[k * count * channels + j * channels + i]; + iw[i * count * filters + j * filters + k] = w[k * count * channels_per_partition + j * channels_per_partition + i]; cudaMemcpy(ow, iw, sizeof(float) * wnum, cudaMemcpyHostToDevice); ccfree(iw); } @@ -130,250 +121,574 @@ static void _cwc_convnet_reorder_full_connect_weights_onto_device(float* w, floa ccfree(iw); } -static void _cwc_convnet_alloc_reserved(ccv_convnet_t* convnet, int batch, ccv_convnet_layer_train_param_t* layer_params) +static void _cwc_convnet_alloc_layers(ccv_convnet_t* convnet, int device_id) { - if (GPU(convnet) && (GPU(convnet)->batch != batch || GPU(convnet)->layer_params != layer_params)) - ccv_convnet_compact(convnet); - else if (GPU(convnet)) - return; // it is allocated properly, no-op - convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10); - GPU(convnet)->batch = batch; - GPU(convnet)->layer_params = layer_params; - 
GPU(convnet)->device.memory_usage = 0; - cwc_convnet_layer_vary_t* layer_vary = (cwc_convnet_layer_vary_t*)(GPU(convnet) + 1); - memset(layer_vary, 0, sizeof(cwc_convnet_layer_vary_t) * convnet->count); - GPU(convnet)->layers = (ccv_convnet_layer_t*)(layer_vary + convnet->count); - int i, j, out_rows, out_cols; - memcpy(GPU(convnet)->layers, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); - ccv_convnet_layer_t* layers = GPU(convnet)->layers; - // configurations (the backprop coeffs) - size_t scratch_space = 0; - size_t unit_size = batch; - for (i = 0; i < convnet->count; i++) - if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) - { - int use_multi_way = _cwc_convnet_layer_use_multi_way(layers + i); - layers[i].reserved = layer_vary + i; - _cwc_convnet_layer_deduce_output_format(layers + i, &out_rows, &out_cols); - scratch_space = ccv_max(scratch_space, layers[i].wnum); - scratch_space = ccv_max(scratch_space, - out_rows * out_cols * layers[i].net.convolutional.count * batch + // output layer reorder - layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch + // input layer reorder - layers[i].net.convolutional.count * layers[i].input.matrix.channels * layers[i].net.convolutional.rows * layers[i].net.convolutional.cols * (use_multi_way ? out_rows : 1) * (batch / BATCH_PER_BLOCK)); // unconsolidated weights output - if (use_multi_way) - unit_size = ccv_max(unit_size, out_rows * (batch / BATCH_PER_BLOCK)); - } - GPU(convnet)->scratch = 0; - cudaMalloc(&GPU(convnet)->scratch, sizeof(float) * scratch_space); - assert(GPU(convnet)->scratch); - GPU(convnet)->device.memory_usage += sizeof(float) * scratch_space; - float* unit = 0; - cudaMallocHost(&unit, sizeof(float) * unit_size); - for (i = 0; i < unit_size; i++) - unit[i] = 1; - GPU(convnet)->unit = 0; - cudaMalloc(&GPU(convnet)->unit, sizeof(float) * unit_size); - GPU(convnet)->device.memory_usage += sizeof(float) * unit_size; - cudaMemcpy(GPU(convnet)->unit, unit, sizeof(float) * unit_size, cudaMemcpyHostToDevice); - cudaFreeHost(unit); - GPU(convnet)->configurations = GPU(convnet)->layers + convnet->count; - memcpy(GPU(convnet)->configurations, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); - GPU(convnet)->momentums = GPU(convnet)->layers + convnet->count * 2; - memcpy(GPU(convnet)->momentums, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); - GPU(convnet)->forwards = (float**)(GPU(convnet)->layers + convnet->count * 3); - GPU(convnet)->backwards = (float**)(GPU(convnet)->layers + convnet->count * 3) + convnet->count; - GPU(convnet)->denoms = (float**)(GPU(convnet)->layers + convnet->count * 3) + convnet->count * 2; - for (i = 0; i < 2; i++) - { - cwc_convnet_context_t* context = GPU(convnet)->contexts + i; - context->host.dor = (float**)(GPU(convnet)->layers + convnet->count * 3) + convnet->count * 3 + convnet->count * i; - context->device.dor = (float**)(GPU(convnet)->layers + convnet->count * 3) + convnet->count * 5 + convnet->count * i; - context->host.input = 0; - cudaMallocHost(&context->host.input, sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch); - assert(context->host.input); - context->device.input = 0; - cudaMalloc(&context->device.input, sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * convnet->rows * convnet->cols * convnet->channels * batch; - assert(context->device.input); - context->host.c = 0; - cudaMallocHost(&context->host.c, 
sizeof(int) * batch); - assert(context->host.c); - context->device.c = 0; - cudaMalloc(&context->device.c, sizeof(int) * batch); - GPU(convnet)->device.memory_usage += sizeof(int) * batch; - cudaStreamCreate(&context->device.stream); - cublasCreate(&context->device.cublas); - cublasSetStream(context->device.cublas, context->device.stream); - } + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; for (i = 0; i < convnet->count; i++) switch (layers[i].type) { case CCV_CONVNET_CONVOLUTIONAL: - assert(GPU(convnet)->configurations[i].type == CCV_CONVNET_CONVOLUTIONAL); - assert(GPU(convnet)->momentums[i].type == CCV_CONVNET_CONVOLUTIONAL); // allocating for layer layers[i].w = 0; cudaMalloc(&layers[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); assert(layers[i].w); layers[i].bias = layers[i].w + layers[i].wnum; - _cwc_convnet_reorder_convolutional_weights_onto_device(convnet->layers[i].w, layers[i].w, layers[i].wnum, layers[i].net.convolutional.count, layers[i].net.convolutional.channels); + _cwc_convnet_reorder_convolutional_weights_onto_device(convnet->layers[i].w, layers[i].w, layers[i].wnum, layers[i].net.convolutional.count, layers[i].net.convolutional.channels, layers[i].input.matrix.partition); cudaMemcpy(layers[i].bias, convnet->layers[i].bias, sizeof(float) * layers[i].net.convolutional.count, cudaMemcpyHostToDevice); - _cwc_convnet_layer_deduce_output_format(layers + i, &out_rows, &out_cols); - // allocating for configurations - GPU(convnet)->configurations[i].w = 0; - cudaMalloc(&GPU(convnet)->configurations[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); - assert(GPU(convnet)->configurations[i].w); - GPU(convnet)->configurations[i].bias = GPU(convnet)->configurations[i].w + layers[i].wnum; - // allocating for momentums - GPU(convnet)->momentums[i].w = 0; - cudaMalloc(&GPU(convnet)->momentums[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); - assert(GPU(convnet)->momentums[i].w); - GPU(convnet)->momentums[i].bias = GPU(convnet)->momentums[i].w + layers[i].wnum; - GPU(convnet)->denoms[i] = 0; - GPU(convnet)->forwards[i] = 0; - cudaMalloc(&GPU(convnet)->forwards[i], sizeof(float) * out_rows * out_cols * layers[i].net.convolutional.count * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].net.convolutional.count * batch; - assert(GPU(convnet)->forwards[i]); - GPU(convnet)->backwards[i] = 0; - if (i > 0) // if it is the input layer, no need to backprop to outmost one - { - cudaMalloc(&GPU(convnet)->backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->backwards[i]); - } - for (j = 0; j < 2; j++) - { - cwc_convnet_context_t* context = GPU(convnet)->contexts + j; - context->host.dor[i] = 0; - context->device.dor[i] = 0; - if (layer_params && 
layer_params[i].dor > 0) - { - assert(i > 0); - cudaMallocHost(&context->host.dor[i], sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count); - assert(context->host.dor[i]); - cudaMalloc(&context->device.dor[i], sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count); - GPU(convnet)->device.memory_usage += sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count; - assert(context->device.dor[i]); - } - } break; case CCV_CONVNET_FULL_CONNECT: assert(i > 0); - assert(GPU(convnet)->configurations[i].type == CCV_CONVNET_FULL_CONNECT); - assert(GPU(convnet)->momentums[i].type == CCV_CONVNET_FULL_CONNECT); // allocating for layer layers[i].w = 0; cudaMalloc(&layers[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); assert(layers[i].w); layers[i].bias = layers[i].w + layers[i].wnum; _cwc_convnet_reorder_full_connect_weights_onto_device(convnet->layers[i].w, layers[i].w, layers[i].wnum, layers[i].input.matrix.rows * layers[i].input.matrix.cols, layers[i].input.matrix.channels); cudaMemcpy(layers[i].bias, convnet->layers[i].bias, sizeof(float) * layers[i].net.full_connect.count, cudaMemcpyHostToDevice); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert(i > 0); + layers[i].w = layers[i].bias = 0; + break; + } +} + +static void _cwc_convnet_alloc_configurations(ccv_convnet_t* convnet, int device_id) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + ccv_convnet_layer_t* configurations = GPU(convnet)->device[device_id].configurations; + for (i = 0; i < convnet->count; i++) + switch (layers[i].type) + { + case CCV_CONVNET_CONVOLUTIONAL: + assert(configurations[i].type == CCV_CONVNET_CONVOLUTIONAL); + // allocating for configurations + configurations[i].w = 0; + cudaMalloc(&configurations[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count)); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); + assert(configurations[i].w); + configurations[i].bias = configurations[i].w + layers[i].wnum; + break; + case CCV_CONVNET_FULL_CONNECT: + assert(i > 0); + assert(configurations[i].type == CCV_CONVNET_FULL_CONNECT); // allocating for configurations - GPU(convnet)->configurations[i].w = 0; - cudaMalloc(&GPU(convnet)->configurations[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); - assert(GPU(convnet)->configurations[i].w); - GPU(convnet)->configurations[i].bias = GPU(convnet)->configurations[i].w + layers[i].wnum; + configurations[i].w = 0; + cudaMalloc(&configurations[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count)); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); + assert(configurations[i].w); + configurations[i].bias = configurations[i].w + layers[i].wnum; + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert(i > 0); + assert(configurations[i].type == layers[i].type); + configurations[i].w = configurations[i].bias = 0; + break; + } +} + 
+static void _cwc_convnet_alloc_momentums(ccv_convnet_t* convnet, int device_id) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + ccv_convnet_layer_t* momentums = GPU(convnet)->device[device_id].momentums; + for (i = 0; i < convnet->count; i++) + switch (layers[i].type) + { + case CCV_CONVNET_CONVOLUTIONAL: + assert(momentums[i].type == CCV_CONVNET_CONVOLUTIONAL); + // allocating for momentums + momentums[i].w = 0; + cudaMalloc(&momentums[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count)); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.convolutional.count); + assert(momentums[i].w); + momentums[i].bias = momentums[i].w + layers[i].wnum; + break; + case CCV_CONVNET_FULL_CONNECT: + assert(i > 0); + assert(momentums[i].type == CCV_CONVNET_FULL_CONNECT); // allocating for momentums - GPU(convnet)->momentums[i].w = 0; - cudaMalloc(&GPU(convnet)->momentums[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count)); - GPU(convnet)->device.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); - assert(GPU(convnet)->momentums[i].w); - GPU(convnet)->momentums[i].bias = GPU(convnet)->momentums[i].w + layers[i].wnum; - GPU(convnet)->denoms[i] = 0; - GPU(convnet)->forwards[i] = 0; - cudaMalloc(&GPU(convnet)->forwards[i], sizeof(float) * layers[i].net.full_connect.count * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * layers[i].net.full_connect.count * batch; - assert(GPU(convnet)->forwards[i]); - GPU(convnet)->backwards[i] = 0; - cudaMalloc(&GPU(convnet)->backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->backwards[i]); + momentums[i].w = 0; + cudaMalloc(&momentums[i].w, sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count)); + GPU(convnet)->stats.memory_usage += sizeof(float) * (layers[i].wnum + layers[i].net.full_connect.count); + assert(momentums[i].w); + momentums[i].bias = momentums[i].w + layers[i].wnum; + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert(i > 0); + assert(momentums[i].type == layers[i].type); + momentums[i].w = momentums[i].bias = 0; + break; + } +} + +static void _cwc_convnet_alloc_forwards(ccv_convnet_t* convnet, int device_id, int start, int length, int rows, int cols, int batch) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + assert(start >= 0 && start + length <= convnet->count); + int out_rows, out_cols, out_partition; + for (i = start; i < start + length; i++) + { + _ccv_convnet_layer_derive_output(layers + i, rows, cols, &out_rows, &out_cols, &out_partition); + switch (layers[i].type) + { + case CCV_CONVNET_CONVOLUTIONAL: + GPU(convnet)->device[device_id].forwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].forwards[i], sizeof(float) * out_rows * out_cols * layers[i].net.convolutional.count * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].net.convolutional.count * batch; + assert(GPU(convnet)->device[device_id].forwards[i]); + break; + case CCV_CONVNET_FULL_CONNECT: + assert(i > 0); + GPU(convnet)->device[device_id].forwards[i] = 0; + 
cudaMalloc(&GPU(convnet)->device[device_id].forwards[i], sizeof(float) * layers[i].net.full_connect.count * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layers[i].net.full_connect.count * batch; + assert(GPU(convnet)->device[device_id].forwards[i]); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + assert(i > 0); + GPU(convnet)->device[device_id].forwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].forwards[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].forwards[i]); + break; + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert(i > 0); + GPU(convnet)->device[device_id].forwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].forwards[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].forwards[i]); + break; + } + rows = out_rows, cols = out_cols; + } +} + +static void _cwc_convnet_alloc_denoms(ccv_convnet_t* convnet, int device_id, int start, int length, int rows, int cols, int batch) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + assert(start >= 0 && start + length <= convnet->count); + int out_rows, out_cols, out_partition; + for (i = start; i < start + length; i++) + { + _ccv_convnet_layer_derive_output(layers + i, rows, cols, &out_rows, &out_cols, &out_partition); + switch (layers[i].type) + { + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + GPU(convnet)->device[device_id].denoms[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].denoms[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].denoms[i]); + break; + case CCV_CONVNET_CONVOLUTIONAL: + case CCV_CONVNET_FULL_CONNECT: + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + GPU(convnet)->device[device_id].denoms[i] = 0; + break; + } + rows = out_rows, cols = out_cols; + } +} + +static void _cwc_convnet_alloc_backwards(ccv_convnet_t* convnet, int device_id, int batch) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + for (i = 0; i < convnet->count; i++) + switch (layers[i].type) + { + case CCV_CONVNET_CONVOLUTIONAL: + GPU(convnet)->device[device_id].backwards[i] = 0; + if (i > 0) // if it is the input layer, no need to backprop to outmost one + { + cudaMalloc(&GPU(convnet)->device[device_id].backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].backwards[i]); + } + break; + case CCV_CONVNET_FULL_CONNECT: + assert(i > 0); + GPU(convnet)->device[device_id].backwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * 
layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].backwards[i]); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + GPU(convnet)->device[device_id].backwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].backwards[i]); + break; + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert(i > 0); + GPU(convnet)->device[device_id].backwards[i] = 0; + cudaMalloc(&GPU(convnet)->device[device_id].backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].backwards[i]); + break; + } +} + +static void _cwc_convnet_alloc_dor(ccv_convnet_t* convnet, int device_id, int batch, ccv_convnet_layer_train_param_t* layer_params) +{ + int i, j; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + int rows = convnet->rows; + int cols = convnet->cols; + int out_rows, out_cols, out_partition; + for (i = 0; i < convnet->count; i++) + { + _ccv_convnet_layer_derive_output(layers + i, rows, cols, &out_rows, &out_cols, &out_partition); + switch (layers[i].type) + { + case CCV_CONVNET_CONVOLUTIONAL: for (j = 0; j < 2; j++) { cwc_convnet_context_t* context = GPU(convnet)->contexts + j; - context->host.dor[i] = 0; - context->device.dor[i] = 0; + context->host[device_id].dor[i] = 0; + context->device[device_id].dor[i] = 0; if (layer_params && layer_params[i].dor > 0) { - cudaMallocHost(&context->host.dor[i], sizeof(float) * batch * layers[i].net.full_connect.count); - assert(context->host.dor[i]); - cudaMalloc(&context->device.dor[i], sizeof(float) * batch * layers[i].net.full_connect.count); - GPU(convnet)->device.memory_usage += sizeof(float) * batch * layers[i].net.full_connect.count; - assert(context->device.dor[i]); + assert(i > 0); + cudaMallocHost(&context->host[device_id].dor[i], sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count); + assert(context->host[device_id].dor[i]); + cudaMalloc(&context->device[device_id].dor[i], sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count); + GPU(convnet)->stats.memory_usage += sizeof(float) * batch * out_rows * out_cols * layers[i].net.convolutional.count; + assert(context->device[device_id].dor[i]); } } break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - assert(i > 0); - assert(GPU(convnet)->configurations[i].type == CCV_CONVNET_LOCAL_RESPONSE_NORM); - assert(GPU(convnet)->momentums[i].type == CCV_CONVNET_LOCAL_RESPONSE_NORM); - GPU(convnet)->configurations[i].w = GPU(convnet)->configurations[i].bias = 0; - assert(GPU(convnet)->momentums[i].type == layers[i].type); - GPU(convnet)->momentums[i].w = GPU(convnet)->momentums[i].bias = 0; - GPU(convnet)->denoms[i] = 0; - cudaMalloc(&GPU(convnet)->denoms[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->denoms[i]); - GPU(convnet)->forwards[i] = 0; - 
cudaMalloc(&GPU(convnet)->forwards[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->forwards[i]); - GPU(convnet)->backwards[i] = 0; - cudaMalloc(&GPU(convnet)->backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->backwards[i]); + case CCV_CONVNET_FULL_CONNECT: for (j = 0; j < 2; j++) { cwc_convnet_context_t* context = GPU(convnet)->contexts + j; - context->host.dor[i] = 0; - context->device.dor[i] = 0; + context->host[device_id].dor[i] = 0; + context->device[device_id].dor[i] = 0; + if (layer_params && layer_params[i].dor > 0) + { + cudaMallocHost(&context->host[device_id].dor[i], sizeof(float) * batch * layers[i].net.full_connect.count); + assert(context->host[device_id].dor[i]); + cudaMalloc(&context->device[device_id].dor[i], sizeof(float) * batch * layers[i].net.full_connect.count); + GPU(convnet)->stats.memory_usage += sizeof(float) * batch * layers[i].net.full_connect.count; + assert(context->device[device_id].dor[i]); + } } - layers[i].w = layers[i].bias = 0; break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: case CCV_CONVNET_MAX_POOL: case CCV_CONVNET_AVERAGE_POOL: - assert(i > 0); - _cwc_convnet_layer_deduce_output_format(layers + i, &out_rows, &out_cols); - assert(GPU(convnet)->configurations[i].type == layers[i].type); - GPU(convnet)->configurations[i].w = GPU(convnet)->configurations[i].bias = 0; - assert(GPU(convnet)->momentums[i].type == layers[i].type); - GPU(convnet)->momentums[i].w = GPU(convnet)->momentums[i].bias = 0; - GPU(convnet)->denoms[i] = 0; - GPU(convnet)->forwards[i] = 0; - cudaMalloc(&GPU(convnet)->forwards[i], sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * out_rows * out_cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->forwards[i]); - GPU(convnet)->backwards[i] = 0; - cudaMalloc(&GPU(convnet)->backwards[i], sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch); - GPU(convnet)->device.memory_usage += sizeof(float) * layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch; - assert(GPU(convnet)->backwards[i]); for (j = 0; j < 2; j++) { cwc_convnet_context_t* context = GPU(convnet)->contexts + j; - context->host.dor[i] = 0; - context->device.dor[i] = 0; + context->host[device_id].dor[i] = 0; + context->device[device_id].dor[i] = 0; } - layers[i].w = layers[i].bias = 0; break; } + rows = out_rows, cols = out_cols; + } +} + +static void _cwc_convnet_alloc_input(ccv_convnet_t* convnet, int device_id, int context_id, int rows, int cols, int batch) +{ + cwc_convnet_context_t* context = GPU(convnet)->contexts + context_id; + context->host[device_id].input = 0; + cudaMallocHost(&context->host[device_id].input, sizeof(float) * rows * cols * convnet->channels * batch); + assert(context->host[device_id].input); + context->device[device_id].input = 0; + cudaMalloc(&context->device[device_id].input, sizeof(float) * rows * cols * convnet->channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * rows * cols * convnet->channels 
* batch; + assert(context->device[device_id].input); +} + +static void _cwc_convnet_alloc_c(ccv_convnet_t* convnet, int device_id, int context_id, int batch) +{ + cwc_convnet_context_t* context = GPU(convnet)->contexts + context_id; + context->host[device_id].c = 0; + cudaMallocHost(&context->host[device_id].c, sizeof(int) * batch); + assert(context->host[device_id].c); + context->device[device_id].c = 0; + cudaMalloc(&context->device[device_id].c, sizeof(int) * batch); + GPU(convnet)->stats.memory_usage += sizeof(int) * batch; +} + +static void _cwc_convnet_alloc_out(ccv_convnet_t* convnet, int device_id, int context_id, int batch) +{ + cwc_convnet_context_t* context = GPU(convnet)->contexts + context_id; + context->host[device_id].out = 0; + cudaMallocHost(&context->host[device_id].out, sizeof(float) * batch); + assert(context->host[device_id].out); + context->device[device_id].out = 0; + cudaMalloc(&context->device[device_id].out, sizeof(float) * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * batch; +} + +static void _cwc_convnet_alloc_context(ccv_convnet_t* convnet, int device_id, int context_id, int dual_device) +{ + cwc_convnet_context_t* context = GPU(convnet)->contexts + context_id; + cudaStreamCreate(&context->device[device_id].data_stream); + cublasCreate(&context->device[device_id].data_cublas); + cublasSetStream(context->device[device_id].data_cublas, context->device[device_id].data_stream); + int i; + if (dual_device) + { + // only allocate model parallelism stream / cublas handle / joint events when dual device mode is on + for (i = 0; i < 2; i++) + { + cudaStreamCreate(&context->device[device_id].model_stream[i]); + cublasCreate(&context->device[device_id].model_cublas[i]); + cublasSetStream(context->device[device_id].model_cublas[i], context->device[device_id].model_stream[i]); + } + for (i = 0; i < 6; i++) + cudaEventCreateWithFlags(&context->device[device_id].joint[i], cudaEventDisableTiming); + } else { + for (i = 0; i < 2; i++) + { + context->device[device_id].model_stream[i] = 0; + context->device[device_id].model_cublas[i] = 0; + } + for (i = 0; i < 6; i++) + context->device[device_id].joint[i] = 0; + } +} + +static void _cwc_convnet_alloc_scratch(ccv_convnet_t* convnet, int device_id, int batch) +{ + int i; + int out_rows, out_cols, out_partition; + size_t scratch_space = 0; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + for (i = 0; i < convnet->count; i++) + if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) + { + int use_rows = _cwc_convnet_layer_use_rows(layers + i); + _ccv_convnet_layer_derive_output(layers + i, layers[i].input.matrix.rows, layers[i].input.matrix.cols, &out_rows, &out_cols, &out_partition); + scratch_space = ccv_max(scratch_space, layers[i].wnum); + scratch_space = ccv_max(scratch_space, + out_rows * out_cols * layers[i].net.convolutional.count * batch + // output layer reorder + layers[i].input.matrix.rows * layers[i].input.matrix.cols * layers[i].input.matrix.channels * batch + // input layer reorder + layers[i].wnum * (use_rows ? 
out_rows : 1) * (batch / BATCH_PER_BLOCK)); // unconsolidated weights output + } + GPU(convnet)->device[device_id].scratch = 0; + cudaMalloc(&GPU(convnet)->device[device_id].scratch, sizeof(float) * scratch_space); + assert(GPU(convnet)->device[device_id].scratch); + GPU(convnet)->stats.memory_usage += sizeof(float) * scratch_space; +} + +static void _cwc_convnet_make_unit(ccv_convnet_t* convnet, int device_id, int batch) +{ + int i; + int out_rows, out_cols, out_partition; + size_t unit_size = batch; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + for (i = 0; i < convnet->count; i++) + if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) + { + _ccv_convnet_layer_derive_output(layers + i, layers[i].input.matrix.rows, layers[i].input.matrix.cols, &out_rows, &out_cols, &out_partition); + if (_cwc_convnet_layer_use_rows(layers + i)) + unit_size = ccv_max(unit_size, out_rows * (batch / BATCH_PER_BLOCK)); + unit_size = ccv_max(unit_size, out_rows * out_cols * batch); + } + float* unit = 0; + cudaMallocHost(&unit, sizeof(float) * unit_size); + for (i = 0; i < unit_size; i++) + unit[i] = 1; + GPU(convnet)->device[device_id].unit = 0; + cudaMalloc(&GPU(convnet)->device[device_id].unit, sizeof(float) * unit_size); + GPU(convnet)->stats.memory_usage += sizeof(float) * unit_size; + cudaMemcpy(GPU(convnet)->device[device_id].unit, unit, sizeof(float) * unit_size, cudaMemcpyHostToDevice); + cudaFreeHost(unit); +} + +static void _cwc_convnet_alloc_scans(ccv_convnet_t* convnet, int device_id, int offset, int batch) +{ + int i; + for (i = 0; i < convnet->count; i++) + { + GPU(convnet)->device[device_id].scans[i] = 0; + if (i == offset) + { + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + offset + 1; + cudaMalloc(&GPU(convnet)->device[device_id].scans[i], sizeof(float) * layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels * batch); + GPU(convnet)->stats.memory_usage += sizeof(float) * layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels * batch; + assert(GPU(convnet)->device[device_id].scans[i]); + } + } +} + +// find the layer for scanning (it is the last convolutional layer) +static int _cwc_convnet_find_scan(ccv_convnet_t* convnet, int device_id) +{ + int i; + ccv_convnet_layer_t* layers = GPU(convnet)->device[device_id].layers; + for (i = convnet->count - 1; i >= 0; i--) + if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) + return i; + return -1; +} + +// allocate reserved for only forward path, this is only interesting to single-device mode +static void _cwc_convnet_alloc_reserved_for_classify(ccv_convnet_t* convnet, int tops, int batch) +{ + if (GPU(convnet) && (GPU(convnet)->batch != batch || GPU(convnet)->tops != tops || GPU(convnet)->layer_params != 0)) + ccv_convnet_compact(convnet); + else if (GPU(convnet)) + return; // it is allocated properly, no-op + convnet->reserved = (cwc_convnet_t*)ccmalloc(sizeof(cwc_convnet_t) + sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count + sizeof(float*) * convnet->count * 3); + GPU(convnet)->batch = batch; + GPU(convnet)->tops = tops; + GPU(convnet)->dual_device = 0; + GPU(convnet)->layer_params = 0; + GPU(convnet)->stats.memory_usage = 0; + cwc_convnet_layer_vary_t* layer_vary = (cwc_convnet_layer_vary_t*)(GPU(convnet) + 1); + memset(layer_vary, 0, sizeof(cwc_convnet_layer_vary_t) * convnet->count); + GPU(convnet)->device[0].layers = (ccv_convnet_layer_t*)(layer_vary + convnet->count); + 
memcpy(GPU(convnet)->device[0].layers, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); + ccv_convnet_layer_t* layers = GPU(convnet)->device[0].layers; + // point reserved place to layer_vary + int i; + for (i = 0; i < convnet->count; i++) + if (layers[i].type == CCV_CONVNET_CONVOLUTIONAL) + layers[i].reserved = layer_vary + i; + // alloc and copy layers + _cwc_convnet_alloc_layers(convnet, 0); + GPU(convnet)->device[0].configurations = 0; + GPU(convnet)->device[0].momentums = 0; + GPU(convnet)->device[0].scratch = 0; + _cwc_convnet_make_unit(convnet, 0, batch * 30); + int scan = _cwc_convnet_find_scan(convnet, 0); + GPU(convnet)->device[0].forwards = (float**)(GPU(convnet)->device[0].layers + convnet->count); + // alloc forwards until the scan layer (for initial 6 patches) + _cwc_convnet_alloc_forwards(convnet, 0, 0, scan + 1, convnet->input.height, convnet->input.width, batch * 6); + // alloc forwards from scan layer (for scanned 30 patches) + _cwc_convnet_alloc_forwards(convnet, 0, scan + 1, convnet->count - scan - 1, GPU(convnet)->device[0].layers[scan + 1].input.matrix.rows, GPU(convnet)->device[0].layers[scan + 1].input.matrix.cols, batch * 30); + GPU(convnet)->device[0].denoms = (float**)(GPU(convnet)->device[0].layers + convnet->count) + convnet->count; + // alloc until the scan layer + _cwc_convnet_alloc_denoms(convnet, 0, 0, scan + 1, convnet->input.height, convnet->input.width, batch * 6); + // alloc denoms from scan layer to the end + _cwc_convnet_alloc_denoms(convnet, 0, scan + 1, convnet->count - scan - 1, GPU(convnet)->device[0].layers[scan + 1].input.matrix.rows, GPU(convnet)->device[0].layers[scan + 1].input.matrix.cols, batch * 30); + // alloc scan layer + GPU(convnet)->device[0].scans = (float**)(GPU(convnet)->device[0].layers + convnet->count) + convnet->count * 2; + _cwc_convnet_alloc_scans(convnet, 0, scan, batch * 30); + GPU(convnet)->device[0].backwards = 0; + GPU(convnet)->contexts[0].host[0].dor = GPU(convnet)->contexts[0].device[0].dor = 0; + _cwc_convnet_alloc_input(convnet, 0, 0, convnet->input.height, convnet->input.width, batch * 6); + _cwc_convnet_alloc_c(convnet, 0, 0, batch * tops); + _cwc_convnet_alloc_out(convnet, 0, 0, batch * tops); + _cwc_convnet_alloc_context(convnet, 0, 0, 0); + GPU(convnet)->contexts[1].host[0].dor = GPU(convnet)->contexts[1].device[0].dor = 0; + GPU(convnet)->contexts[1].host[0].input = GPU(convnet)->contexts[1].device[0].input = 0; + GPU(convnet)->contexts[1].host[0].c = GPU(convnet)->contexts[1].device[0].c = 0; + GPU(convnet)->contexts[1].host[0].out = GPU(convnet)->contexts[1].device[0].out = 0; + GPU(convnet)->contexts[1].device[0].data_stream = 0; + GPU(convnet)->contexts[1].device[0].data_cublas = 0; + for (i = 0; i < 2; i++) + { + GPU(convnet)->contexts[1].device[0].model_stream[i] = 0; + GPU(convnet)->contexts[1].device[0].model_cublas[i] = 0; + } + for (i = 0; i < 6; i++) + GPU(convnet)->contexts[1].device[0].joint[i] = 0; +} + +// allocate reserved for both forward and backward path +static void _cwc_convnet_alloc_reserved_both(ccv_convnet_t* convnet, int batch, int dual_device, ccv_convnet_layer_train_param_t* layer_params) +{ + if (GPU(convnet) && (GPU(convnet)->batch != batch || GPU(convnet)->tops != 0 || GPU(convnet)->dual_device != dual_device || GPU(convnet)->layer_params != layer_params)) + ccv_convnet_compact(convnet); + else if (GPU(convnet)) + return; // it is allocated properly, no-op + assert(dual_device == !!dual_device); + uint8_t* reserved = (uint8_t*)ccmalloc(sizeof(cwc_convnet_t) + 
(sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10) * (dual_device + 1)); + convnet->reserved = (cwc_convnet_t*)reserved; + GPU(convnet)->batch = batch; + GPU(convnet)->tops = 0; + GPU(convnet)->dual_device = dual_device; + GPU(convnet)->layer_params = layer_params; + GPU(convnet)->stats.memory_usage = 0; + int i, j; + for (i = 0; i < dual_device + 1; i++) + { + GPU(convnet)->device[i].scans = 0; + cwc_convnet_layer_vary_t* layer_vary = (cwc_convnet_layer_vary_t*)(reserved + sizeof(cwc_convnet_t) + (sizeof(cwc_convnet_layer_vary_t) * convnet->count + sizeof(ccv_convnet_layer_t) * convnet->count * 3 + sizeof(float*) * convnet->count * 10) * i); + memset(layer_vary, 0, sizeof(cwc_convnet_layer_vary_t) * convnet->count); + GPU(convnet)->device[i].layers = (ccv_convnet_layer_t*)(layer_vary + convnet->count); + memcpy(GPU(convnet)->device[i].layers, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); + ccv_convnet_layer_t* layers = GPU(convnet)->device[i].layers; + // point reserved place to layer_vary + for (j = 0; j < convnet->count; j++) + if (layers[j].type == CCV_CONVNET_CONVOLUTIONAL) + layers[j].reserved = layer_vary + j; + // alloc and copy layers + _cwc_convnet_alloc_layers(convnet, i); + // alloc scratch space (for backprop on convolutional layer) + _cwc_convnet_alloc_scratch(convnet, i, batch); + // alloc and make unit vector + _cwc_convnet_make_unit(convnet, i, batch); + // hook up configurations (the backprop coefficients), and alloc & copy configurations + GPU(convnet)->device[i].configurations = GPU(convnet)->device[i].layers + convnet->count; + memcpy(GPU(convnet)->device[i].configurations, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); + _cwc_convnet_alloc_configurations(convnet, i); + // hook up momentums, and alloc & copy momentums + GPU(convnet)->device[i].momentums = GPU(convnet)->device[i].layers + convnet->count * 2; + memcpy(GPU(convnet)->device[i].momentums, convnet->layers, sizeof(ccv_convnet_layer_t) * convnet->count); + _cwc_convnet_alloc_momentums(convnet, i); + // hook up forwards and alloc forwards + GPU(convnet)->device[i].forwards = (float**)(GPU(convnet)->device[i].layers + convnet->count * 3); + _cwc_convnet_alloc_forwards(convnet, i, 0, convnet->count, convnet->rows, convnet->cols, batch); + // hook up denoms and alloc denoms + GPU(convnet)->device[i].denoms = (float**)(GPU(convnet)->device[i].layers + convnet->count * 3) + convnet->count * 2; + _cwc_convnet_alloc_denoms(convnet, i, 0, convnet->count, convnet->rows, convnet->cols, batch); + // hook up backwards and alloc backwards + GPU(convnet)->device[i].backwards = (float**)(GPU(convnet)->device[i].layers + convnet->count * 3) + convnet->count; + // hook up dor and alloc dor + _cwc_convnet_alloc_backwards(convnet, i, batch); + for (j = 0; j < 2; j++) + { + cwc_convnet_context_t* context = GPU(convnet)->contexts + j; + context->host[i].dor = (float**)(GPU(convnet)->device[i].layers + convnet->count * 3) + convnet->count * 3 + convnet->count * j; + context->device[i].dor = (float**)(GPU(convnet)->device[i].layers + convnet->count * 3) + convnet->count * 5 + convnet->count * j; + } + _cwc_convnet_alloc_dor(convnet, i, batch, layer_params); + // alloc contexts + for (j = 0; j < 2; j++) + { + _cwc_convnet_alloc_input(convnet, i, j, convnet->rows, convnet->cols, batch); + _cwc_convnet_alloc_c(convnet, i, j, batch); + GPU(convnet)->contexts[j].host[i].out = 0; + 
GPU(convnet)->contexts[j].device[i].out = 0; + _cwc_convnet_alloc_context(convnet, i, j, dual_device); + } + } }
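An aside on the allocation pattern used by both reserved allocators above: a single ccmalloc call backs the cwc_convnet_t header, the per-layer vary structs, the layer copies, and the float* tables, so one free releases everything and the offsets stay fixed. A minimal standalone sketch of that carving pattern, with hypothetical types rather than ccv's own:

#include <stdlib.h>
#include <string.h>

typedef struct { int batch; } header_t;    /* stand-in for cwc_convnet_t */
typedef struct { int x, y, z; } vary_t;    /* stand-in for cwc_convnet_layer_vary_t */
typedef struct { int type; } layer_t;      /* stand-in for ccv_convnet_layer_t */

static header_t* alloc_reserved(int count)
{
	/* one block: header, then vary[count], then layers[count], then 3 pointer tables */
	header_t* h = (header_t*)malloc(sizeof(header_t) + sizeof(vary_t) * count + sizeof(layer_t) * count + sizeof(float*) * count * 3);
	vary_t* vary = (vary_t*)(h + 1);              /* region right past the header */
	memset(vary, 0, sizeof(vary_t) * count);
	layer_t* layers = (layer_t*)(vary + count);   /* layer copies follow the vary array */
	float** forwards = (float**)(layers + count); /* pointer tables come last */
	float** backwards = forwards + count;
	float** denoms = backwards + count;
	(void)denoms;
	return h; /* free(h) releases every region at once */
}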
// =========================================== KERNEL CODE =================================================== template <int input_per_thread, int filter_per_thread, int filter_per_block> __global__ static void _cwc_kern_convolutional_forward_propagate(const int strides, const int border, const int batch, - float* input, const int rows, const int cols, const int channels, + float* input, const int rows, const int cols, const int channels_per_partition, const int partition, float* out, const int out_rows, const int out_cols, float* filter, const int filter_rows, const int filter_cols, const int count, float* const biases) { - assert(gridDim.x * filter_per_block == out_rows * count); - assert(gridDim.y == out_cols); + assert(gridDim.x * partition * filter_per_block == out_cols * count); + assert(gridDim.y == out_rows); + assert(gridDim.z == partition); extern __shared__ float shared[]; float* shared_block = &shared[0]; float* shared_weights = &shared[batch]; @@ -391,8 +706,8 @@ __global__ static void _cwc_kern_convolutional_forward_propagate(const int strid prod[i][j] = 0; const int origin_x = blockIdx.x % out_cols; const int origin_y = blockIdx.y; - const int filter_group_idx = blockIdx.x / out_cols; - input += (origin_y * strides * cols + origin_x * strides) * batch; + const int filter_group_idx = blockIdx.z * count / (filter_per_block * partition) + blockIdx.x / out_cols; // for the partitioned filter group + input += (blockIdx.z * channels_per_partition * rows * cols + origin_y * strides * cols + origin_x * strides) * batch; assert(thcnt >= batch); assert(thcnt >= filter_per_block); if (thidx < filter_per_block) @@ -402,7 +717,7 @@ __global__ static void _cwc_kern_convolutional_forward_propagate(const int strid const int start_y = max(origin_y * strides - border, 0) - (origin_y * strides - border); const int end_y = min(origin_y * strides - border + filter_rows, rows) - (origin_y * strides - border); filter += filter_group_idx * filter_per_block; - for (c = 0; c < channels; c++) + for (c = 0; c < channels_per_partition; c++) { for (y = start_y; y < end_y; y++) for (x = start_x; x < end_x; x++) @@ -423,40 +738,41 @@ __global__ static void _cwc_kern_convolutional_forward_propagate(const int strid filter += filter_rows * filter_cols * count; } const int outcnt = out_rows * out_cols * batch; - out += filter_group_idx * filter_per_block * outcnt + (origin_y * out_cols + origin_x) * batch; + out += (filter_group_idx * filter_per_block + threadIdx.y * filter_per_thread) * outcnt + (origin_y * out_cols + origin_x) * batch + threadIdx.x * input_per_thread; #pragma unroll for (i = 0; i < filter_per_thread; i++) { const float bias = shared_bias[i + threadIdx.y * filter_per_thread]; #pragma unroll for (j = 0; j < input_per_thread; j++) - out[(i + threadIdx.y * filter_per_thread) * outcnt + j + threadIdx.x * input_per_thread] = max(0.0, prod[i][j] + bias); + out[j] = max(0.0, prod[i][j] + bias); + out += outcnt; } } -static int _cwc_convnet_convolutional_forward_propagate_vary(ccv_convnet_layer_t* layer, int batch, float* a, float* b, const cudaStream_t& stream, +static int _cwc_convnet_convolutional_forward_propagate_vary(ccv_convnet_layer_t* layer, int rows, int cols, int batch, float* a, float* b, const cudaStream_t& stream, int x, int y, int z) // these are the dynamic configurations { + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); // first do configuration validation - if (!(batch % x == 0 && z % y == 0 && layer->net.convolutional.count % z == 0 && + if (!(batch % x == 0 && z % y == 0 && layer->net.convolutional.count % (z * out_partition) == 0 && batch / x * z / y <= 1024 && /* thread number constraint */ batch / x * z / y >= batch && batch / x * z / y >= z && /* kernel internal loading constraint */ - sizeof(float) * ((batch + z * 2) + layer->net.convolutional.count) <= 48 * 1024 /* shared memory size constraint */)) + sizeof(float) * (batch + z * 2) <= 48 * 1024 /* shared memory size constraint */)) return -1; - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); assert(b); #define vary_block(_x, _y, _z) do { \ dim3 threads_per_block(batch / _x, _z / _y); \ assert(threads_per_block.x * threads_per_block.y <= 1024); \ - dim3 num_blocks(out_cols * layer->net.convolutional.count / _z, out_rows); \ + dim3 num_blocks(out_cols * layer->net.convolutional.count / (_z * out_partition), out_rows, out_partition); \ int shared_memory_size = sizeof(float) * (batch + _z * 2); \ cudaFuncSetCacheConfig(_cwc_kern_convolutional_forward_propagate<_x, _y, _z>, cudaFuncCachePreferShared); \ _cwc_kern_convolutional_forward_propagate \ <_x, _y, _z> \ - <<<num_blocks, threads_per_block, shared_memory_size + sizeof(float) * layer->net.convolutional.count, stream>>> \ + <<<num_blocks, threads_per_block, shared_memory_size, stream>>> \ (layer->net.convolutional.strides, layer->net.convolutional.border, batch, \ - a, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, \ + a, rows, cols, layer->input.matrix.channels / out_partition, out_partition, \ b, out_rows, out_cols, \ layer->w, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count, \ layer->bias); \ @@ -467,59 +783,52 @@ static int _cwc_convnet_convolutional_forward_propagate_vary(ccv_convnet_layer_t return 0; } -static void _cwc_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, const cudaStream_t& stream) +static void _cwc_convnet_convolutional_forward_propagate(ccv_convnet_layer_t* layer, int rows, int cols, int batch, float* a, float* b, const cudaStream_t& stream) { static int vary_x[] = { 1, 2, 4, 8 }; static int vary_y[] = { 1, 2, 4, 6, 8 }; static int vary_z[] = { 16, 24, 32, 36, 64, 72 }; - CWC_IMPLEMENT_VARY_STUB(VARY(layer)->forward_propagate, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_forward_propagate_vary, layer, batch, a, b, stream); + CWC_IMPLEMENT_VARY_STUB(VARY(layer)->convolutional.forward, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_forward_propagate_vary, layer, rows, cols, batch, a, b, stream); }
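CWC_IMPLEMENT_VARY_STUB above picks (x, y, z) out of the vary_x/vary_y/vary_z candidates at runtime; each _vary function returns -1 when a candidate violates a launch constraint. A hedged sketch of that profile-guided selection loop, with a hypothetical try_config callback standing in for ccv's macro machinery:

// try_config launches one candidate configuration on `stream` and returns -1
// when (x, y, z) is invalid, mirroring the checks in the _vary functions above.
typedef int (*try_config_f)(int x, int y, int z, cudaStream_t stream);

static void pick_best_config(try_config_f try_config, const int* xs, int xn, const int* ys, int yn, const int* zs, int zn, cudaStream_t stream, int best[3])
{
	float best_ms = 1e30f;
	cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	for (int i = 0; i < xn; i++)
		for (int j = 0; j < yn; j++)
			for (int k = 0; k < zn; k++)
			{
				if (try_config(xs[i], ys[j], zs[k], stream) < 0)
					continue; // constraint violated, skip; first run also warms up
				cudaEventRecord(start, stream);
				try_config(xs[i], ys[j], zs[k], stream); // time a second run
				cudaEventRecord(stop, stream);
				cudaEventSynchronize(stop);
				float ms;
				cudaEventElapsedTime(&ms, start, stop);
				if (ms < best_ms)
					best_ms = ms, best[0] = xs[i], best[1] = ys[j], best[2] = zs[k];
			}
	cudaEventDestroy(start);
	cudaEventDestroy(stop);
}

The winning tuple can then be cached per layer, which is what the vary struct hooked off layer->reserved stores, so the search only runs once.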
template <int input_per_thread, int size> __global__ static void _cwc_kern_rnorm_forward_propagate(const int batch, - float* input, const int rows, const int cols, const int channels, + float* input, const int rows, const int cols, const int channels_per_partition, const int partition, float* out, float* denoms, const float kappa, const float alpha, const float beta) { assert(gridDim.x == cols); assert(gridDim.y == rows); + assert(gridDim.z == partition); extern __shared__ float shared[]; float* shared_input = &shared[0]; const int way = size / 2; - const int thcnt = blockDim.x; const int thidx = threadIdx.x; - const int input_loads = (batch + thcnt - 1) / thcnt; int i, j, c; float prod[input_per_thread]; const int incnt = rows * cols * batch; - input += (blockIdx.y * cols + blockIdx.x) * batch; - out += (blockIdx.y * cols + blockIdx.x) * batch; - denoms += (blockIdx.y * cols + blockIdx.x) * batch; - const int end_way = min(way, channels - 1); + input += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + out += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + denoms += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + const int end_way = min(way, channels_per_partition - 1); for (c = 0; c < end_way; c++) { - #pragma unroll - for (i = 0; i < input_loads; i++) - if (i * thcnt + thidx < batch) - shared_input[c * batch + i * thcnt + thidx] = input[i * thcnt + thidx]; + shared_input[c * batch + thidx] = input[thidx]; input += incnt; } - for (c = 0; c < channels; c++) + for (c = 0; c < channels_per_partition; c++) { const int start_way = max(c - way, 0); - const int end_way = min(c + way, channels - 1); - if (c + way < channels) + const int end_way = min(c + way, channels_per_partition - 1); + if (c + way < channels_per_partition) { - #pragma unroll - for (i = 0; i < input_loads; i++) - if (i * thcnt + thidx < batch) - shared_input[(end_way % size) * batch + i * thcnt + thidx] = input[i * thcnt + thidx]; + shared_input[(end_way % size) * batch + thidx] = input[thidx]; input += incnt; } __syncthreads(); #pragma unroll for (i = 0; i < input_per_thread; i++) prod[i] = 0; - #pragma unroll + #pragma unroll 5 for (i = start_way; i <= end_way; i++) #pragma unroll for (j = 0; j < input_per_thread; j++) @@ -539,30 +848,22 @@ __global__ static void _cwc_kern_rnorm_forward_propagate(const int batch, } } -static void _cwc_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, float* denoms, const cudaStream_t& stream) +static void _cwc_convnet_rnorm_forward_propagate(ccv_convnet_layer_t* layer, int rows, int cols, int batch, float* a, float* b, float* denoms, const cudaStream_t& stream) { - dim3 num_blocks(layer->input.matrix.cols, layer->input.matrix.rows); + dim3 num_blocks(cols, rows, layer->input.matrix.partition); dim3 threads_per_block(batch); assert(threads_per_block.x <= 1024); int shared_memory_size = sizeof(float) * batch * layer->net.rnorm.size; - if (layer->net.rnorm.size == 3) - { - cudaFuncSetCacheConfig(_cwc_kern_rnorm_forward_propagate<1, 3>, cudaFuncCachePreferShared); - _cwc_kern_rnorm_forward_propagate - <1, 3> - <<<num_blocks, threads_per_block, shared_memory_size, stream>>> - (batch, - a, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, - b, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); - } else if (layer->net.rnorm.size == 5) { - cudaFuncSetCacheConfig(_cwc_kern_rnorm_forward_propagate<1, 5>, cudaFuncCachePreferShared); - _cwc_kern_rnorm_forward_propagate - <1, 5> - <<<num_blocks, threads_per_block, shared_memory_size, stream>>> - (batch, - a, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, - b, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); - } +#define vary_block(_, _x) \ cudaFuncSetCacheConfig(_cwc_kern_rnorm_forward_propagate<1, _x>, cudaFuncCachePreferShared); \ _cwc_kern_rnorm_forward_propagate \ <1, _x> \ <<<num_blocks, threads_per_block, shared_memory_size, stream>>> \ (batch, \ a, rows, cols, layer->input.matrix.channels / layer->input.matrix.partition, layer->input.matrix.partition, \ b, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); + cwc_vary_2_a(layer->net.rnorm.size, 3, 5, vary_block); +#undef vary_block }
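cwc_vary_2_a above expands to one branch per supported rnorm size so the kernel sees size as a compile-time constant and the window loop can unroll fully. A hand-rolled sketch of the same runtime-to-template dispatch, with a hypothetical kernel rather than ccv's macro:

// Compile-time `size` lets #pragma unroll flatten the window loop entirely.
template <int size>
__global__ void window_sum(const float* in, float* out, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= n)
		return;
	float sum = 0;
	#pragma unroll
	for (int j = 0; j < size; j++)
		sum += in[min(n - 1, i + j)];
	out[i] = sum;
}

// One case per instantiation: the runtime value selects the template.
static void launch_window_sum(const float* in, float* out, int n, int size, cudaStream_t stream)
{
	int blocks = (n + 255) / 256;
	switch (size)
	{
		case 3: window_sum<3><<<blocks, 256, 0, stream>>>(in, out, n); break;
		case 5: window_sum<5><<<blocks, 256, 0, stream>>>(in, out, n); break;
	}
}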
template <int input_per_thread> @@ -575,9 +876,8 @@ __global__ static void _cwc_kern_max_pool_forward_propagate(const int strides, c assert(gridDim.z == channels); extern __shared__ float shared[]; float* shared_input = &shared[0]; - const int thcnt = blockDim.x; + assert(blockDim.x == batch); const int thidx = threadIdx.x; - assert(thcnt >= batch); int i, x, y; input += blockIdx.z * rows * cols * batch + (blockIdx.y * strides * cols + blockIdx.x * strides) * batch; float prod[input_per_thread]; @@ -597,8 +897,7 @@ __global__ static void _cwc_kern_max_pool_forward_propagate(const int strides, c #pragma unroll for (x = size_start_x; x < size_end_x; x++) { - if (thidx < batch) - shared_input[thidx] = input[(y * cols + x) * batch + thidx]; + shared_input[thidx] = input[(y * cols + x) * batch + thidx]; __syncthreads(); if (x == size_start_x && y == size_start_y) #pragma unroll @@ -616,10 +915,10 @@ __global__ static void _cwc_kern_max_pool_forward_propagate(const int strides, c out[i + threadIdx.x * input_per_thread] = prod[i]; } -static void _cwc_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, const cudaStream_t& stream) +static void _cwc_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, int rows, int cols, int batch, float* a, float* b, const cudaStream_t& stream) { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); dim3 num_blocks(out_cols, out_rows, layer->input.matrix.channels); dim3 threads_per_block(batch); assert(threads_per_block.x <= 1024); @@ -628,7 +927,7 @@ static void _cwc_convnet_max_pool_forward_propagate(ccv_convnet_layer_t* layer, _cwc_kern_max_pool_forward_propagate <1> <<<num_blocks, threads_per_block, shared_memory_size, stream>>> (layer->net.pool.strides, layer->net.pool.border, layer->net.pool.size, batch, - a, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, + a, rows, cols, layer->input.matrix.channels, b, out_rows, out_cols); } @@ -682,10 +981,10 @@ __global__ static void _cwc_kern_average_pool_forward_propagate(const int stride out[i + threadIdx.x * input_per_thread] = prod[i] * inv_size; } -static void _cwc_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, const cudaStream_t& stream) +static void _cwc_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* layer, int rows, int cols, int batch, float* a, float* b, const cudaStream_t& stream) { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); dim3 num_blocks(out_rows, out_cols, layer->input.matrix.channels); dim3 threads_per_block(batch); int shared_memory_size = sizeof(float) * batch; @@ -693,14 +992,21 @@ static void _cwc_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* lay _cwc_kern_average_pool_forward_propagate <1> <<<num_blocks, threads_per_block, shared_memory_size, stream>>> (layer->net.pool.strides, layer->net.pool.border, layer->net.pool.size, batch, - a, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, + a, rows, cols, layer->input.matrix.channels, b, out_rows, out_cols); } -static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, float* batch_unit /* this is just 1's in device */, const cublasHandle_t& handle) +__global__ static void _cwc_kern_relu_forward_propagate(float* a) +{ + a += blockIdx.x * blockDim.x; + const int thidx = threadIdx.x; + a[thidx] = max(0.0, a[thidx]); +} +
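For context on the full-connect path that follows: the layer is a cuBLAS GEMM, and since batch_unit is a device vector of ones, the bias can be broadcast across the batch with one rank-1 GEMM before the weight GEMM accumulates on top with beta = 1, which is the shape of the cublasSgemm call below. A hedged sketch of that trick with local names (column-major, one sample per column of x):

#include <cublas_v2.h>

static void fc_forward(cublasHandle_t handle, int batch, int in_dim, int out_dim,
	const float* x /* batch-by-in_dim */, const float* w /* in_dim-by-out_dim */,
	const float* bias /* out_dim */, const float* ones /* batch ones on device */,
	float* y /* batch-by-out_dim */)
{
	float alpha = 1, beta = 0;
	// y = ones * bias^T: every row of y now holds the bias
	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, batch, out_dim, 1, &alpha, ones, batch, bias, 1, &beta, y, batch);
	beta = 1; // accumulate the weight product on top of the broadcast bias
	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, batch, out_dim, in_dim, &alpha, x, batch, w, in_dim, &beta, y, batch);
}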
+static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, float* batch_unit /* this is just 1's in device */, const cudaStream_t& stream, const cublasHandle_t& handle) { - int rows, out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int rows, out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); out_cols = batch; rows = layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels; float alpha = 1; @@ -710,6 +1016,11 @@ static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* lay beta = 1; // and then do the GEMM by adding bias cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, batch, out_rows, rows, &alpha, a, batch, layer->w, rows, &beta, b, batch); + if (layer->net.full_connect.relu) + _cwc_kern_relu_forward_propagate + <<<layer->net.full_connect.count, batch, 0, stream>>> + (b); + } __global__ static void _cwc_kern_mute_neuron(float* a, float* d) @@ -720,54 +1031,59 @@ __global__ static void _cwc_kern_mute_neuron(float* a, float* d) a[thidx] = a[thidx] * d[thidx]; } +static void _cwc_convnet_layer_forward_propagate(ccv_convnet_layer_t* layer, int device_id, int k, int rows, int cols, int batch, int dor, float* a, float* b, float* denoms, float* batch_unit, cwc_convnet_context_t* context) +{ + switch (layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + _cwc_convnet_convolutional_forward_propagate(layer, rows, cols, batch, a, b, context->device[device_id].data_stream); + if (dor && context->device[device_id].dor[k]) + { + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, rows, cols, &out_rows, &out_cols, &out_partition); + _cwc_kern_mute_neuron + <<<out_rows * out_cols * layer->net.convolutional.count, batch, 0, context->device[device_id].data_stream>>> + (b, context->device[device_id].dor[k]); + } + break; + case CCV_CONVNET_FULL_CONNECT: + assert(k > 0); + _cwc_convnet_full_connect_forward_propagate(layer, batch, a, b, batch_unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); + if (dor && context->device[device_id].dor[k]) + _cwc_kern_mute_neuron + <<<layer->net.full_connect.count, batch, 0, context->device[device_id].data_stream>>> + (b, context->device[device_id].dor[k]); + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + assert(k > 0); + _cwc_convnet_rnorm_forward_propagate(layer, rows, cols, batch, a, b, denoms, context->device[device_id].data_stream); + break; + case CCV_CONVNET_MAX_POOL: + assert(k > 0); + _cwc_convnet_max_pool_forward_propagate(layer, rows, cols, batch, a, b, context->device[device_id].data_stream); + break; + case CCV_CONVNET_AVERAGE_POOL: + assert(k > 0); + _cwc_convnet_average_pool_forward_propagate(layer, rows, cols, batch, a, b, context->device[device_id].data_stream); + break; + } +} +
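The dor path in _cwc_convnet_layer_forward_propagate is dropout: a precomputed 0/1 mask is copied to the device and multiplied into the activations, which is all _cwc_kern_mute_neuron does. A hedged sketch of that flow; the real code draws its masks with GSL, so rand() here is only a stand-in:

#include <stdlib.h>

__global__ static void mask_mul(float* a, const float* mask, int n)
{
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n)
		a[i] *= mask[i]; // zeroed units are "muted" for this batch
}

static void apply_dropout(float* a, float* host_mask, float* device_mask, int n, float keep, cudaStream_t stream)
{
	for (int i = 0; i < n; i++) // Bernoulli mask: keep a unit with probability `keep`
		host_mask[i] = ((float)rand() / RAND_MAX < keep) ? 1 : 0;
	cudaMemcpyAsync(device_mask, host_mask, sizeof(float) * n, cudaMemcpyHostToDevice, stream);
	mask_mul<<<(n + 255) / 256, 256, 0, stream>>>(a, device_mask, n);
}

host_mask would live in page-locked memory (cudaMallocHost) for the copy to be truly asynchronous, which is exactly what the host[device_id].dor buffers allocated earlier are for.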
// assuming a is in device memory -static void _cwc_convnet_encode_impl(ccv_convnet_t* convnet, float* a, int batch, int dor, cwc_convnet_context_t* context) +static void _cwc_convnet_encode_impl(ccv_convnet_t* convnet, int device_id, float* a, int batch, int dor, cwc_convnet_context_t* context) { assert(batch % 16 == 0); - int i; - for (i = 0; i < convnet->count; i++) - { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; - switch (layer->type) - { - case CCV_CONVNET_CONVOLUTIONAL: - _cwc_convnet_convolutional_forward_propagate(layer, batch, i == 0 ? a : GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], context->device.stream); - if (dor && context->device.dor[i]) - { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); - _cwc_kern_mute_neuron - <<<out_rows * out_cols * layer->net.convolutional.count, batch, 0, context->device.stream>>> - (GPU(convnet)->forwards[i], context->device.dor[i]); - } - break; - case CCV_CONVNET_FULL_CONNECT: - assert(i > 0); - _cwc_convnet_full_connect_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], GPU(convnet)->unit, context->device.cublas); - if (dor && context->device.dor[i]) - _cwc_kern_mute_neuron - <<<layer->net.full_connect.count, batch, 0, context->device.stream>>> - (GPU(convnet)->forwards[i], context->device.dor[i]); - break; - case CCV_CONVNET_LOCAL_RESPONSE_NORM: - assert(i > 0); - _cwc_convnet_rnorm_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], GPU(convnet)->denoms[i], context->device.stream); - break; - case CCV_CONVNET_MAX_POOL: - assert(i > 0); - _cwc_convnet_max_pool_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], context->device.stream); - break; - case CCV_CONVNET_AVERAGE_POOL: - assert(i > 0); - _cwc_convnet_average_pool_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], context->device.stream); - break; - } + int i; + for (i = 0; i < convnet->count; i++) + { + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + _cwc_convnet_layer_forward_propagate(layer, device_id, i, layer->input.matrix.rows, layer->input.matrix.cols, batch, dor, i == 0 ? a : GPU(convnet)->device[device_id].forwards[i - 1], GPU(convnet)->device[device_id].forwards[i], GPU(convnet)->device[device_id].denoms[i], GPU(convnet)->device[device_id].unit, context); } } #ifdef HAVE_GSL -__global__ static void _cwc_kern_convolutional_relu_backward_propagate(const int batch, +__global__ static void _cwc_kern_relu_backward_propagate(const int batch, float* out, float* out_grad, const int out_rows, const int out_cols, const int count) { @@ -782,34 +1098,36 @@ __global__ static void _cwc_kern_convolutional_relu_backward_propagate(const int } template <int channel_per_thread, int filter_per_thread, int channel_per_block, int batch_per_block> -__global__ static void _cwc_kern_convolutional_backward_propagate_coeff_default(const int strides, const int border, const int batch, - float* input, const int rows, const int cols, const int channels, +__global__ static void _cwc_kern_convolutional_backward_propagate_coefficient_default(const int strides, const int border, const int batch, const int batch_group_count, + float* input, const int rows, const int cols, const int channels_per_partition, const int partition, float* out_grad, const int out_rows, const int out_cols, - float* coeff, const int filter_rows, const int filter_cols, const int count) + float* coeff, const int filter_rows, const int filter_cols, const int count_per_partition) { assert(gridDim.x == filter_cols); assert(gridDim.y == filter_rows); - assert(gridDim.z * channel_per_block * batch_per_block == channels * batch); + assert(gridDim.z * channel_per_block * batch_per_block == channels_per_partition * partition * batch); + assert(batch == batch_per_block * batch_group_count); extern __shared__ float shared[]; float* shared_input = &shared[0]; float* shared_out_grad = &shared[channel_per_block]; const int thidx = threadIdx.x + threadIdx.y * blockDim.x; const int thcnt = blockDim.x * blockDim.y; - assert(blockDim.x * filter_per_thread == count); + 
assert(blockDim.x * filter_per_thread == count_per_partition); assert(blockDim.y * channel_per_thread == channel_per_block); assert(thcnt >= channel_per_block); - assert(thcnt >= count); + assert(thcnt >= count_per_partition); const int origin_x = blockIdx.x; const int origin_y = blockIdx.y; - const int channel_group_count = channels / channel_per_block; + const int channel_group_count = channels_per_partition / channel_per_block; + const int partition_idx = blockIdx.z / (channel_group_count * batch_group_count); + const int batch_group_idx = (blockIdx.z % (channel_group_count * batch_group_count)) / channel_group_count; const int channel_group_idx = blockIdx.z % channel_group_count; - const int batch_group_idx = blockIdx.z / channel_group_count; const int start_x = max(origin_x - border, 0) - (origin_x - border); const int end_x = min(out_cols, (cols + border - origin_x + strides - 1) / strides); const int start_y = max(origin_y - border, 0) - (origin_y - border); const int end_y = min(out_rows, (rows + border - origin_y + strides - 1) / strides); - input += rows * cols * channels * batch_group_idx * batch_per_block + (origin_y * cols + origin_x) * channels + channel_group_idx * channel_per_block; - out_grad += out_rows * out_cols * count * batch_group_idx * batch_per_block; + input += (partition_idx * batch + batch_group_idx * batch_per_block) * rows * cols * channels_per_partition + (origin_y * cols + origin_x) * channels_per_partition + channel_group_idx * channel_per_block; + out_grad += (partition_idx * batch + batch_group_idx * batch_per_block) * out_rows * out_cols * count_per_partition; int i, j, c, x, y; float prod[channel_per_thread][filter_per_thread]; #pragma unroll @@ -822,10 +1140,10 @@ __global__ static void _cwc_kern_convolutional_backward_propagate_coeff_default( for (y = start_y; y < end_y; y++) for (x = start_x; x < end_x; x++) { - if (thidx < count) - shared_out_grad[thidx] = out_grad[(y * out_cols + x) * count + thidx]; + if (thidx < count_per_partition) + shared_out_grad[thidx] = out_grad[(y * out_cols + x) * count_per_partition + thidx]; if (thidx < channel_per_block) - shared_input[thidx] = input[((y * strides - border) * cols + x * strides - border) * channels + thidx]; + shared_input[thidx] = input[((y * strides - border) * cols + x * strides - border) * channels_per_partition + thidx]; __syncthreads(); #pragma unroll for (i = 0; i < channel_per_thread; i++) @@ -834,11 +1152,11 @@ __global__ static void _cwc_kern_convolutional_backward_propagate_coeff_default( prod[i][j] += shared_input[i + threadIdx.y * channel_per_thread] * shared_out_grad[j + threadIdx.x * filter_per_thread]; __syncthreads(); } - input += rows * cols * channels; - out_grad += out_rows * out_cols * count; + input += rows * cols * channels_per_partition; + out_grad += out_rows * out_cols * count_per_partition; } - const int cocnt = filter_cols * filter_rows * count; - coeff += cocnt * (channels * batch_group_idx + channel_group_idx * channel_per_block) + (origin_y * filter_cols + origin_x) * count; + const int cocnt = filter_cols * filter_rows * count_per_partition * partition; + coeff += cocnt * (channels_per_partition * batch_group_idx + channel_group_idx * channel_per_block) + (origin_y * filter_cols + origin_x) * count_per_partition * partition + partition_idx * count_per_partition; #pragma unroll for (i = 0; i < channel_per_thread; i++) #pragma unroll @@ -846,17 +1164,18 @@ __global__ static void _cwc_kern_convolutional_backward_propagate_coeff_default( coeff[(i + threadIdx.y * 
channel_per_thread) * cocnt + j + threadIdx.x * filter_per_thread] = prod[i][j]; } -template <int channel_per_thread, int filter_per_thread, int batch_per_block> -__global__ static void _cwc_kern_convolutional_backward_propagate_coeff_multi_way(const int strides, const int border, const int batch, +template <int channel_per_thread, int filter_per_thread, int static_filter_rows, int batch_per_block> +__global__ static void _cwc_kern_convolutional_backward_propagate_coefficient_rows(const int strides, const int border, const int batch, float* input, const int rows, const int cols, const int channels, float* out_grad, const int out_rows, const int out_cols, float* coeff, const int filter_rows, const int filter_cols, const int count) { assert(gridDim.x == filter_cols); - assert(gridDim.y == filter_rows); + assert(gridDim.y == out_rows); + assert(static_filter_rows == filter_rows); extern __shared__ float shared[]; float* shared_input = &shared[0]; - float* shared_out_grad = &shared[channels * batch_per_block]; + float* shared_out_grad = &shared[filter_rows * channels * batch_per_block]; const int thidx = threadIdx.x + threadIdx.y * blockDim.x; const int thcnt = blockDim.x * blockDim.y; assert(blockDim.x * filter_per_thread == count); @@ -864,117 +1183,68 @@ __global__ static void _cwc_kern_convolutional_backward_propagate_coeff_multi_wa assert(thcnt >= channels * batch_per_block); assert(thcnt >= count); const int origin_x = blockIdx.x; - const int origin_y = blockIdx.y; - const int batch_group_idx = blockIdx.z / out_rows; + const int batch_group_idx = blockIdx.z; const int start_x = max(origin_x - border, 0) - (origin_x - border); const int end_x = min(out_cols, (cols + border - origin_x + strides - 1) / strides); - input += (rows * cols * channels * batch_group_idx + (origin_y * cols + origin_x) * channels) * batch_per_block; + input += (rows * cols * channels * batch_group_idx + origin_x * channels) * batch_per_block; out_grad += out_rows * out_cols * count * batch_group_idx * batch_per_block; - int i, j, c, x; - const int y = blockIdx.z % out_rows; - float prod[channel_per_thread][filter_per_thread]; + int i, j, k, c, x; + const int y = blockIdx.y; + float prod[static_filter_rows][channel_per_thread][filter_per_thread]; #pragma unroll - for (i = 0; i < channel_per_thread; i++) + for (i = 0; i < static_filter_rows; i++) #pragma unroll - for (j = 0; j < filter_per_thread; j++) - prod[i][j] = 0; - const int iy = origin_y + y * strides - border; - const int chidx = thidx < channels * batch_per_block ? thidx : channels * batch_per_block - 1; - if (iy >= 0 && iy < rows) + for (j = 0; j < channel_per_thread; j++) + #pragma unroll + for (k = 0; k < filter_per_thread; k++) + prod[i][j][k] = 0; + const int iy = y * strides - border; + input += y * strides * cols * channels * batch_per_block; + out_grad += y * out_cols * count * batch_per_block; + for (x = start_x; x < end_x; x++) { - input += (y * strides - border) * cols * channels * batch_per_block; - out_grad += y * out_cols * count * batch_per_block; - for (x = start_x; x < end_x; x++) - { - if (thidx < count) - #pragma unroll - for (c = 0; c < batch_per_block; c++) - shared_out_grad[c * count + thidx] = out_grad[x * count * batch_per_block + c * count + thidx]; - shared_input[chidx] = input[(x * strides - border) * channels * batch_per_block + chidx]; // no need for a conditional - __syncthreads(); + if (thidx < channels * batch_per_block) + #pragma unroll + for (i = 0; i < static_filter_rows; i++) + shared_input[i * channels * batch_per_block + thidx] = (i + iy >= 0 && i + iy < rows) ? 
input[((i - border) * cols + x * strides - border) * channels * batch_per_block + thidx] : 0; + if (thidx < count) + #pragma unroll + for (c = 0; c < batch_per_block; c++) + shared_out_grad[c * count + thidx] = out_grad[x * count * batch_per_block + c * count + thidx]; + __syncthreads(); + #pragma unroll + for (i = 0; i < static_filter_rows; i++) #pragma unroll - for (i = 0; i < channel_per_thread; i++) + for (j = 0; j < channel_per_thread; j++) #pragma unroll - for (j = 0; j < filter_per_thread; j++) + for (k = 0; k < filter_per_thread; k++) { float sum = 0; #pragma unroll for (c = 0; c < batch_per_block; c++) - sum += shared_input[c * channels + i + threadIdx.y * channel_per_thread] * shared_out_grad[c * count + j + threadIdx.x * filter_per_thread]; - prod[i][j] += sum; + sum += shared_input[i * channels * batch_per_block + c * channels + j + threadIdx.y * channel_per_thread] * shared_out_grad[c * count + k + threadIdx.x * filter_per_thread]; + prod[i][j][k] += sum; } - __syncthreads(); - } + __syncthreads(); } const int cocnt = filter_cols * filter_rows * count; - coeff += cocnt * channels * blockIdx.z + (origin_y * filter_cols + origin_x) * count; + coeff += cocnt * channels * (blockIdx.y + blockIdx.z * out_rows) + origin_x * count; #pragma unroll for (i = 0; i < channel_per_thread; i++) #pragma unroll - for (j = 0; j < filter_per_thread; j++) - coeff[(i + threadIdx.y * channel_per_thread) * cocnt + j + threadIdx.x * filter_per_thread] = prod[i][j]; -} - -template <int out_per_thread> -__global__ static void _cwc_kern_convolutional_backward_propagate_bias(const int batch, - float* out_grad, const int out_rows, const int out_cols, - float* bias, const int count) -{ - assert(gridDim.x == count); - const int skip_pixels = blockDim.y; - extern __shared__ float shared[]; - float* shared_bias = &shared[0]; - float* shared_grad = &shared[1]; - int i, x; - const int thidx = threadIdx.x + threadIdx.y * blockDim.x; - const int thcnt = blockDim.x * blockDim.y; - const int out_loads = (batch * skip_pixels + thcnt - 1) / thcnt; - assert(thcnt % batch == 0); - out_grad += blockIdx.x * out_rows * out_cols * batch + thidx; - const int out_load_factor = thcnt; - const int out_load_pixels = thcnt / batch; - if (thidx == 0) - shared_bias[0] = 0; - for (x = 0; x < out_rows * out_cols; x += skip_pixels) - { - for (i = 0; i < out_loads; i++) - if (i * thcnt + thidx < batch * skip_pixels && x + i * out_load_pixels < out_rows * out_cols) - shared_grad[i * thcnt + thidx] = out_grad[x * batch + i * out_load_factor]; - __syncthreads(); - // because I branched out with threadIdx, therefore, synchronization must happen outside of the if clause - if (threadIdx.y + x < out_rows * out_cols) - { + for (j = 0; j < static_filter_rows; j++) #pragma unroll - for (i = 1; i < out_per_thread; i++) - shared_grad[threadIdx.y * batch + threadIdx.x * out_per_thread] += shared_grad[threadIdx.y * batch + threadIdx.x * out_per_thread + i]; - } - __syncthreads(); - // I can do better here, but bias computation is not the bottleneck - if (threadIdx.y + x < out_rows * out_cols && threadIdx.x == 0) - #pragma unroll - for (i = 1; i < blockDim.x; i++) - shared_grad[threadIdx.y * batch] += shared_grad[threadIdx.y * batch + i * out_per_thread]; - __syncthreads(); - // because I branched out with threadIdx, therefore, synchronization must happen outside of the if clause, thus, this if clause appeared repeatedly - if (threadIdx.y + x < out_rows * out_cols && thidx == 0) - { - #pragma unroll - for (i = 1; i < blockDim.y && i + x < out_rows * out_cols; i++) - shared_grad[0] += shared_grad[i * batch]; - shared_bias[0] += shared_grad[0]; - } - __syncthreads(); - } - if (thidx == 0) - bias[blockIdx.x] = shared_bias[0]; + for (k = 0; k < filter_per_thread; k++) + coeff[(i + threadIdx.y * channel_per_thread) * cocnt + j * filter_cols * count + k + threadIdx.x * filter_per_thread] = prod[j][i][k]; }
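The deleted bias kernel above is made redundant by the pattern this diff moves toward: each block writes its partial result into a scratch slot, and the slots are folded with one cublasSgemv against a ones vector (the unit buffer), the same trick the coefficient macro further down uses. A hedged sketch of that reduction with local names:

#include <cublas_v2.h>

// scratch holds g partial gradient blocks of length n, laid out as an
// n-by-g column-major matrix; grad = scratch * ones computes the row sums,
// i.e. folds the g partial blocks into one gradient vector.
static void fold_partials(cublasHandle_t handle, const float* scratch,
	const float* ones /* g ones on device */, float* grad, int n, int g)
{
	float alpha = 1, beta = 0;
	cublasSgemv(handle, CUBLAS_OP_N, n, g, &alpha, scratch, n, ones, 1, &beta, grad, 1);
}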
-template <int input_per_thread, int channel_per_thread, int channel_per_block> -__global__ static void _cwc_kern_convolutional_backward_propagate(const int strides, const int border, const int batch, +template <int input_per_thread, int channel_per_thread, int channel_per_block, int strides> +__global__ static void _cwc_kern_convolutional_backward_propagate_error(const int border, const int batch, float* input_grad, const int rows, const int cols, const int channels, float* out_grad, const int out_rows, const int out_cols, - float* filter, const int filter_rows, const int filter_cols, const int count) + float* filter, const int filter_rows, const int filter_cols, const int count_per_partition, const int partition) { + assert(gridDim.z == partition); extern __shared__ float shared[]; float* shared_grad = &shared[0]; float* shared_weights = &shared[batch]; @@ -987,7 +1257,7 @@ __global__ static void _cwc_kern_convolutional_backward_propagate(const int stri assert(thcnt >= channel_per_block); const int origin_x = blockIdx.x % cols; const int origin_y = blockIdx.y; - const int channel_group_idx = blockIdx.x / cols; + const int channel_group_idx = blockIdx.z * channels / (channel_per_block * partition) + blockIdx.x / cols; int i, j, k, c, x, y; #pragma unroll for (i = 0; i < input_per_thread; i++) @@ -1006,14 +1276,14 @@ __global__ static void _cwc_kern_convolutional_backward_propagate(const int stri const int out_start_x = max(out_x, 0); const int filter_start_y = filter_y - (out_start_y - out_y) * strides; const int filter_start_x = filter_x - (out_start_x - out_x) * strides; - out_grad += (out_start_y * out_cols + out_start_x) * batch; + out_grad += (blockIdx.z * count_per_partition * out_rows * out_cols + out_start_y * out_cols + out_start_x) * batch; const int out_end_y = out_y + ycnt - 1; const int out_end_x = out_x + xcnt - 1; const int filter_end_y = (origin_x + border) % strides + (out_end_y - min(out_end_y, out_rows - 1)) * strides; const int filter_end_x = (origin_y + border) % strides + (out_end_x - min(out_end_x, out_cols - 1)) * strides; const int outcnt = out_rows * out_cols * batch; filter += channel_group_idx * channel_per_block; - for (k = 0; k < count; k++) + for (k = 0; k < count_per_partition; k++) { float* out_grad_per_filter = out_grad + k * outcnt; for (y = filter_start_y; y >= filter_end_y; y -= strides) @@ -1046,107 +1316,176 @@ __global__ static void _cwc_kern_convolutional_backward_propagate(const int stri } // this method rewinds a matrix -__global__ static void _cwc_kern_reorder_matrix_major(float* a, float* b, const int count, const int channels, const int batch) +template <int reorder_per_block> +__global__ static void _cwc_kern_reorder_matrix_major(float* a, float* b, const int count, const int channels_per_partition, const int partition, const int batch) +{ + assert(blockDim.x == reorder_per_block); + const int batch_group_idx = blockIdx.y % (batch / reorder_per_block); + const int channel_group_idx = blockIdx.y / (batch / reorder_per_block); + a += (blockIdx.z * count * channels_per_partition + blockIdx.x + channel_group_idx * reorder_per_block * count) * batch + batch_group_idx * reorder_per_block; + b += (blockIdx.z * count * batch + batch_group_idx * reorder_per_block * count + blockIdx.x) * channels_per_partition + channel_group_idx * reorder_per_block; + __shared__ float 
prod[reorder_per_block][reorder_per_block]; + int i; + #pragma unroll + for (i = 0; i < reorder_per_block; i++) + prod[i][threadIdx.x] = a[i * count * batch + threadIdx.x]; + __syncthreads(); + #pragma unroll + for (i = 0; i < reorder_per_block; i++) + b[i * count * channels_per_partition + threadIdx.x] = prod[threadIdx.x][i]; +} + +// this method rewinds a matrix +__global__ static void _cwc_kern_reorder_matrix_major_parted(float* a, float* b, const int count, const int channels, const int batch, const int channels_per_partition, const int batch_per_partition, const int partition) { - b[(threadIdx.x * count + blockIdx.x) * channels + blockIdx.y] = a[(blockIdx.y * count + blockIdx.x) * batch + threadIdx.x]; + b[(threadIdx.x * count + blockIdx.x) * channels + blockIdx.y + threadIdx.y * channels_per_partition] = a[(blockIdx.y * count + blockIdx.x) * batch + threadIdx.x + threadIdx.y * batch_per_partition]; } // this method rewinds a matrix -__global__ static void _cwc_kern_reorder_matrix_major_per_block(float* a, float* b, const int count, const int channels, const int batch, const int batch_per_block) +template <int batch_per_block> +__global__ static void _cwc_kern_reorder_matrix_major_per_block_rows(float* a, float* b, const int count, const int channels, const int batch) +{ + const int thidx = blockIdx.y * batch_per_block + threadIdx.y; + b[(blockIdx.y * count + blockIdx.x) * channels * batch_per_block + threadIdx.y * channels + threadIdx.x] = a[(threadIdx.x * count + blockIdx.x) * batch + thidx]; +} + +// this method rewinds a matrix +template <int channel_per_block, int batch_per_block, int batch_group_per_block> +__global__ static void _cwc_kern_reorder_matrix_major_per_block(float* a, float* b, const int count, const int channels, const int batch) +{ + const int batch_group_idx = blockIdx.y % (batch / (batch_per_block * batch_group_per_block)); + const int channel_group_idx = blockIdx.y / (batch / (batch_per_block * batch_group_per_block)); + a += (channel_group_idx * channel_per_block * count + blockIdx.x) * batch + batch_group_idx * batch_per_block * batch_group_per_block; + b += (batch_group_idx * batch_group_per_block * count + blockIdx.x) * channels * batch_per_block + channel_group_idx * channel_per_block; + __shared__ float prod[channel_per_block][batch_per_block * batch_group_per_block]; + int i, j; + #pragma unroll + for (i = 0; i < channel_per_block; i++) + prod[i][threadIdx.x] = a[i * count * batch + threadIdx.x]; + __syncthreads(); + if (threadIdx.x < channel_per_block) + #pragma unroll + for (i = 0; i < batch_group_per_block; i++) + #pragma unroll + for (j = 0; j < batch_per_block; j++) + b[(i * count * batch_per_block + j) * channels + threadIdx.x] = prod[threadIdx.x][i * batch_per_block + j]; +} + +static void _cwc_convnet_reorder_matrix_major_per_block(float* a, float* b, const int count, const int channels, const int batch, const cudaStream_t& stream) { - const int thidx = threadIdx.x + threadIdx.y * batch_per_block; - b[(threadIdx.y * count + blockIdx.x) * channels * batch_per_block + threadIdx.x * channels + blockIdx.y] = a[(blockIdx.y * count + blockIdx.x) * batch + thidx]; + // this is by experience, ideally, this can be profile-guided too + const int batch_group_count = batch / BATCH_PER_BLOCK; + if (channels < 8) + { + assert(batch % BATCH_PER_BLOCK == 0); + assert(channels * BATCH_PER_BLOCK <= 1024); + _cwc_kern_reorder_matrix_major_per_block_rows + <BATCH_PER_BLOCK> + <<<dim3(count, batch_group_count), dim3(channels, BATCH_PER_BLOCK), 0, stream>>> + (a, b, count, channels, batch); + } else { + assert(channels % THREAD_PER_BLOCK == 0); + assert(THREAD_PER_BLOCK % BATCH_PER_BLOCK == 0); + assert(batch % THREAD_PER_BLOCK == 0);
+ _cwc_kern_reorder_matrix_major_per_block + <THREAD_PER_BLOCK, BATCH_PER_BLOCK, THREAD_PER_BLOCK / BATCH_PER_BLOCK> + <<<dim3(count, (channels / THREAD_PER_BLOCK) * (batch / THREAD_PER_BLOCK)), THREAD_PER_BLOCK, 0, stream>>> + (a, b, count, channels, batch); + } }
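The reorder kernels above are, at heart, tiled transposes between channel-major and batch-major layouts: a tile is read in one order, parked in shared memory, and written back in the other so that both global-memory passes stay coalesced. A generic sketch of the technique, not ccv's exact layouts:

#define TILE 16
// Transpose a rows-by-cols matrix; both the read and the write are coalesced
// along threadIdx.x because the reordering happens inside the shared tile.
__global__ static void transpose_tile(const float* in, float* out, int rows, int cols)
{
	__shared__ float tile[TILE][TILE + 1]; // +1 column pads away bank conflicts
	int x = blockIdx.x * TILE + threadIdx.x;
	int y = blockIdx.y * TILE + threadIdx.y;
	if (x < cols && y < rows)
		tile[threadIdx.y][threadIdx.x] = in[y * cols + x];
	__syncthreads();
	x = blockIdx.y * TILE + threadIdx.x; // block indices swap for the write
	y = blockIdx.x * TILE + threadIdx.y;
	if (x < rows && y < cols)
		out[y * rows + x] = tile[threadIdx.x][threadIdx.y];
}

Launched as transpose_tile<<<dim3((cols + 15) / 16, (rows + 15) / 16), dim3(16, 16)>>>(in, out, rows, cols).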
layer->net.convolutional.rows, out_rows * batch_group_count); \ - int shared_memory_size = sizeof(float) * BATCH_PER_BLOCK * (layer->input.matrix.channels + layer->net.convolutional.count); \ - cudaFuncSetCacheConfig(_cwc_kern_convolutional_backward_propagate_coeff_multi_way<_x, _y, BATCH_PER_BLOCK>, cudaFuncCachePreferShared); \ - _cwc_kern_convolutional_backward_propagate_coeff_multi_way \ - <_x, _y, BATCH_PER_BLOCK> \ + dim3 num_blocks_for_coeff(layer->net.convolutional.cols, out_rows, batch_group_count); \ + int shared_memory_size = sizeof(float) * BATCH_PER_BLOCK * (layer->net.convolutional.rows * layer->input.matrix.channels + layer->net.convolutional.count); \ + cudaFuncSetCacheConfig(_cwc_kern_convolutional_backward_propagate_coefficient_rows<_x, _y, _z, BATCH_PER_BLOCK>, cudaFuncCachePreferShared); \ + _cwc_kern_convolutional_backward_propagate_coefficient_rows \ + <_x, _y, _z, BATCH_PER_BLOCK> \ <<>> \ (layer->net.convolutional.strides, layer->net.convolutional.border, batch, \ chm, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, \ cha, out_rows, out_cols, \ cbw, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count); \ - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, count, 1, out_rows * batch_group_count, &alpha, cbw, count, unit, out_rows * batch_group_count, &beta, configuration->w, count); \ + cublasSgemv(handle, CUBLAS_OP_N, count, out_rows * batch_group_count, &alpha, cbw, count, unit, 1, &beta, configuration->w, 1); \ } while (0) // special casing for image - cwc_vary_4_a(x, 1, 2, 3, 4, cwc_vary_2_c, y, 1, 2, vary_block); + cwc_vary_4_a(x, 1, 2, 3, 4, cwc_vary_4_b, y, 1, 2, 3, 4, cwc_vary_5_c, layer->net.convolutional.rows, 3, 5, 7, 9, 11, vary_block); #undef vary_block assert(cudaGetLastError() == cudaSuccess); return 0; } -static void _cwc_convnet_convolutional_backward_propagate_coeff_multi_way(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle) +static void _cwc_convnet_convolutional_backward_propagate_coefficient_rows(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle) { static int vary_x[] = { 1, 2, 3, 4 }; - static int vary_y[] = { 1, 2 }; + static int vary_y[] = { 1, 2, 3, 4 }; static int vary_z[] = { 1 }; - CWC_IMPLEMENT_VARY_STUB(VARY(layer)->backward_propagate_coeff, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_coeff_multi_way_vary, layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); + CWC_IMPLEMENT_VARY_STUB(VARY(layer)->convolutional.backward.coefficient, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_coefficient_rows_vary, layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); } -static int _cwc_convnet_convolutional_backward_propagate_coeff_default_vary(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle, +static int _cwc_convnet_convolutional_backward_propagate_coefficient_default_vary(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const 
cublasHandle_t& handle, int x, int y, int z) { - if (!(layer->net.convolutional.count % y == 0 && z % x == 0 && layer->net.convolutional.channels % z == 0 && - layer->net.convolutional.count / y * z / x <= 1024 && /* thread per block constraint */ - layer->net.convolutional.count / y * z / x >= z && layer->net.convolutional.count / y * z / x >= layer->net.convolutional.count && /* shared loading constraint */ - sizeof(float) * (z + layer->net.convolutional.count) <= 32 * 1024 /* shared memory size constraint */)) + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); + if (!(layer->net.convolutional.count % (y * out_partition) == 0 && z % x == 0 && layer->net.convolutional.channels % (z * out_partition) == 0 && + layer->net.convolutional.count / (y * out_partition) * z / x <= 1024 && /* thread per block constraint */ + layer->net.convolutional.count / (y * out_partition) * z / x >= z && layer->net.convolutional.count / (y * out_partition) * z / x >= layer->net.convolutional.count / out_partition && /* shared loading constraint */ + sizeof(float) * (z + layer->net.convolutional.count / out_partition) <= 32 * 1024 /* shared memory size constraint */)) return -1; - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); float* chm = scratch; float* cha = scratch + layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels * batch; float* cbw = scratch + layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels * batch + out_rows * out_cols * layer->net.convolutional.count * batch; float alpha = 1, beta = 0; - int count = layer->net.convolutional.rows * layer->net.convolutional.cols * layer->net.convolutional.count * layer->input.matrix.channels; + int count = layer->net.convolutional.rows * layer->net.convolutional.cols * layer->net.convolutional.count * layer->input.matrix.channels / out_partition; + assert((layer->input.matrix.channels / out_partition) % THREAD_PER_BLOCK == 0); + assert((layer->net.convolutional.count / out_partition) % THREAD_PER_BLOCK == 0); + assert(batch % THREAD_PER_BLOCK == 0); _cwc_kern_reorder_matrix_major - <<input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels), batch, 0, stream>>> - (m, chm, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels, batch); + + <<input.matrix.rows * layer->input.matrix.cols, (layer->input.matrix.channels / out_partition / THREAD_PER_BLOCK) * (batch / THREAD_PER_BLOCK), out_partition), THREAD_PER_BLOCK, sizeof(float) * THREAD_PER_BLOCK * THREAD_PER_BLOCK, stream>>> + (m, chm, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels / out_partition, out_partition, batch); _cwc_kern_reorder_matrix_major - <<net.convolutional.count), batch, 0, stream>>> - (a, cha, out_rows * out_cols, layer->net.convolutional.count, batch); + + <<net.convolutional.count / out_partition / THREAD_PER_BLOCK) * (batch / THREAD_PER_BLOCK), out_partition), THREAD_PER_BLOCK, sizeof(float) * THREAD_PER_BLOCK * THREAD_PER_BLOCK, stream>>> + (a, cha, out_rows * out_cols, layer->net.convolutional.count / out_partition, out_partition, batch); #define vary_block(_x, _y, _z) do { \ - dim3 threads_per_block_for_coeff(layer->net.convolutional.count / _y, _z / _x); \ + dim3 threads_per_block_for_coeff(layer->net.convolutional.count / (_y * out_partition), _z / _x); \ 
assert(threads_per_block_for_coeff.x * threads_per_block_for_coeff.y <= 1024); \ int batch_group_count = batch / BATCH_PER_BLOCK; \ dim3 num_blocks_for_coeff(layer->net.convolutional.cols, layer->net.convolutional.rows, layer->net.convolutional.channels / _z * batch_group_count); \ - int shared_memory_size = sizeof(float) * (_z + layer->net.convolutional.count); \ - _cwc_kern_convolutional_backward_propagate_coeff_default \ + int shared_memory_size = sizeof(float) * (_z + layer->net.convolutional.count / out_partition); \ + _cwc_kern_convolutional_backward_propagate_coefficient_default \ <_x, _y, _z, BATCH_PER_BLOCK> \ <<>> \ - (layer->net.convolutional.strides, layer->net.convolutional.border, batch, \ - chm, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, \ + (layer->net.convolutional.strides, layer->net.convolutional.border, batch, batch_group_count, \ + chm, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels / out_partition, out_partition, \ cha, out_rows, out_cols, \ - cbw, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count); \ - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, count, 1, batch_group_count, &alpha, cbw, count, unit, batch_group_count, &beta, configuration->w, count); \ + cbw, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count / out_partition); \ + cublasSgemv(handle, CUBLAS_OP_N, count, batch_group_count, &alpha, cbw, count, unit, 1, &beta, configuration->w, 1); \ } while (0) cwc_vary_6_a(x, 1, 2, 3, 4, 6, 8, cwc_vary_6_b, y, 1, 2, 3, 4, 6, 8, cwc_vary_4_c, z, 16, 24, 32, 36, vary_block); #undef vary_block @@ -1154,44 +1493,44 @@ static int _cwc_convnet_convolutional_backward_propagate_coeff_default_vary(ccv_ return 0; } -static void _cwc_convnet_convolutional_backward_propagate_coeff_default(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle) +static void _cwc_convnet_convolutional_backward_propagate_coefficient_default(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle) { static int vary_x[] = { 1, 2, 3, 4, 6, 8 }; static int vary_y[] = { 1, 2, 3, 4, 6, 8 }; static int vary_z[] = { 16, 24, 32, 36 }; - CWC_IMPLEMENT_VARY_STUB(VARY(layer)->backward_propagate_coeff, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_coeff_default_vary, layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); + CWC_IMPLEMENT_VARY_STUB(VARY(layer)->convolutional.backward.coefficient, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_coefficient_default_vary, layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); } static int _cwc_convnet_convolutional_backward_propagate_error_vary(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle, int x, int y, int z) { + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); if (!(batch % x == 0 && z % y == 0 && - layer->input.matrix.channels % z == 0 && + 
layer->input.matrix.channels % (z * out_partition) == 0 && batch / x * z / y <= 1024 && /* thread per block constraint */ batch / x * z / y >= batch && batch / x * z / y >= z && /* shared memory loading constraint */ sizeof(float) * (batch + z) <= 48 * 1024 /* shared memory size constraint */)) return -1; - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); float* chw = scratch; - _cwc_kern_reorder_matrix_major - <<net.convolutional.rows * layer->net.convolutional.cols, layer->input.matrix.channels), layer->net.convolutional.count, 0, stream>>> - (layer->w, chw, layer->net.convolutional.rows * layer->net.convolutional.cols, layer->input.matrix.channels, layer->net.convolutional.count); -#define vary_block(_x, _y, _z) do { \ + _cwc_kern_reorder_matrix_major_parted + <<net.convolutional.rows * layer->net.convolutional.cols, layer->input.matrix.channels / out_partition), dim3(layer->net.convolutional.count / out_partition, out_partition), 0, stream>>> + (layer->w, chw, layer->net.convolutional.rows * layer->net.convolutional.cols, layer->input.matrix.channels, layer->net.convolutional.count, layer->input.matrix.channels / out_partition, layer->net.convolutional.count / out_partition, out_partition); +#define vary_block(_x, _y, _z, _s) do { \ dim3 threads_per_block(batch / _x, _z / _y); \ assert(threads_per_block.x * threads_per_block.y <= 1024); \ - dim3 num_blocks(layer->input.matrix.cols * layer->input.matrix.channels / _z, layer->input.matrix.rows); \ + dim3 num_blocks(layer->input.matrix.cols * layer->input.matrix.channels / (_z * out_partition), layer->input.matrix.rows, out_partition); \ int shared_memory_size = sizeof(float) * (batch + _z); \ - cudaFuncSetCacheConfig(_cwc_kern_convolutional_backward_propagate<_x, _y, _z>, cudaFuncCachePreferShared); \ - _cwc_kern_convolutional_backward_propagate \ - <_x, _y, _z> \ + cudaFuncSetCacheConfig(_cwc_kern_convolutional_backward_propagate_error<_x, _y, _z, _s>, cudaFuncCachePreferShared); \ + _cwc_kern_convolutional_backward_propagate_error \ + <_x, _y, _z, _s> \ <<>> \ - (layer->net.convolutional.strides, layer->net.convolutional.border, batch, \ + (layer->net.convolutional.border, batch, \ b, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, \ a, out_rows, out_cols, \ - chw, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count); \ + chw, layer->net.convolutional.rows, layer->net.convolutional.cols, layer->net.convolutional.count / out_partition, out_partition); \ } while (0) - cwc_vary_4_a(x, 1, 2, 4, 8, cwc_vary_5_b, y, 1, 2, 4, 6, 8, cwc_vary_6_c, z, 16, 24, 32, 36, 64, 72, vary_block); + cwc_vary_4_a(x, 1, 2, 4, 8, cwc_vary_5_b, y, 1, 2, 4, 6, 8, cwc_vary_6_c, z, 16, 24, 32, 36, 64, 72, cwc_vary_4_d, layer->net.convolutional.strides, 1, 2, 3, 4, vary_block); #undef vary_block assert(cudaGetLastError() == cudaSuccess); return 0; @@ -1202,34 +1541,27 @@ static void _cwc_convnet_convolutional_backward_propagate_error(ccv_convnet_laye static int vary_x[] = { 1, 2, 4, 8 }; static int vary_y[] = { 1, 2, 4, 6, 8 }; static int vary_z[] = { 16, 24, 32, 36, 64, 72 }; - CWC_IMPLEMENT_VARY_STUB(VARY(layer)->backward_propagate_error, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_error_vary, layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); + CWC_IMPLEMENT_VARY_STUB(VARY(layer)->convolutional.backward.gradient, vary_x, vary_y, vary_z, _cwc_convnet_convolutional_backward_propagate_error_vary, 
layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); } static void _cwc_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, ccv_convnet_layer_t* configuration, float* scratch, float* unit, const cudaStream_t& stream, const cublasHandle_t& handle) { assert(layer->net.convolutional.count % 4 == 0); assert(batch % BATCH_PER_BLOCK == 0); - int out_rows, out_cols, shared_memory_size; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); // it turns out that first apply relu would save us a lot of computation because no need to low both out and out_grad any more - _cwc_kern_convolutional_relu_backward_propagate + _cwc_kern_relu_backward_propagate <<net.convolutional.count), batch, 0, stream>>> (batch, n, a, out_rows, out_cols, layer->net.convolutional.count); assert(cudaGetLastError() == cudaSuccess); - if (_cwc_convnet_layer_use_multi_way(layer)) - _cwc_convnet_convolutional_backward_propagate_coeff_multi_way(layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); + float alpha = 1, beta = 0; + if (_cwc_convnet_layer_use_rows(layer)) + _cwc_convnet_convolutional_backward_propagate_coefficient_rows(layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); else - _cwc_convnet_convolutional_backward_propagate_coeff_default(layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); - dim3 threads_per_block_for_bias(batch / 16, 16); - assert(threads_per_block_for_bias.x * threads_per_block_for_bias.y <= 1024); - dim3 num_blocks_for_bias(layer->net.convolutional.count); - shared_memory_size = sizeof(float) * (1 + batch * 16); - _cwc_kern_convolutional_backward_propagate_bias - <16> - <<>> - (batch, - a, out_rows, out_cols, - configuration->bias, layer->net.convolutional.count); + _cwc_convnet_convolutional_backward_propagate_coefficient_default(layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); + // compute the bias directly using gemv routine + cublasSgemv(handle, CUBLAS_OP_T, out_rows * out_cols * batch, layer->net.convolutional.count, &alpha, a, out_rows * out_cols * batch, unit, 1, &beta, configuration->bias, 1); assert(cudaGetLastError() == cudaSuccess); if (b) _cwc_convnet_convolutional_backward_propagate_error(layer, batch, a, n, m, b, configuration, scratch, unit, stream, handle); @@ -1237,65 +1569,57 @@ static void _cwc_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* l template __global__ static void _cwc_kern_rnorm_backward_propagate(const int batch, - float* input, float* input_grad, const int rows, const int cols, const int channels, + float* input, float* input_grad, const int rows, const int cols, const int channels_per_partition, const int partition, float* out, float* out_grad, float* denoms, const float kappa, const float alpha, const float beta) { assert(gridDim.x == cols); assert(gridDim.y == rows); + assert(gridDim.z == partition); extern __shared__ float shared[]; float* shared_out_grad = &shared[0]; float* shared_out = &shared[batch * size]; float* shared_denoms = &shared[batch * size * 2]; float* shared_input = &shared[batch * size * 3]; const int way = size / 2; - const int thcnt = blockDim.x; + assert(blockDim.x == batch); const int thidx = threadIdx.x; - const int input_loads = (batch + thcnt - 1) / thcnt; int i, j, c; 
float prod[input_per_thread]; const int incnt = rows * cols * batch; - out += (blockIdx.y * cols + blockIdx.x) * batch; - out_grad += (blockIdx.y * cols + blockIdx.x) * batch; - denoms += (blockIdx.y * cols + blockIdx.x) * batch; - input += (blockIdx.y * cols + blockIdx.x) * batch; - input_grad += (blockIdx.y * cols + blockIdx.x) * batch; - const int end_way = min(way, channels - 1); + out += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + out_grad += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + denoms += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + input += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + input_grad += (blockIdx.z * channels_per_partition * rows * cols + blockIdx.y * cols + blockIdx.x) * batch; + const int end_way = min(way, channels_per_partition - 1); for (c = 0; c < end_way; c++) { - #pragma unroll - for (i = 0; i < input_loads; i++) - if (i * thcnt + thidx < batch) - shared_out_grad[c * batch + i * thcnt + thidx] = out_grad[i * thcnt + thidx], - shared_out[c * batch + i * thcnt + thidx] = out[i * thcnt + thidx], - shared_denoms[c * batch + i * thcnt + thidx] = denoms[i * thcnt + thidx]; + shared_out_grad[c * batch + thidx] = out_grad[thidx], + shared_out[c * batch + thidx] = out[thidx], + shared_denoms[c * batch + thidx] = denoms[thidx]; out_grad += incnt; out += incnt; denoms += incnt; } - for (c = 0; c < channels; c++) + for (c = 0; c < channels_per_partition; c++) { const int start_way = max(c - way, 0); - const int end_way = min(c + way, channels - 1); - if (c + way < channels) + const int end_way = min(c + way, channels_per_partition - 1); + if (c + way < channels_per_partition) { - #pragma unroll - for (i = 0; i < input_loads; i++) - if (i * thcnt + thidx < batch) - shared_out_grad[(end_way % size) * batch + i * thcnt + thidx] = out_grad[i * thcnt + thidx], - shared_out[(end_way % size) * batch + i * thcnt + thidx] = out[i * thcnt + thidx], - shared_denoms[(end_way % size) * batch + i * thcnt + thidx] = denoms[i * thcnt + thidx]; + shared_out_grad[(end_way % size) * batch + thidx] = out_grad[thidx], + shared_out[(end_way % size) * batch + thidx] = out[thidx], + shared_denoms[(end_way % size) * batch + thidx] = denoms[thidx]; out_grad += incnt; out += incnt; denoms += incnt; } - for (i = 0; i < input_loads; i++) - if (i * thcnt + thidx < batch) - shared_input[i * thcnt + thidx] = input[i * thcnt + thidx], + shared_input[thidx] = input[thidx]; __syncthreads(); #pragma unroll for (i = 0; i < input_per_thread; i++) prod[i] = 0; - #pragma unroll + #pragma unroll 5 for (i = start_way; i <= end_way; i++) #pragma unroll for (j = 0; j < input_per_thread; j++) @@ -1311,28 +1635,20 @@ __global__ static void _cwc_kern_rnorm_backward_propagate(const int batch, static void _cwc_convnet_rnorm_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* denoms, float* b, const cudaStream_t& stream) { - dim3 num_blocks(layer->input.matrix.cols, layer->input.matrix.rows); + dim3 num_blocks(layer->input.matrix.cols, layer->input.matrix.rows, layer->input.matrix.partition); dim3 threads_per_block(batch); assert(threads_per_block.x <= 1024); int shared_memory_size = sizeof(float) * batch * (layer->net.rnorm.size * 3 + 1); - if (layer->net.rnorm.size == 3) - { - cudaFuncSetCacheConfig(_cwc_kern_rnorm_backward_propagate<1, 3>, cudaFuncCachePreferShared); - 
_cwc_kern_rnorm_backward_propagate - <1, 3> - <<>> - (batch, - m, b, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, - n, a, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); - } else if (layer->net.rnorm.size == 5) { - cudaFuncSetCacheConfig(_cwc_kern_rnorm_backward_propagate<1, 5>, cudaFuncCachePreferShared); - _cwc_kern_rnorm_backward_propagate - <1, 5> - <<>> - (batch, - m, b, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels, - n, a, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); - } +#define vary_block(_, _x) \ + cudaFuncSetCacheConfig(_cwc_kern_rnorm_backward_propagate<1, _x>, cudaFuncCachePreferShared); \ + _cwc_kern_rnorm_backward_propagate \ + <1, _x> \ + <<>> \ + (batch, \ + m, b, layer->input.matrix.rows, layer->input.matrix.cols, layer->input.matrix.channels / layer->input.matrix.partition, layer->input.matrix.partition, \ + n, a, denoms, layer->net.rnorm.kappa, layer->net.rnorm.alpha, layer->net.rnorm.beta); + cwc_vary_2_a(layer->net.rnorm.size, 3, 5, vary_block); +#undef vary_block } template @@ -1347,9 +1663,8 @@ __global__ static void _cwc_kern_max_pool_backward_propagate(const int strides, float* shared_input = &shared[0]; float* shared_out = &shared[batch]; float* shared_grad = &shared[batch * 2]; - const int thcnt = blockDim.x; + assert(blockDim.x == batch); const int thidx = threadIdx.x; - assert(thcnt >= batch); float prod[input_per_thread]; int i, x, y; #pragma unroll @@ -1372,9 +1687,8 @@ __global__ static void _cwc_kern_max_pool_backward_propagate(const int strides, { for (x = out_start_x; x < out_end_x; x++) { - if (thidx < batch) - shared_out[thidx] = out[x * batch + thidx], - shared_grad[thidx] = out_grad[x * batch + thidx]; + shared_out[thidx] = out[x * batch + thidx], + shared_grad[thidx] = out_grad[x * batch + thidx]; __syncthreads(); #pragma unroll for (i = 0; i < input_per_thread; i++) @@ -1395,8 +1709,8 @@ __global__ static void _cwc_kern_max_pool_backward_propagate(const int strides, static void _cwc_convnet_max_pool_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, const cudaStream_t& stream) { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); dim3 num_blocks(layer->input.matrix.cols, layer->input.matrix.rows, layer->input.matrix.channels); dim3 threads_per_block(batch); int shared_memory_size = sizeof(float) * batch * 3; @@ -1458,8 +1772,8 @@ __global__ static void _cwc_kern_average_pool_backward_propagate(const int strid static void _cwc_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, const cudaStream_t& stream) { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); dim3 num_blocks(layer->input.matrix.cols, layer->input.matrix.rows, layer->input.matrix.channels); dim3 threads_per_block(batch); assert(threads_per_block.x <= 1024); @@ -1472,12 +1786,17 @@ static void _cwc_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* la a, out_rows, out_cols); } -static void 
_cwc_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* m, float* b, float* batch_unit, ccv_convnet_layer_t* configuration, const cublasHandle_t& handle) +static void _cwc_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, float* batch_unit, ccv_convnet_layer_t* configuration, const cudaStream_t& stream, const cublasHandle_t& handle) { - int rows, out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int rows, out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); out_cols = batch; rows = layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels; + // apply relu for full connect layer, not that this requires both n and a, and for the last full connect layer, we re-used the forwards, thus, it required the last full connect layer to not have relu enabled + if (layer->net.full_connect.relu) + _cwc_kern_relu_backward_propagate + <<>> + (batch, n, a, out_rows, 1, 1); float alpha = 1; float beta = 0; // propagate bias @@ -1488,25 +1807,23 @@ static void _cwc_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* la cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, rows, out_rows, batch, &alpha, m, batch, a, batch, &beta, configuration->w, rows); } -template -__global__ static void _cwc_kern_convnet_softmax_with_logistic_loss(const int batch, const int count, float* a, int* c) +__global__ static void _cwc_kern_softmax_with_logistic_loss(const int batch, const int count, float* a, int* c) { int i; - extern float shared[]; - const int thidx = threadIdx.x; + const int thidx = blockIdx.x * blockDim.x + threadIdx.x; float max_val = a[thidx]; for (i = 1; i < count; i++) { - shared[thidx] = a[i * batch + thidx]; - if (shared[thidx] > max_val) - max_val = shared[thidx]; + float prod = a[i * batch + thidx]; + if (prod > max_val) + max_val = prod; } float val = 0; for (i = 0; i < count; i++) { - shared[thidx] = a[i * batch + thidx]; - val += (shared[thidx] = expf(shared[thidx] - max_val)); - a[i * batch + thidx] = shared[thidx]; + float prod = a[i * batch + thidx]; + val += (prod = expf(prod - max_val)); + a[i * batch + thidx] = prod; } val = 1.0 / val; for (i = 0; i < count; i++) @@ -1515,21 +1832,19 @@ __global__ static void _cwc_kern_convnet_softmax_with_logistic_loss(const int ba static void _cwc_convnet_softmax_with_logistic_loss(int batch, int count, float* a, int* c, const cudaStream_t& stream) { - dim3 num_blocks(1); - dim3 threads_per_block(batch); + dim3 num_blocks(ccv_max(1, batch / 64)); + dim3 threads_per_block(ccv_min(batch, 64)); assert(threads_per_block.x <= 1024); int shared_memory_size = sizeof(float) * batch; - _cwc_kern_convnet_softmax_with_logistic_loss - <1> + _cwc_kern_softmax_with_logistic_loss <<>> (batch, count, a, c); } -template -__global__ static void _cwc_kern_convnet_tests_return(const int batch, const int count, float* a, int* c) +__global__ static void _cwc_kern_tests_return(const int batch, const int count, float* a, int* c) { int i; - const int thidx = threadIdx.x; + const int thidx = blockIdx.x * blockDim.x + threadIdx.x; float max_val = a[thidx]; int max_idx = 0; for (i = 1; i < count; i++) @@ -1543,11 +1858,10 @@ __global__ static void _cwc_kern_convnet_tests_return(const int batch, const int static void _cwc_convnet_tests_return(int batch, int count, float* a, int* c, const 
cudaStream_t& stream) { - dim3 num_blocks(1); - dim3 threads_per_block(batch); + dim3 num_blocks(ccv_max(1, batch / 64)); + dim3 threads_per_block(ccv_min(batch, 64)); assert(threads_per_block.x <= 1024); - _cwc_kern_convnet_tests_return - <1> + _cwc_kern_tests_return <<>> (batch, count, a, c); } @@ -1555,7 +1869,7 @@ static void _cwc_convnet_tests_return(int batch, int count, float* a, int* c, co template __global__ static void _cwc_kern_net_sgd(float* a, float* grad, float* momentum, const int count, - const float learn_rate, const float momentum_rate, const float decay) + const float learn_rate, const float momentum_rate, const float decay_and_learn) { if (blockIdx.x * blockDim.x + threadIdx.x < count) { @@ -1564,83 +1878,85 @@ __global__ static void _cwc_kern_net_sgd(float* a, float* grad, float* momentum, momentum += blockIdx.x * blockDim.x; const int thidx = threadIdx.x; float old_a = a[thidx]; - float velocity = (momentum_read ? momentum_rate * momentum[thidx] : 0) - decay * learn_rate * old_a + learn_rate * grad[thidx]; + float velocity = (momentum_read ? momentum_rate * momentum[thidx] : 0) - decay_and_learn * old_a + learn_rate * grad[thidx]; a[thidx] = velocity + old_a; momentum[thidx] = velocity; } } -static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int momentum_read, int batch, ccv_convnet_layer_train_param_t* layer_params, cwc_convnet_context_t* context) +static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int device_id, int momentum_read, int batch, ccv_convnet_layer_train_param_t* layer_params, cwc_convnet_context_t* context) { - int i, out_rows, out_cols; - dim3 threads_per_block(128); + int i, out_rows, out_cols, out_partition; + dim3 threads_per_block(64); assert(threads_per_block.x <= 1024); dim3 num_blocks_for_coeff; dim3 num_blocks_for_bias; for (i = 0; i < convnet->count; i++) { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; - ccv_convnet_layer_t* configuration = GPU(convnet)->configurations + i; - ccv_convnet_layer_t* momentum = GPU(convnet)->momentums + i; + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + ccv_convnet_layer_t* configuration = GPU(convnet)->device[device_id].configurations + i; + ccv_convnet_layer_t* momentum = GPU(convnet)->device[device_id].momentums + i; switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); - num_blocks_for_coeff = (layer->net.convolutional.rows * layer->net.convolutional.cols * layer->net.convolutional.count * layer->net.convolutional.channels + 127) / 128; - num_blocks_for_bias = (layer->net.convolutional.count + 127) / 128; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); + num_blocks_for_coeff = (layer->wnum + 63) / 64; + num_blocks_for_bias = (layer->net.convolutional.count + 63) / 64; if (momentum_read) { _cwc_kern_net_sgd <1> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->w, configuration->w, momentum->w, layer->wnum, - layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay); + layer_params[i].w.learn_rate / batch, layer_params[i].w.momentum, layer_params[i].w.decay * layer_params[i].w.learn_rate); _cwc_kern_net_sgd <1> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->bias, configuration->bias, momentum->bias, layer->net.convolutional.count, - layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay); + 
layer_params[i].bias.learn_rate / batch, layer_params[i].bias.momentum, layer_params[i].bias.decay * layer_params[i].bias.learn_rate); } else { _cwc_kern_net_sgd <0> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->w, configuration->w, momentum->w, layer->wnum, - layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay); + layer_params[i].w.learn_rate / batch, layer_params[i].w.momentum, layer_params[i].w.decay * layer_params[i].w.learn_rate); _cwc_kern_net_sgd <0> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->bias, configuration->bias, momentum->bias, layer->net.convolutional.count, - layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay); + layer_params[i].bias.learn_rate / batch, layer_params[i].bias.momentum, layer_params[i].bias.decay * layer_params[i].bias.learn_rate); } break; case CCV_CONVNET_FULL_CONNECT: // assume coeff and bias in the same continuous memory region - num_blocks_for_coeff = (layer->wnum + layer->net.full_connect.count + 127) / 128; + num_blocks_for_coeff = (layer->wnum + 63) / 64; + num_blocks_for_bias = (layer->net.full_connect.count + 63) / 64; if (momentum_read) { _cwc_kern_net_sgd <1> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->w, configuration->w, momentum->w, layer->wnum, - layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay); + layer_params[i].w.learn_rate / batch, layer_params[i].w.momentum, layer_params[i].w.decay * layer_params[i].w.learn_rate); _cwc_kern_net_sgd <1> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->bias, configuration->bias, momentum->bias, layer->net.full_connect.count, - layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay); + layer_params[i].bias.learn_rate / batch, layer_params[i].bias.momentum, layer_params[i].bias.decay * layer_params[i].bias.learn_rate); } else { _cwc_kern_net_sgd <0> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->w, configuration->w, momentum->w, layer->wnum, - layer_params[i].w.learn_rate, layer_params[i].w.momentum, layer_params[i].w.decay); + layer_params[i].w.learn_rate / batch, layer_params[i].w.momentum, layer_params[i].w.decay * layer_params[i].w.learn_rate); _cwc_kern_net_sgd <0> - <<device.stream>>> + <<device[device_id].data_stream>>> (layer->bias, configuration->bias, momentum->bias, layer->net.full_connect.count, - layer_params[i].bias.learn_rate, layer_params[i].bias.momentum, layer_params[i].bias.decay); + layer_params[i].bias.learn_rate / batch, layer_params[i].bias.momentum, layer_params[i].bias.decay * layer_params[i].bias.learn_rate); } break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: case CCV_CONVNET_MAX_POOL: case CCV_CONVNET_AVERAGE_POOL: break; @@ -1648,74 +1964,218 @@ static void _cwc_convnet_net_sgd(ccv_convnet_t* convnet, int momentum_read, int } } -static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds, int* idx, ccv_size_t dim, int rows, int cols, int channels, int batch, int offset, int size, float* b, int* c) +static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_dense_matrix_t* eigenvectors, ccv_dense_matrix_t* eigenvalues, float color_gain, int* idx, ccv_size_t dim, int rows, int cols, int channels, int category_count, int symmetric, int batch, int offset, int size, float* b, int* c) { int i, k, x; assert(size <= batch); + float* channel_gains = 
(float*)alloca(sizeof(float) * channels); + memset(channel_gains, 0, sizeof(float) * channels); for (i = 0; i < size; i++) { ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, idx ? idx[offset + i] : offset + i); + assert(categorized->c < category_count && categorized->c >= 0); // now only accept classes listed if (c) c[i] = categorized->c; + ccv_dense_matrix_t* image; switch (categorized->type) { case CCV_CATEGORIZED_DENSE_MATRIX: - assert(rows == categorized->matrix->rows && cols == categorized->matrix->cols && channels == CCV_GET_CHANNEL(categorized->matrix->type)); - for (k = 0; k < channels; k++) - for (x = 0; x < rows * cols; x++) - b[(k * rows * cols + x) * batch + i] = categorized->matrix->data.f32[x * channels + k]; + image = categorized->matrix; break; case CCV_CATEGORIZED_FILE: - { - ccv_dense_matrix_t* image = 0; + image = 0; ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); - if (image) + if (!image) { - ccv_dense_matrix_t* norm = 0; - if (image->rows > dim.height && image->cols > dim.width) - ccv_resample(image, &norm, 0, ccv_max(dim.height, (int)(image->rows * (float)dim.height / image->cols + 0.5)), ccv_max(dim.width, (int)(image->cols * (float)dim.width / image->rows + 0.5)), CCV_INTER_AREA); - else if (image->rows < dim.height || image->cols < dim.width) - ccv_resample(image, &norm, 0, ccv_max(dim.height, (int)(image->rows * (float)dim.height / image->cols + 0.5)), ccv_max(dim.width, (int)(image->cols * (float)dim.width / image->rows + 0.5)), CCV_INTER_CUBIC); - else - norm = image; - if (norm != image) - { - printf("%s is not properly formatted at %dx%d, and ccv resampled it to required size. But you may want to preprocess it to save time.\n", categorized->file.filename, image->rows, image->cols); - ccv_matrix_free(image); - } - ccv_dense_matrix_t* patch = 0; - if (norm->cols != cols || norm->rows != rows) - { - int x = gsl_rng_uniform_int(rng, norm->cols - cols + 1); - int y = gsl_rng_uniform_int(rng, norm->rows - rows + 1); - ccv_slice(norm, (ccv_matrix_t**)&patch, CCV_32F, y, x, rows, cols); - } else - patch = norm; - // random horizontal reflection - if (gsl_rng_uniform_int(rng, 2) == 0) - ccv_flip(patch, &patch, 0, CCV_FLIP_X); - if (norm != patch) - ccv_matrix_free(norm); - assert(channels == CCV_GET_CHANNEL(patch->type)); - for (k = 0; k < channels; k++) - for (x = 0; x < rows * cols; x++) - b[(k * rows * cols + x) * batch + i] = patch->data.f32[x * channels + k] - 128.0; - ccv_matrix_free(patch); - } else printf("cannot load %s.\n", categorized->file.filename); + continue; + } break; + } + assert(image->rows == dim.height || image->cols == dim.width); + ccv_dense_matrix_t* input = 0; + if (image->cols != dim.width || image->rows != dim.height) + { + int x = rng ? gsl_rng_uniform_int(rng, image->cols - dim.width + 1) : (image->cols - dim.width + 1) / 2; + int y = rng ? 
gsl_rng_uniform_int(rng, image->rows - dim.height + 1) : (image->rows - dim.height + 1) / 2; + assert(x == 0 || y == 0); + ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, y, x, dim.height, dim.width); + } else + ccv_shift(image, (ccv_matrix_t**)&input, CCV_32F, 0, 0); // converting to 32f + // we loaded it in, deallocate it now + if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX) + ccv_matrix_free(image); + // random horizontal reflection + if (symmetric && rng && gsl_rng_uniform_int(rng, 2) == 0) + ccv_flip(input, &input, 0, CCV_FLIP_X); + ccv_subtract(input, mean_activity, (ccv_matrix_t**)&input, 0); + ccv_dense_matrix_t* patch = 0; + if (input->cols != cols || input->rows != rows) + { + int x = rng ? gsl_rng_uniform_int(rng, input->cols - cols + 1) : (input->cols - cols + 1) / 2; + int y = rng ? gsl_rng_uniform_int(rng, input->rows - rows + 1) : (input->rows - rows + 1) / 2; + ccv_slice(input, (ccv_matrix_t**)&patch, CCV_32F, y, x, rows, cols); + ccv_matrix_free(input); + } else + patch = input; + assert(channels == CCV_GET_CHANNEL(patch->type)); + if (color_gain > 0 && rng && eigenvectors && eigenvalues) + { + assert(channels == 3); // only support RGB color gain + memset(channel_gains, 0, sizeof(float) * channels); + for (k = 0; k < channels; k++) + { + float alpha = gsl_ran_gaussian(rng, color_gain) * eigenvalues->data.f64[k]; + for (x = 0; x < channels; x++) + channel_gains[x] += eigenvectors->data.f64[k * channels + x] * alpha; } } + for (k = 0; k < channels; k++) + for (x = 0; x < rows * cols; x++) + b[(k * rows * cols + x) * batch + i] = patch->data.f32[x * channels + k] + channel_gains[k]; + ccv_matrix_free(patch); + } +} + +static void _cwc_convnet_mean_formation(ccv_array_t* categorizeds, ccv_size_t dim, int channels, int symmetric, ccv_dense_matrix_t** b) +{ + int i, count = 0; + ccv_dense_matrix_t* c = ccv_dense_matrix_new(dim.height, dim.width, channels | CCV_64F, 0, 0); + ccv_zero(c); + ccv_dense_matrix_t* db = *b = ccv_dense_matrix_renew(*b, dim.height, dim.width, channels | CCV_32F, channels | CCV_32F, 0); + for (i = 0; i < categorizeds->rnum; i++) + { + if (i % 23 == 0 || i == categorizeds->rnum - 1) + FLUSH(" - compute mean activity %d / %d", i + 1, categorizeds->rnum); + ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, i); + ccv_dense_matrix_t* image; + switch (categorized->type) + { + case CCV_CATEGORIZED_DENSE_MATRIX: + image = categorized->matrix; + break; + case CCV_CATEGORIZED_FILE: + image = 0; + ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); + if (!image) + { + printf("cannot load %s.\n", categorized->file.filename); + continue; + } + break; + } + ccv_dense_matrix_t* patch = 0; + if (image->cols != dim.width || image->rows != dim.height) + { + int x = (image->cols - dim.width + 1) / 2; + int y = (image->rows - dim.height + 1) / 2; + assert(x == 0 || y == 0); + ccv_slice(image, (ccv_matrix_t**)&patch, CCV_32F, y, x, dim.height, dim.width); + } else + ccv_shift(image, (ccv_matrix_t**)&patch, CCV_32F, 0, 0); // converting to 32f + if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX) + ccv_matrix_free(image); + ccv_add(patch, c, (ccv_matrix_t**)&c, CCV_64F); + ++count; + ccv_matrix_free(patch); + } + if (symmetric) + { + int j, k; + double p = 0.5 / count; + double* cptr = c->data.f64; + float* dbptr = db->data.f32; + for (i = 0; i < db->rows; i++) + { + for (j = 0; j < db->cols; j++) + for (k = 0; k < channels; k++) + dbptr[j * channels + k] = p * (cptr[j * channels + k] + cptr[(c->cols - j 
- 1) * channels + k]); + dbptr += db->cols * channels; + cptr += c->cols * channels; + } + } else { + double p = 1.0 / count; + for (i = 0; i < dim.height * dim.width * channels; i++) + db->data.f32[i] = p * c->data.f64[i]; + } + ccv_matrix_free(c); + printf("\n"); +} + +static void _cwc_convnet_channel_eigen(ccv_array_t* categorizeds, ccv_dense_matrix_t* mean_activity, ccv_size_t dim, int channels, ccv_dense_matrix_t** eigenvectors, ccv_dense_matrix_t** eigenvalues) +{ + assert(channels == 3); // this function cannot handle anything other than 3x3 covariance matrix + double* mean_value = (double*)alloca(sizeof(double) * channels); + memset(mean_value, 0, sizeof(double) * channels); + assert(CCV_GET_CHANNEL(mean_activity->type) == channels); + assert(mean_activity->rows == dim.height); + assert(mean_activity->cols == dim.width); + int i, j, k, c, count = 0; + for (i = 0; i < dim.height * dim.width; i++) + for (k = 0; k < channels; k++) + mean_value[k] += mean_activity->data.f32[i * channels + k]; + for (i = 0; i < channels; i++) + mean_value[i] = mean_value[i] / (dim.height * dim.width); + double* covariance = (double*)alloca(sizeof(double) * channels * channels); + memset(covariance, 0, sizeof(double) * channels * channels); + for (c = 0; c < categorizeds->rnum; c++) + { + if (c % 23 == 0 || c == categorizeds->rnum - 1) + FLUSH(" - compute covariance matrix for data augmentation (color gain) %d / %d", c + 1, categorizeds->rnum); + ccv_categorized_t* categorized = (ccv_categorized_t*)ccv_array_get(categorizeds, c); + ccv_dense_matrix_t* image; + switch (categorized->type) + { + case CCV_CATEGORIZED_DENSE_MATRIX: + image = categorized->matrix; + break; + case CCV_CATEGORIZED_FILE: + image = 0; + ccv_read(categorized->file.filename, &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR); + if (!image) + { + printf("cannot load %s.\n", categorized->file.filename); + continue; + } + break; + } + ccv_dense_matrix_t* patch = 0; + if (image->cols != dim.width || image->rows != dim.height) + { + int x = (image->cols - dim.width + 1) / 2; + int y = (image->rows - dim.height + 1) / 2; + assert(x == 0 || y == 0); + ccv_slice(image, (ccv_matrix_t**)&patch, CCV_32F, y, x, dim.height, dim.width); + } else + ccv_shift(image, (ccv_matrix_t**)&patch, CCV_32F, 0, 0); // converting to 32f + if (categorized->type != CCV_CATEGORIZED_DENSE_MATRIX) + ccv_matrix_free(image); + for (i = 0; i < dim.width * dim.height; i++) + for (j = 0; j < channels; j++) + for (k = j; k < channels; k++) + covariance[j * channels + k] += (patch->data.f32[i * channels + j] - mean_value[j]) * (patch->data.f32[i * channels + k] - mean_value[k]); + ++count; + ccv_matrix_free(patch); } + for (i = 0; i < channels; i++) + for (j = 0; j < i; j++) + covariance[i * channels + j] = covariance[j * channels + i]; + double p = 1.0 / ((double)count * dim.height * dim.width); + for (i = 0; i < channels; i++) + for (j = 0; j < channels; j++) + covariance[i * channels + j] *= p; // scale down + ccv_dense_matrix_t covm = ccv_dense_matrix(3, 3, CCV_64F | CCV_C1, covariance, 0); + ccv_eigen(&covm, eigenvectors, eigenvalues, CCV_64F, 1e-8); + printf("\n"); } -static void _cwc_convnet_dor_mean_net(ccv_convnet_t* convnet, ccv_convnet_layer_train_param_t* layer_params, const cublasHandle_t& handle) +static void _cwc_convnet_dor_mean_net(ccv_convnet_t* convnet, int device_id, ccv_convnet_layer_train_param_t* layer_params, const cublasHandle_t& handle) { int i; for (i = 0; i < convnet->count; i++) if (layer_params[i].dor > 0) { - ccv_convnet_layer_t* layer = 
GPU(convnet)->layers + i; + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; float dor = 1.0 - layer_params[i].dor; cublasSscal(handle, layer->wnum, &dor, layer->w, 1); assert(layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT); @@ -1731,13 +2191,13 @@ static void _cwc_convnet_dor_mean_net(ccv_convnet_t* convnet, ccv_convnet_layer_ } } -static void _cwc_convnet_dor_mean_net_undo(ccv_convnet_t* convnet, ccv_convnet_layer_train_param_t* layer_params, const cublasHandle_t& handle) +static void _cwc_convnet_dor_mean_net_undo(ccv_convnet_t* convnet, int device_id, ccv_convnet_layer_train_param_t* layer_params, const cublasHandle_t& handle) { int i; for (i = 0; i < convnet->count; i++) if (layer_params[i].dor > 0) { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; float inv_dor = 1.0 / (1.0 - layer_params[i].dor); cublasSscal(handle, layer->wnum, &inv_dor, layer->w, 1); assert(layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT); @@ -1753,66 +2213,66 @@ static void _cwc_convnet_dor_mean_net_undo(ccv_convnet_t* convnet, ccv_convnet_l } } -static void _cwc_convnet_dor_formation(ccv_convnet_t* convnet, int batch, gsl_rng* rng, ccv_convnet_layer_train_param_t* layer_params, cwc_convnet_context_t* context) +static void _cwc_convnet_dor_formation(ccv_convnet_t* convnet, int device_id, int batch, gsl_rng* rng, ccv_convnet_layer_train_param_t* layer_params, cwc_convnet_context_t* context) { int i, j; for (i = 0; i < convnet->count; i++) - if (context->host.dor[i]) + if (context->host[device_id].dor[i]) { - assert(context->device.dor[i]); + assert(context->device[device_id].dor[i]); assert(layer_params[i].dor > 0); - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); assert(layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT); int count = layer->type == CCV_CONVNET_FULL_CONNECT ? layer->net.full_connect.count : out_rows * out_cols * layer->net.convolutional.count; for (j = 0; j < batch * count; j++) - context->host.dor[i][j] = (gsl_rng_uniform(rng) >= layer_params[i].dor) ? 1.0 : 0.0; - cudaMemcpyAsync(context->device.dor[i], context->host.dor[i], sizeof(float) * count * batch, cudaMemcpyHostToDevice, context->device.stream); + context->host[device_id].dor[i][j] = (gsl_rng_uniform(rng) >= layer_params[i].dor) ? 
1.0 : 0.0; + cudaMemcpyAsync(context->device[device_id].dor[i], context->host[device_id].dor[i], sizeof(float) * count * batch, cudaMemcpyHostToDevice, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); } } -static void _cwc_convnet_backwards_propagate_error(ccv_convnet_t* convnet, float* a, float* m, int batch, cwc_convnet_context_t* context) +static void _cwc_convnet_backwards_propagate_error(ccv_convnet_t* convnet, int device_id, float* a, float* m, int batch, cwc_convnet_context_t* context) { assert(batch % 16 == 0); int i; for (i = convnet->count - 1; i >= 0; i--) { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; - ccv_convnet_layer_t* configuration = GPU(convnet)->configurations + i; + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; + ccv_convnet_layer_t* configuration = GPU(convnet)->device[device_id].configurations + i; switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - if (context->device.dor[i]) + if (context->device[device_id].dor[i]) { - int out_rows, out_cols; - _cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols); + int out_rows, out_cols, out_partition; + _ccv_convnet_layer_derive_output(layer, layer->input.matrix.rows, layer->input.matrix.cols, &out_rows, &out_cols, &out_partition); _cwc_kern_mute_neuron - <<net.convolutional.count, batch, 0, context->device.stream>>> - (i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], context->device.dor[i]); + <<net.convolutional.count, batch, 0, context->device[device_id].data_stream>>> + (i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], context->device[device_id].dor[i]); } - _cwc_convnet_convolutional_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], GPU(convnet)->forwards[i], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->backwards[i], configuration, GPU(convnet)->scratch, GPU(convnet)->unit, context->device.stream, context->device.cublas); + _cwc_convnet_convolutional_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : m, GPU(convnet)->device[device_id].backwards[i], configuration, GPU(convnet)->device[device_id].scratch, GPU(convnet)->device[device_id].unit, context->device[device_id].data_stream, context->device[device_id].data_cublas); assert(cudaGetLastError() == cudaSuccess); break; case CCV_CONVNET_FULL_CONNECT: - if (context->device.dor[i]) + if (context->device[device_id].dor[i]) _cwc_kern_mute_neuron - <<net.full_connect.count, batch, 0, context->device.stream>>> - (i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], context->device.dor[i]); - _cwc_convnet_full_connect_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->backwards[i], GPU(convnet)->unit, configuration, context->device.cublas); + <<net.full_connect.count, batch, 0, context->device[device_id].data_stream>>> + (i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], context->device[device_id].dor[i]); + _cwc_convnet_full_connect_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? 
GPU(convnet)->device[device_id].forwards[i - 1] : m, GPU(convnet)->device[device_id].backwards[i], GPU(convnet)->device[device_id].unit, configuration, context->device[device_id].data_stream, context->device[device_id].data_cublas); assert(cudaGetLastError() == cudaSuccess); break; case CCV_CONVNET_LOCAL_RESPONSE_NORM: - _cwc_convnet_rnorm_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], GPU(convnet)->forwards[i], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->denoms[i], GPU(convnet)->backwards[i], context->device.stream); + _cwc_convnet_rnorm_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : m, GPU(convnet)->device[device_id].denoms[i], GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); break; case CCV_CONVNET_MAX_POOL: - _cwc_convnet_max_pool_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], GPU(convnet)->forwards[i], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->backwards[i], context->device.stream); + _cwc_convnet_max_pool_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].forwards[i], i > 0 ? GPU(convnet)->device[device_id].forwards[i - 1] : m, GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); break; case CCV_CONVNET_AVERAGE_POOL: - _cwc_convnet_average_pool_backward_propagate(layer, batch, i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], GPU(convnet)->backwards[i], context->device.stream); + _cwc_convnet_average_pool_backward_propagate(layer, batch, i == convnet->count - 1 ? 
a : GPU(convnet)->device[device_id].backwards[i + 1], GPU(convnet)->device[device_id].backwards[i], context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); break; } @@ -1824,6 +2284,9 @@ typedef struct { int inum; int* idx; ccv_convnet_t* convnet; + // these are eigenvectors / values for color covariance matrix + ccv_dense_matrix_t* eigenvectors; + ccv_dense_matrix_t* eigenvalues; ccv_function_state_reserve_field } cwc_convnet_supervised_train_function_state_t; @@ -1832,35 +2295,42 @@ static void _cwc_convnet_supervised_train_function_state_read(const char* filena ccv_convnet_t* convnet = ccv_convnet_read(1, filename); if (!convnet) return; - int i; - for (i = 0; i < convnet->count; i++) - { - ccv_convnet_layer_t* layer = GPU(z->convnet)->layers + i; - ccv_convnet_layer_t* z_layer = z->convnet->layers + i; - ccv_convnet_layer_t* host_layer = convnet->layers + i; - switch (layer->type) + int i, j; + for (i = 0; i < GPU(z->convnet)->dual_device + 1; i++) + for (j = 0; j < convnet->count; j++) { - case CCV_CONVNET_CONVOLUTIONAL: - _cwc_convnet_reorder_convolutional_weights_onto_device(host_layer->w, layer->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels); - cudaMemcpy(layer->bias, host_layer->bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyHostToDevice); - memcpy(z_layer->w, host_layer->w, sizeof(float) * (layer->wnum + layer->net.convolutional.count)); - assert(cudaGetLastError() == cudaSuccess); - break; - case CCV_CONVNET_FULL_CONNECT: - _cwc_convnet_reorder_full_connect_weights_onto_device(host_layer->w, layer->w, layer->wnum, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels); - cudaMemcpy(layer->bias, host_layer->bias, sizeof(float) * layer->net.full_connect.count, cudaMemcpyHostToDevice); - memcpy(z_layer->w, host_layer->w, sizeof(float) * (layer->wnum + layer->net.full_connect.count)); - assert(cudaGetLastError() == cudaSuccess); - break; + ccv_convnet_layer_t* layer = GPU(z->convnet)->device[i].layers + j; + ccv_convnet_layer_t* z_layer = z->convnet->layers + j; + ccv_convnet_layer_t* host_layer = convnet->layers + j; + switch (layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + _cwc_convnet_reorder_convolutional_weights_onto_device(host_layer->w, layer->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels, layer->input.matrix.partition); + cudaMemcpy(layer->bias, host_layer->bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyHostToDevice); + memcpy(z_layer->w, host_layer->w, sizeof(float) * (layer->wnum + layer->net.convolutional.count)); + assert(cudaGetLastError() == cudaSuccess); + break; + case CCV_CONVNET_FULL_CONNECT: + _cwc_convnet_reorder_full_connect_weights_onto_device(host_layer->w, layer->w, layer->wnum, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels); + cudaMemcpy(layer->bias, host_layer->bias, sizeof(float) * layer->net.full_connect.count, cudaMemcpyHostToDevice); + memcpy(z_layer->w, host_layer->w, sizeof(float) * (layer->wnum + layer->net.full_connect.count)); + assert(cudaGetLastError() == cudaSuccess); + break; + } } - } + assert(convnet->input.height == z->convnet->input.height); + assert(convnet->input.width == z->convnet->input.width); + assert(convnet->rows == z->convnet->rows); + assert(convnet->cols == z->convnet->cols); + assert(convnet->channels == z->convnet->channels); + memcpy(z->convnet->mean_activity->data.f32, convnet->mean_activity->data.f32, sizeof(float) * 
z->convnet->input.height * z->convnet->input.width * z->convnet->channels); ccv_convnet_free(convnet); sqlite3* db = 0; if (SQLITE_OK == sqlite3_open(filename, &db)) { z->line_no = 0; const char function_state_qs[] = - "SELECT t, i, inum, line_no, idx FROM function_state WHERE fsid = 0;"; + "SELECT t, i, inum, line_no, idx, eigenvectors, eigenvalues FROM function_state WHERE fsid = 0;"; sqlite3_stmt* function_state_stmt = 0; if (SQLITE_OK == sqlite3_prepare_v2(db, function_state_qs, sizeof(function_state_qs), &function_state_stmt, 0)) { @@ -1873,6 +2343,18 @@ static void _cwc_convnet_supervised_train_function_state_read(const char* filena z->line_no = sqlite3_column_int(function_state_stmt, 3); const void* idx = sqlite3_column_blob(function_state_stmt, 4); memcpy(z->idx, idx, sizeof(int) * z->inum); + if (sqlite3_column_bytes(function_state_stmt, 5) == sizeof(double) * 3 * 3 && + sqlite3_column_bytes(function_state_stmt, 6) == sizeof(double) * 3) + { + const void* eigenvectors = sqlite3_column_blob(function_state_stmt, 5); + const void* eigenvalues = sqlite3_column_blob(function_state_stmt, 6); + if (!z->eigenvectors) + z->eigenvectors = ccv_dense_matrix_new(3, 3, CCV_64F | CCV_C1, 0, 0); + if (!z->eigenvalues) + z->eigenvalues = ccv_dense_matrix_new(1, 3, CCV_64F | CCV_C1, 0, 0); + memcpy(z->eigenvectors->data.u8, eigenvectors, sizeof(double) * 3 * 3); + memcpy(z->eigenvalues->data.u8, eigenvalues, sizeof(double) * 3); + } } sqlite3_finalize(function_state_stmt); } @@ -1883,28 +2365,31 @@ static void _cwc_convnet_supervised_train_function_state_read(const char* filena { while(sqlite3_step(momentum_data_stmt) == SQLITE_ROW) { - ccv_convnet_layer_t* layer = GPU(z->convnet)->layers + sqlite3_column_int(momentum_data_stmt, 0); - ccv_convnet_layer_t* momentum = GPU(z->convnet)->momentums + sqlite3_column_int(momentum_data_stmt, 0); - int wnum = sqlite3_column_bytes(momentum_data_stmt, 1) / sizeof(float); - int bnum = sqlite3_column_bytes(momentum_data_stmt, 2) / sizeof(float); - if (wnum != layer->wnum) - continue; - const void* w = sqlite3_column_blob(momentum_data_stmt, 1); - const void* bias = sqlite3_column_blob(momentum_data_stmt, 2); - switch (layer->type) + for (i = 0; i < GPU(z->convnet)->dual_device + 1; i++) { - case CCV_CONVNET_CONVOLUTIONAL: - if (bnum != layer->net.convolutional.count) - continue; - _cwc_convnet_reorder_convolutional_weights_onto_device((float*)w, momentum->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels); - cudaMemcpy(momentum->bias, bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyHostToDevice); - break; - case CCV_CONVNET_FULL_CONNECT: - if (bnum != layer->net.full_connect.count) - continue; - _cwc_convnet_reorder_full_connect_weights_onto_device((float*)w, momentum->w, layer->wnum, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels); - cudaMemcpy(momentum->bias, bias, sizeof(float) * layer->net.full_connect.count, cudaMemcpyHostToDevice); - break; + ccv_convnet_layer_t* layer = GPU(z->convnet)->device[i].layers + sqlite3_column_int(momentum_data_stmt, 0); + ccv_convnet_layer_t* momentum = GPU(z->convnet)->device[i].momentums + sqlite3_column_int(momentum_data_stmt, 0); + int wnum = sqlite3_column_bytes(momentum_data_stmt, 1) / sizeof(float); + int bnum = sqlite3_column_bytes(momentum_data_stmt, 2) / sizeof(float); + if (wnum != layer->wnum) + continue; + const void* w = sqlite3_column_blob(momentum_data_stmt, 1); + const void* bias = sqlite3_column_blob(momentum_data_stmt, 2); + 
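/* a sketch of the layout change the reorder helpers here perform (mirroring _cwc_convnet_reorder_convolutional_weights_onto_host below): host weights are filter-major, device weights channel-major within each partition, so with channels_per_partition = channels / partition the mapping is device_w[i * count * filters + j * filters + k] == host_w[k * count * channels_per_partition + j * channels_per_partition + i], letting consecutive threads that walk the filter index k read consecutive floats */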
switch (layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + if (bnum != layer->net.convolutional.count) + continue; + _cwc_convnet_reorder_convolutional_weights_onto_device((float*)w, momentum->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels, layer->input.matrix.partition); + cudaMemcpy(momentum->bias, bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyHostToDevice); + break; + case CCV_CONVNET_FULL_CONNECT: + if (bnum != layer->net.full_connect.count) + continue; + _cwc_convnet_reorder_full_connect_weights_onto_device((float*)w, momentum->w, layer->wnum, layer->input.matrix.rows * layer->input.matrix.cols, layer->input.matrix.channels); + cudaMemcpy(momentum->bias, bias, sizeof(float) * layer->net.full_connect.count, cudaMemcpyHostToDevice); + break; + } } } sqlite3_finalize(momentum_data_stmt); @@ -1913,17 +2398,18 @@ static void _cwc_convnet_supervised_train_function_state_read(const char* filena } } -static void _cwc_convnet_reorder_convolutional_weights_onto_host(float* w, float* hw, int wnum, int filters, int channels) +static void _cwc_convnet_reorder_convolutional_weights_onto_host(float* w, float* hw, int wnum, int filters, int channels, int channel_partition) { - assert(wnum % (filters * channels) == 0); + int channels_per_partition = channels / channel_partition; + assert(wnum % (filters * channels_per_partition) == 0); float* iw = (float*)ccmalloc(sizeof(float) * wnum); cudaMemcpy(iw, w, sizeof(float) * wnum, cudaMemcpyDeviceToHost); - int count = wnum / (filters * channels); + int count = wnum / (filters * channels_per_partition); int i, j, k; - for (i = 0; i < channels; i++) + for (i = 0; i < channels_per_partition; i++) for (j = 0; j < count; j++) for (k = 0; k < filters; k++) - hw[k * count * channels + j * channels + i] = iw[i * count * filters + j * filters + k]; + hw[k * count * channels_per_partition + j * channels_per_partition + i] = iw[i * count * filters + j * filters + k]; ccfree(iw); } @@ -1941,17 +2427,17 @@ static void _cwc_convnet_reorder_full_connect_weights_onto_host(float* w, float* ccfree(iw); } -static void _cwc_convnet_host_synchronize(ccv_convnet_t* convnet) +static void _cwc_convnet_host_synchronize(ccv_convnet_t* convnet, int device_id) { int i; for (i = 0; i < convnet->count; i++) { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; + ccv_convnet_layer_t* layer = GPU(convnet)->device[device_id].layers + i; ccv_convnet_layer_t* host_layer = convnet->layers + i; switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - _cwc_convnet_reorder_convolutional_weights_onto_host(layer->w, host_layer->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels); + _cwc_convnet_reorder_convolutional_weights_onto_host(layer->w, host_layer->w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels, layer->input.matrix.partition); cudaMemcpy(host_layer->bias, layer->bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyDeviceToHost); assert(cudaGetLastError() == cudaSuccess); break; @@ -1966,7 +2452,8 @@ static void _cwc_convnet_host_synchronize(ccv_convnet_t* convnet) static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_supervised_train_function_state_t* z, const char* filename) { - _cwc_convnet_host_synchronize(z->convnet); + // the master state kept in device id == 0 + _cwc_convnet_host_synchronize(z->convnet, 0); ccv_convnet_write_param_t params; params.half_precision = 0; ccv_convnet_write(z->convnet, filename, params); @@ -1975,14 
+2462,14 @@ static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_super { const char function_state_create_table_qs[] = "CREATE TABLE IF NOT EXISTS function_state " - "(fsid INTEGER PRIMARY KEY ASC, t INTEGER, i INTEGER, inum INTEGER, line_no INTEGER, idx BLOB);" + "(fsid INTEGER PRIMARY KEY ASC, t INTEGER, i INTEGER, inum INTEGER, line_no INTEGER, idx BLOB, eigenvectors BLOB, eigenvalues BLOB);" "CREATE TABLE IF NOT EXISTS momentum_data " "(layer INTEGER PRIMARY KEY ASC, weight BLOB, bias BLOB);"; assert(SQLITE_OK == sqlite3_exec(db, function_state_create_table_qs, 0, 0, 0)); const char function_state_insert_qs[] = "REPLACE INTO function_state " - "(fsid, t, i, inum, line_no, idx) VALUES " - "(0, $t, $i, $inum, $line_no, $idx);"; + "(fsid, t, i, inum, line_no, idx, eigenvectors, eigenvalues) VALUES " + "(0, $t, $i, $inum, $line_no, $idx, $eigenvectors, $eigenvalues);"; sqlite3_stmt* function_state_insert_stmt = 0; assert(SQLITE_OK == sqlite3_prepare_v2(db, function_state_insert_qs, sizeof(function_state_insert_qs), &function_state_insert_stmt, 0)); sqlite3_bind_int(function_state_insert_stmt, 1, z->t); @@ -1990,6 +2477,10 @@ static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_super sqlite3_bind_int(function_state_insert_stmt, 3, z->inum); sqlite3_bind_int(function_state_insert_stmt, 4, z->line_no); sqlite3_bind_blob(function_state_insert_stmt, 5, z->idx, sizeof(int) * z->inum, SQLITE_STATIC); + if (z->eigenvectors) + sqlite3_bind_blob(function_state_insert_stmt, 6, z->eigenvectors->data.u8, sizeof(double) * 3 * 3, SQLITE_STATIC); + if (z->eigenvalues) + sqlite3_bind_blob(function_state_insert_stmt, 7, z->eigenvalues->data.u8, sizeof(double) * 3, SQLITE_STATIC); assert(SQLITE_DONE == sqlite3_step(function_state_insert_stmt)); sqlite3_finalize(function_state_insert_stmt); const char momentum_data_insert_qs[] = @@ -2000,8 +2491,8 @@ static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_super int i; for (i = 0; i < z->convnet->count; i++) { - ccv_convnet_layer_t* layer = GPU(z->convnet)->layers + i; - ccv_convnet_layer_t* momentum = GPU(z->convnet)->momentums + i; + ccv_convnet_layer_t* layer = GPU(z->convnet)->device[0].layers + i; + ccv_convnet_layer_t* momentum = GPU(z->convnet)->device[0].momentums + i; // insert momentum data if (layer->type == CCV_CONVNET_CONVOLUTIONAL || layer->type == CCV_CONVNET_FULL_CONNECT) { @@ -2011,7 +2502,7 @@ static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_super switch (layer->type) { case CCV_CONVNET_CONVOLUTIONAL: - _cwc_convnet_reorder_convolutional_weights_onto_host(momentum->w, w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels); + _cwc_convnet_reorder_convolutional_weights_onto_host(momentum->w, w, layer->wnum, layer->net.convolutional.count, layer->net.convolutional.channels, layer->input.matrix.partition); cudaMemcpy(bias, momentum->bias, sizeof(float) * layer->net.convolutional.count, cudaMemcpyDeviceToHost); assert(cudaGetLastError() == cudaSuccess); break; @@ -2040,20 +2531,201 @@ static void _cwc_convnet_supervised_train_function_state_write(cwc_convnet_super void cwc_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch) { - _cwc_convnet_alloc_reserved(convnet, batch, 0); - _cwc_convnet_encode_impl(convnet, 0, batch, 0, 0); + assert(batch == 0); +} + +__global__ static void _cwc_kern_neuron_scan(float* a, float* b, + const int rows, const int cols, + const int out_rows, const int 
out_cols, const int channels, const int batch) +{ + assert(gridDim.x == cols); + assert(gridDim.y == rows); + assert(gridDim.z == channels); + assert(blockDim.x == batch); + assert(out_rows > rows); + assert(out_cols > cols); + b += (blockIdx.z * rows * cols + blockIdx.y * cols + blockIdx.x) * batch * 5; + a += (blockIdx.z * out_rows * out_cols + blockIdx.y * out_cols + blockIdx.x) * batch; + const int thidx = threadIdx.x; + b[thidx] = a[thidx]; // top left + b += batch; + float* c = a + (out_cols - cols) * batch; // top right + b[thidx] = c[thidx]; + b += batch; + c = a + (((out_rows - rows) / 2) * out_cols + (out_cols - cols) / 2) * batch; // center + b[thidx] = c[thidx]; + b += batch; + c = a + (out_rows - rows) * out_cols * batch; // bottom left + b[thidx] = c[thidx]; + b += batch; + c = a + ((out_rows - rows) * out_cols + (out_cols - cols)) * batch; // bottom right + b[thidx] = c[thidx]; +} + +__global__ static void _cwc_kern_softmax(float* a, const int batch, const int count) +{ + int i; + const int thidx = threadIdx.x; + float max_val = a[thidx]; + for (i = 1; i < count; i++) + { + float v = a[i * batch + thidx]; + if (v > max_val) + max_val = v; + } + float val = 0; + for (i = 0; i < count; i++) + { + float v = a[i * batch + thidx]; + val += (v = expf(v - max_val)); + a[i * batch + thidx] = v; + } + val = 1.0 / val; + for (i = 0; i < count; i++) + a[i * batch + thidx] *= val; +} + +template <int vary> +__global__ static void _cwc_kern_classify(float* a, int* c, float* b, const int batch, const int count, const int tops) +{ + int i, j; + assert(blockDim.x == batch); + const int thidx = threadIdx.x; + for (i = 0; i < count; i++) + #pragma unroll + for (j = 1; j < vary; j++) + a[i * batch * vary + thidx] += a[(i * vary + j) * batch + thidx]; + #pragma unroll + for (i = 0; i < tops; i++) + { + float max_val = -1; + int max_idx = -1; + for (j = 0; j < count; j++) + { + float v = a[j * batch * vary + thidx]; + if (v >= 0 && v > max_val) + max_val = v, max_idx = j; + } + assert(max_idx >= 0); + a[max_idx * batch * vary + thidx] = -1; + c[thidx] = max_idx; + b[thidx] = max_val; + c += batch; + b += batch; + } } -void cwc_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int* labels, int batch) +void cwc_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch) { - _cwc_convnet_alloc_reserved(convnet, batch, 0); + assert(symmetric == 1); // for now this only works with symmetric (mirrored) input views + assert(batch == 32); // I haven't figured out how to do this for arbitrary batch sizes + // classify step uses only device 0 + _cwc_convnet_alloc_reserved_for_classify(convnet, tops, batch); + int i, j, k; + int rows = convnet->input.height, cols = convnet->input.width, channels = convnet->channels; + cwc_convnet_context_t* default_context = GPU(convnet)->contexts; + float* c = default_context->host[0].input; + for (i = 0; i < batch; i++) + { + assert(a[i]->rows == rows || a[i]->cols == cols); + assert(a[i]->rows >= rows && a[i]->cols >= cols); + // top / left + ccv_dense_matrix_t* b = 0; + ccv_slice(a[i], (ccv_matrix_t**)&b, CCV_32F, 0, 0, rows, cols); + ccv_subtract(b, convnet->mean_activity, (ccv_matrix_t**)&b, 0); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + i] = b->data.f32[j * channels + k]; + ccv_flip(b, &b, 0, CCV_FLIP_X); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + batch + i] = b->data.f32[j * channels + k]; +
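/* a note on the packing above: each image i contributes 6 views -- the top-left, center and bottom-right crops, each followed by its horizontal mirror -- interleaved so that view v of image i, pixel j, channel k lands at input[(k * rows * cols + j) * (batch * 6) + v * batch + i]; the innermost dimension is thus an effective batch of batch * 6, which is what the forward kernels below consume */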
ccv_matrix_free(b); + // center + b = 0; + ccv_slice(a[i], (ccv_matrix_t**)&b, CCV_32F, (a[i]->rows - rows) / 2, (a[i]->cols - cols) / 2, rows, cols); + ccv_subtract(b, convnet->mean_activity, (ccv_matrix_t**)&b, 0); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + 2 * batch + i] = b->data.f32[j * channels + k]; + ccv_flip(b, &b, 0, CCV_FLIP_X); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + 3 * batch + i] = b->data.f32[j * channels + k]; + ccv_matrix_free(b); + // bottom / right + b = 0; + ccv_slice(a[i], (ccv_matrix_t**)&b, CCV_32F, a[i]->rows - rows, a[i]->cols - cols, rows, cols); + ccv_subtract(b, convnet->mean_activity, (ccv_matrix_t**)&b, 0); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + 4 * batch + i] = b->data.f32[j * channels + k]; + ccv_flip(b, &b, 0, CCV_FLIP_X); + for (k = 0; k < channels; k++) + for (j = 0; j < rows * cols; j++) + c[(k * rows * cols + j) * batch * 6 + 5 * batch + i] = b->data.f32[j * channels + k]; +
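/* from this point the effective batch is batch * 30: _cwc_kern_neuron_scan above sampled each of the 6 views at 5 positions (4 corners + center), giving 6 * 5 = 30 predictions per image; this is why the remaining layers run with batch * 30, the softmax below launches batch * 30 threads, and _cwc_kern_classify<30> sums the 30 copies before the confidence is divided by 30 */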
GPU(convnet)->device[0].scans[i - 1] : GPU(convnet)->device[0].forwards[i - 1], GPU(convnet)->device[0].forwards[i], GPU(convnet)->device[0].denoms[i], GPU(convnet)->device[0].unit, default_context); + } + // doing softmax for the last layer + int category_count = convnet->layers[convnet->count - 1].net.full_connect.count; + _cwc_kern_softmax + <<<1, batch * 30, 0, default_context->device[0].data_stream>>> + (GPU(convnet)->device[0].forwards[convnet->count - 1], batch * 30, category_count); + // collect classify results + _cwc_kern_classify + <30> + <<<1, batch, 0, default_context->device[0].data_stream>>> + (GPU(convnet)->device[0].forwards[convnet->count - 1], default_context->device[0].c, default_context->device[0].out, batch, category_count, tops); + cudaMemcpyAsync(default_context->host[0].c, default_context->device[0].c, sizeof(int) * batch * tops, cudaMemcpyDeviceToHost, default_context->device[0].data_stream); + cudaMemcpyAsync(default_context->host[0].out, default_context->device[0].out, sizeof(float) * batch * tops, cudaMemcpyDeviceToHost, default_context->device[0].data_stream); + // wait for the classify to finish + cudaStreamSynchronize(default_context->device[0].data_stream); + // collect result to ccv_array_t + for (i = 0; i < batch; i++) + { + ranks[i] = ccv_array_new(sizeof(ccv_classification_t), tops, 0); + for (j = 0; j < tops; j++) + { + ccv_classification_t classification = { + .id = default_context->host[0].c[j * batch + i], + .confidence = default_context->host[0].out[j * batch + i] / 30, + }; + ccv_array_push(ranks[i], &classification); + } + } } void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params) { #ifdef HAVE_GSL assert(params.mini_batch % BATCH_PER_BLOCK == 0); - _cwc_convnet_alloc_reserved(convnet, params.mini_batch, params.layer_params); + int device_count = 0; + cudaGetDeviceCount(&device_count); + if (params.dual_device && device_count < 2) + params.dual_device = 0; + assert(device_count > 0); + _cwc_convnet_alloc_reserved_both(convnet, params.mini_batch, params.dual_device, params.layer_params); int i, j, k; gsl_rng_env_setup(); gsl_rng* rng = gsl_rng_alloc(gsl_rng_default); @@ -2089,75 +2761,83 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz z.idx = idx; z.inum = categorizeds->rnum; z.convnet = convnet; + z.eigenvectors = 0; + z.eigenvalues = 0; z.line_no = 0; int miss; float elapsed_time; + const int device_id = 0; // temporarily set device id == 0 ccv_function_state_begin(_cwc_convnet_supervised_train_function_state_read, z, filename); + _cwc_convnet_mean_formation(categorizeds, z.convnet->input, z.convnet->channels, params.symmetric, &z.convnet->mean_activity); + ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename); + if (z.convnet->channels == 3 && params.color_gain > 0) // do this if we want color gain type of data augmentation, and it is RGB color + _cwc_convnet_channel_eigen(categorizeds, z.convnet->mean_activity, z.convnet->input, z.convnet->channels, &z.eigenvectors, &z.eigenvalues); + ccv_function_state_resume(_cwc_convnet_supervised_train_function_state_write, z, filename); for (z.t = 0; z.t < params.max_epoch; z.t++) { for (z.i = 0; z.i < aligned_batches; z.i += params.iterations) { cudaEventRecord(start, 0); // using context-1's cublas handle because we will wait this handle to finish when the copy to context-0 is required in updating - if (z.t > 0) // undo the 
mean network for further training - _cwc_convnet_dor_mean_net_undo(z.convnet, params.layer_params, GPU(z.convnet)->contexts[1].device.cublas); + // undo the mean network for further training + _cwc_convnet_dor_mean_net_undo(z.convnet, device_id, params.layer_params, GPU(z.convnet)->contexts[(z.i + 1) % 2].device[device_id].data_cublas); miss = 0; // run updates for (i = z.i; i < ccv_min(z.i + params.iterations, aligned_batches); i++) { cwc_convnet_context_t* context = GPU(z.convnet)->contexts + (i % 2); - _cwc_convnet_batch_formation(rng, categorizeds, z.idx, params.size, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.mini_batch, i * params.mini_batch, params.mini_batch, context->host.input, context->host.c); - cudaMemcpyAsync(context->device.input, context->host.input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream); + _cwc_convnet_batch_formation(rng, categorizeds, z.convnet->mean_activity, z.eigenvectors, z.eigenvalues, params.color_gain, z.idx, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, category_count, params.symmetric, params.mini_batch, i * params.mini_batch, params.mini_batch, context->host[device_id].input, context->host[device_id].c); + cudaMemcpyAsync(context->device[device_id].input, context->host[device_id].input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); - cudaMemcpyAsync(context->device.c, context->host.c, sizeof(int) * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream); + cudaMemcpyAsync(context->device[device_id].c, context->host[device_id].c, sizeof(int) * params.mini_batch, cudaMemcpyHostToDevice, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); - _cwc_convnet_dor_formation(z.convnet, params.mini_batch, rng, params.layer_params, context); + _cwc_convnet_dor_formation(z.convnet, device_id, params.mini_batch, rng, params.layer_params, context); assert(cudaGetLastError() == cudaSuccess); // sync with the other stream core so that we can compute on the single true layer parameters if (i > z.i) - cudaEventRecord(stop, GPU(z.convnet)->contexts[(i + 1) % 2].device.stream); - cudaStreamSynchronize(GPU(z.convnet)->contexts[(i + 1) % 2].device.stream); + cudaEventRecord(stop, GPU(z.convnet)->contexts[(i + 1) % 2].device[device_id].data_stream); + cudaStreamSynchronize(GPU(z.convnet)->contexts[(i + 1) % 2].device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); if (i > z.i) // we have another result, pull these { - int* c = GPU(z.convnet)->contexts[(i + 1) % 2].host.c; + int* c = GPU(z.convnet)->contexts[(i + 1) % 2].host[device_id].c; for (k = 0; k < params.mini_batch; k++) if (c[k] != test_returns[(i + 1) % 2].host[k]) ++miss; cudaEventElapsedTime(&elapsed_time, iteration, stop); FLUSH(" - at epoch %03d / %d => stochastic gradient descent with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f /((i - z.i) * params.mini_batch), i + 1, aligned_batches, elapsed_time / 1000); } - cudaEventRecord(iteration, context->device.stream); - _cwc_convnet_encode_impl(z.convnet, context->device.input, params.mini_batch, 1, context); + cudaEventRecord(iteration, context->device[device_id].data_stream); + _cwc_convnet_encode_impl(z.convnet, device_id, context->device[device_id].input, params.mini_batch, 1, 
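/* the two contexts ping-pong on (i % 2): in outline, context = contexts + (i % 2) is being filled and launched while contexts + ((i + 1) % 2) finishes the previous iteration, and the cudaStreamSynchronize above on the other context's stream is what keeps the single set of layer parameters consistent between iterations; host-side batch formation therefore overlaps the previous mini-batch's GPU work */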
context); assert(cudaGetLastError() == cudaSuccess); // compute miss rate on training data - _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[i % 2].device, context->device.stream); + _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->device[device_id].forwards[z.convnet->count - 1], test_returns[i % 2].device, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); - cudaMemcpyAsync(test_returns[i % 2].host, test_returns[i % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device.stream); + cudaMemcpyAsync(test_returns[i % 2].host, test_returns[i % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); // do the logistic loss and backward propagate - _cwc_convnet_softmax_with_logistic_loss(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], context->device.c, context->device.stream); + _cwc_convnet_softmax_with_logistic_loss(params.mini_batch, category_count, GPU(z.convnet)->device[device_id].forwards[z.convnet->count - 1], context->device[device_id].c, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); - _cwc_convnet_backwards_propagate_error(z.convnet, GPU(z.convnet)->forwards[z.convnet->count - 1], context->device.input, params.mini_batch, context); + _cwc_convnet_backwards_propagate_error(z.convnet, device_id, GPU(z.convnet)->device[device_id].forwards[z.convnet->count - 1], context->device[device_id].input, params.mini_batch, context); assert(cudaGetLastError() == cudaSuccess); - _cwc_convnet_net_sgd(z.convnet, z.t > 0 || i > 0, params.mini_batch, params.layer_params, context); + _cwc_convnet_net_sgd(z.convnet, device_id, z.t > 0 || i > 0, params.mini_batch, params.layer_params, context); assert(cudaGetLastError() == cudaSuccess); } cudaDeviceSynchronize(); // synchronize at this point // using context-1's cublas handle because we will wait this handle to finish when the copy to context-0 is required in testing - _cwc_convnet_dor_mean_net(z.convnet, params.layer_params, GPU(z.convnet)->contexts[1].device.cublas); + _cwc_convnet_dor_mean_net(z.convnet, device_id, params.layer_params, GPU(z.convnet)->contexts[1].device[device_id].data_cublas); // run tests miss = 0; for (i = j = 0; i < tests->rnum; i += params.mini_batch, j++) { cwc_convnet_context_t* context = GPU(z.convnet)->contexts + (j % 2); - _cwc_convnet_batch_formation(rng, tests, 0, params.size, z.convnet->rows, z.convnet->cols, z.convnet->channels, params.mini_batch, i, ccv_min(params.mini_batch, tests->rnum - i), context->host.input, 0); - cudaMemcpyAsync(context->device.input, context->host.input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device.stream); + _cwc_convnet_batch_formation(0, tests, z.convnet->mean_activity, 0, 0, 0, 0, z.convnet->input, z.convnet->rows, z.convnet->cols, z.convnet->channels, category_count, params.symmetric, params.mini_batch, i, ccv_min(params.mini_batch, tests->rnum - i), context->host[device_id].input, 0); + cudaMemcpyAsync(context->device[device_id].input, context->host[device_id].input, sizeof(float) * z.convnet->rows * z.convnet->cols * z.convnet->channels * params.mini_batch, cudaMemcpyHostToDevice, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); if (j > 0) - 
cudaEventRecord(stop, GPU(z.convnet)->contexts[(i + 1) % 2].device.stream); + cudaEventRecord(stop, GPU(z.convnet)->contexts[(j + 1) % 2].device[device_id].data_stream); // sync with the other stream core so that we can compute on the single true layer parameters - cudaStreamSynchronize(GPU(z.convnet)->contexts[(j + 1) % 2].device.stream); + cudaStreamSynchronize(GPU(z.convnet)->contexts[(j + 1) % 2].device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); if (j > 0) // we have another result, pull these { @@ -2170,12 +2850,12 @@ void cwc_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categoriz cudaEventElapsedTime(&elapsed_time, iteration, stop); FLUSH(" - at epoch %03d / %d => with miss rate %.2f%% at %d / %d (%.3f sec)", z.t + 1, params.max_epoch, miss * 100.0f / i, j + 1, (tests->rnum + params.mini_batch - 1) / params.mini_batch, elapsed_time / 1000); } - cudaEventRecord(iteration, context->device.stream); + cudaEventRecord(iteration, context->device[device_id].data_stream); - _cwc_convnet_encode_impl(z.convnet, context->device.input, params.mini_batch, 0, context); + _cwc_convnet_encode_impl(z.convnet, device_id, context->device[device_id].input, params.mini_batch, 0, context); assert(cudaGetLastError() == cudaSuccess); - _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device.stream); + _cwc_convnet_tests_return(params.mini_batch, category_count, GPU(z.convnet)->device[device_id].forwards[z.convnet->count - 1], test_returns[j % 2].device, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); - cudaMemcpyAsync(test_returns[j % 2].host, test_returns[j % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device.stream); + cudaMemcpyAsync(test_returns[j % 2].host, test_returns[j % 2].device, sizeof(int) * params.mini_batch, cudaMemcpyDeviceToHost, context->device[device_id].data_stream); assert(cudaGetLastError() == cudaSuccess); } cudaDeviceSynchronize(); // synchronize at this point @@ -2223,43 +2903,82 @@ void cwc_convnet_compact(ccv_convnet_t* convnet) { if (GPU(convnet)) { - cudaFree(GPU(convnet)->scratch); - cudaFree(GPU(convnet)->unit); - int i, j; + int dual_device = GPU(convnet)->dual_device; + int i, j, k; + for (i = 0; i < dual_device + 1; i++) + { + if (GPU(convnet)->device[i].scratch) + cudaFree(GPU(convnet)->device[i].scratch); + cudaFree(GPU(convnet)->device[i].unit); + } for (i = 0; i < 2; i++) { cwc_convnet_context_t* context = GPU(convnet)->contexts + i; - cudaFreeHost(context->host.input); - cudaFree(context->device.input); - cudaFreeHost(context->host.c); - cudaFree(context->device.c); - cudaStreamDestroy(context->device.stream); - cublasDestroy(context->device.cublas); + for (j = 0; j < dual_device + 1; j++) + { + if (context->host[j].input) + cudaFreeHost(context->host[j].input); + if (context->device[j].input) + cudaFree(context->device[j].input); + if (context->host[j].c) + cudaFreeHost(context->host[j].c); + if (context->device[j].c) + cudaFree(context->device[j].c); + if (context->host[j].out) + cudaFreeHost(context->host[j].out); + if (context->device[j].out) + cudaFree(context->device[j].out); + for (k = 0; k < 6; k++) + if (context->device[j].joint[k]) + cudaEventDestroy(context->device[j].joint[k]); + if (context->device[j].data_cublas) + cublasDestroy(context->device[j].data_cublas); + if (context->device[j].data_stream) +
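/* every free below is null-guarded because, judging from this patch, the allocators fill different subsets of the reserved state: _cwc_convnet_alloc_reserved_both sets up the training buffers (momentums, backwards, dor) while _cwc_convnet_alloc_reserved_for_classify only sets up what the forward scan needs (scans, out), so the teardown pattern is simply if (ptr) cudaFree(ptr) */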
cudaStreamDestroy(context->device[j].data_stream); + for (k = 0; k < 2; k++) + { + if (context->device[j].model_cublas[k]) + cublasDestroy(context->device[j].model_cublas[k]); + if (context->device[j].model_stream[k]) + cudaStreamDestroy(context->device[j].model_stream[k]); + } + } } for (i = 0; i < convnet->count; i++) { - ccv_convnet_layer_t* layer = GPU(convnet)->layers + i; - if (layer->w) - cudaFree(layer->w); - ccv_convnet_layer_t* configuration = GPU(convnet)->configurations + i; - if (configuration->w) - cudaFree(configuration->w); - ccv_convnet_layer_t* momentum = GPU(convnet)->momentums + i; - if (momentum->w) - cudaFree(momentum->w); - if (GPU(convnet)->denoms[i]) - cudaFree(GPU(convnet)->denoms[i]); - if (GPU(convnet)->forwards[i]) - cudaFree(GPU(convnet)->forwards[i]); - if (GPU(convnet)->backwards[i]) - cudaFree(GPU(convnet)->backwards[i]); - for (j = 0; j < 2; j++) + for (j = 0; j < dual_device + 1; j++) { - cwc_convnet_context_t* context = GPU(convnet)->contexts + j; - if (context->host.dor[i]) - cudaFreeHost(context->host.dor[i]); - if (context->device.dor[i]) - cudaFree(context->device.dor[i]); + ccv_convnet_layer_t* layer = GPU(convnet)->device[j].layers + i; + if (layer->w) + cudaFree(layer->w); + if (GPU(convnet)->device[j].configurations) + { + ccv_convnet_layer_t* configuration = GPU(convnet)->device[j].configurations + i; + if (configuration->w) + cudaFree(configuration->w); + } + if (GPU(convnet)->device[j].momentums) + { + ccv_convnet_layer_t* momentum = GPU(convnet)->device[j].momentums + i; + if (momentum->w) + cudaFree(momentum->w); + } + if (GPU(convnet)->device[j].denoms && GPU(convnet)->device[j].denoms[i]) + cudaFree(GPU(convnet)->device[j].denoms[i]); + if (GPU(convnet)->device[j].forwards && GPU(convnet)->device[j].forwards[i]) + cudaFree(GPU(convnet)->device[j].forwards[i]); + if (GPU(convnet)->device[j].backwards && GPU(convnet)->device[j].backwards[i]) + cudaFree(GPU(convnet)->device[j].backwards[i]); + if (GPU(convnet)->device[j].scans && GPU(convnet)->device[j].scans[i]) + cudaFree(GPU(convnet)->device[j].scans[i]); + for (k = 0; k < 2; k++) + { + cwc_convnet_context_t* context = GPU(convnet)->contexts + k; + if (context->host[j].dor && context->host[j].dor[i]) + cudaFreeHost(context->host[j].dor[i]); + if (context->device[j].dor && context->device[j].dor[i]) + cudaFree(context->device[j].dor[i]); + } } } ccfree(convnet->reserved); diff --git a/lib/cuda/cwc_internal.h b/lib/cuda/cwc_internal.h index f4f243070..9b91f75fb 100644 --- a/lib/cuda/cwc_internal.h +++ b/lib/cuda/cwc_internal.h @@ -99,6 +99,36 @@ case e: { block(__VA_ARGS__, e); break; } \ case f: { block(__VA_ARGS__, f); break; } } } +#define cwc_vary_2_d(type, a, b, block, ...) { switch (type) { \ + case a: { block(__VA_ARGS__, a); break; } \ + case b: { block(__VA_ARGS__, b); break; } } } + +#define cwc_vary_3_d(type, a, b, c, block, ...) { switch (type) { \ + case a: { block(__VA_ARGS__, a); break; } \ + case b: { block(__VA_ARGS__, b); break; } \ + case c: { block(__VA_ARGS__, c); break; } } } + +#define cwc_vary_4_d(type, a, b, c, d, block, ...) { switch (type) { \ + case a: { block(__VA_ARGS__, a); break; } \ + case b: { block(__VA_ARGS__, b); break; } \ + case c: { block(__VA_ARGS__, c); break; } \ + case d: { block(__VA_ARGS__, d); break; } } } + +#define cwc_vary_5_d(type, a, b, c, d, e, block, ...) 
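/* usage sketch for these dispatchers: cwc_vary_2_d(x, 1, 2, run, arg) expands to { switch (x) { case 1: { run(arg, 1); break; } case 2: { run(arg, 2); break; } } } -- a runtime value selects a compile-time constant, which CWC_IMPLEMENT_VARY_STUB below uses to bench and pick among templated kernel instantiations */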
{ switch (type) { \ + case a: { block(__VA_ARGS__, a); break; } \ + case b: { block(__VA_ARGS__, b); break; } \ + case c: { block(__VA_ARGS__, c); break; } \ + case d: { block(__VA_ARGS__, d); break; } \ + case e: { block(__VA_ARGS__, e); break; } } } + +#define cwc_vary_6_d(type, a, b, c, d, e, f, block, ...) { switch (type) { \ + case a: { block(__VA_ARGS__, a); break; } \ + case b: { block(__VA_ARGS__, b); break; } \ + case c: { block(__VA_ARGS__, c); break; } \ + case d: { block(__VA_ARGS__, d); break; } \ + case e: { block(__VA_ARGS__, e); break; } \ + case f: { block(__VA_ARGS__, f); break; } } } + // define the body of the function that bench / select best kernels #define CWC_IMPLEMENT_VARY_STUB(config, vary_x, vary_y, vary_z, vary_func, ...) \ diff --git a/lib/inl/ccv_convnet_inl.h b/lib/inl/ccv_convnet_inl.h new file mode 100644 index 000000000..df1e86c8c --- /dev/null +++ b/lib/inl/ccv_convnet_inl.h @@ -0,0 +1,42 @@ +#ifndef GUARD_ccv_convnet_inl_h +#define GUARD_ccv_convnet_inl_h + +inline static void _ccv_convnet_layer_derive_output(ccv_convnet_layer_t* layer, int input_rows, int input_cols, int* rows, int* cols, int* partition) +{ + assert(rows != 0 && cols != 0); + switch(layer->type) + { + case CCV_CONVNET_CONVOLUTIONAL: + assert(layer->net.convolutional.rows % 2); // as of now, don't support even number of kernel size + assert(layer->net.convolutional.cols % 2); + assert((input_rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows) % layer->net.convolutional.strides == 0); + assert((input_cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols) % layer->net.convolutional.strides == 0); + *rows = (input_rows + layer->net.convolutional.border * 2 - layer->net.convolutional.rows + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; + *cols = (input_cols + layer->net.convolutional.border * 2 - layer->net.convolutional.cols + layer->net.convolutional.strides - 1) / layer->net.convolutional.strides + 1; + *partition = layer->input.matrix.partition; + break; + case CCV_CONVNET_FULL_CONNECT: + *rows = layer->net.full_connect.count; + *cols = 1; + *partition = 1; + break; + case CCV_CONVNET_LOCAL_RESPONSE_NORM: + *rows = input_rows; + *cols = input_cols; + *partition = layer->input.matrix.partition; + break; + case CCV_CONVNET_MAX_POOL: + case CCV_CONVNET_AVERAGE_POOL: + assert((input_rows + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); + assert((input_cols + layer->net.pool.border * 2 - layer->net.pool.size) % layer->net.pool.strides == 0); + *rows = (input_rows + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; + *cols = (input_cols + layer->net.pool.border * 2 - layer->net.pool.size + layer->net.pool.strides - 1) / layer->net.pool.strides + 1; + *partition = layer->input.matrix.partition; + break; + default: + assert(0); + break; + } +} + +#endif diff --git a/samples/dex.png b/samples/dex.png new file mode 100644 index 000000000..4be3286b8 Binary files /dev/null and b/samples/dex.png differ diff --git a/samples/download-image-net.sh b/samples/download-image-net.sh index 91cb00fa5..3acb72d33 100755 --- a/samples/download-image-net.sh +++ b/samples/download-image-net.sh @@ -1,7 +1,4 @@ #!/bin/sh -wget -c http://liuliu.github.io/ccv/downloads/image-net.sqlite3.1 && -wget -c http://liuliu.github.io/ccv/downloads/image-net.sqlite3.2 && -wget -c http://liuliu.github.io/ccv/downloads/image-net.sqlite3.3 && -cat 
image-net.sqlite3.1 image-net.sqlite3.2 image-net.sqlite3.3 > image-net.sqlite3 && -rm image-net.sqlite3.1 image-net.sqlite3.2 image-net.sqlite3.3 +wget -c http://static.libccv.org/image-net-2010.sqlite3 +wget -c http://static.libccv.org/image-net-2012.sqlite3 diff --git a/samples/image-net-2010.sqlite3 b/samples/image-net-2010.sqlite3 new file mode 100644 index 000000000..0d3eb55c7 Binary files /dev/null and b/samples/image-net-2010.sqlite3 differ diff --git a/samples/image-net-2010.words b/samples/image-net-2010.words new file mode 100644 index 000000000..4728bef8e --- /dev/null +++ b/samples/image-net-2010.words @@ -0,0 +1,1000 @@ +french fries, french-fried potatoes, fries, chips +mashed potato +black olive, ripe olive +face powder +crab apple, crabapple +Granny Smith +strawberry +blueberry +cranberry +currant +blackberry +raspberry +persimmon +mulberry +orange +kumquat +lemon +grapefruit +plum +fig +pineapple, ananas +banana +jackfruit, jak, jack +cherry +grape +custard apple +durian +mango +elderberry +guava +litchi, litchi nut, litchee, lichi, leechee, lichee, lychee +pomegranate +quince +kidney bean +soy, soybean, soya, soya bean +green pea, garden pea +chickpea, garbanzo +chard, Swiss chard, spinach beet, leaf beet +lettuce +cress +spinach +bell pepper +pimento, pimiento +jalapeno, jalapeno pepper +cherry tomato +parsnip +turnip +mustard, mustard greens, leaf mustard, Indian mustard +bok choy, bok choi +head cabbage +broccoli +cauliflower +brussels sprouts +zucchini, courgette +spaghetti squash +acorn squash +butternut squash +cucumber, cuke +artichoke, globe artichoke +asparagus +green onion, spring onion, scallion +shallot +leek +cardoon +celery +mushroom +pumpkin +cliff, drop, drop-off +lunar crater +valley, vale +alp +volcano +promontory, headland, head, foreland +sandbar, sand bar +dune, sand dune +coral reef +lakeside, lakeshore +seashore, coast, seacoast, sea-coast +geyser +bakery, bakeshop, bakehouse +juniper berry +gourd +acorn +olive +hip, rose hip, rosehip +ear, spike, capitulum +pumpkin seed +sunflower seed +coffee bean, coffee berry, coffee +rapeseed +corn +buckeye, horse chestnut, conker +bean +peanut, earthnut, goober, goober pea, groundnut, monkey nut +walnut +cashew, cashew nut +chestnut +hazelnut, filbert, cobnut, cob +coconut, cocoanut +pecan +pistachio, pistachio nut +lentil +pea +peanut +okra +sunflower, helianthus +lesser celandine, pilewort, Ranunculus ficaria +wood anemone, Anemone nemorosa +blue columbine, Aquilegia caerulea, Aquilegia scopulorum calcarea +delphinium +nigella +calla lily, calla, arum lily, Zantedeschia aethiopica +sandwort +pink, garden pink +baby's breath, babies'-breath, Gypsophila paniculata +ice plant, icicle plant, Mesembryanthemum crystallinum +globe amaranth, bachelor's button, Gomphrena globosa +four o'clock +Virginia spring beauty, Claytonia virginica +wallflower +damask violet, Dame's violet, sweet rocket, Hesperis matronalis +candytuft +Iceland poppy, Papaver alpinum +prickly poppy, Papaver argemone +oriental poppy, Papaver orientale +celandine, greater celandine, swallowwort, swallow wort, Chelidonium majus +blue poppy, Meconopsis betonicifolia +Welsh poppy, Meconopsis cambrica +celandine poppy, wood poppy, Stylophorum diphyllum +corydalis +pearly everlasting, cottonweed, Anaphalis margaritacea +strawflower, golden everlasting, yellow paper daisy, Helichrysum bracteatum +yellow chamomile, golden marguerite, dyers' chamomile, Anthemis tinctoria +dusty miller, silver-lace, silver lace, Tanacetum ptarmiciflorum, Chrysanthemum 
ptarmiciflorum +tansy, golden buttons, scented fern, Tanacetum vulgare +daisy +common marigold, pot marigold, ruddles, Scotch marigold, Calendula officinalis +China aster, Callistephus chinensis +cornflower, bachelor's button, bluebottle, Centaurea cyanus +chrysanthemum +mistflower, mist-flower, ageratum, Conoclinium coelestinum, Eupatorium coelestinum +cosmos, cosmea +dahlia, Dahlia pinnata +coneflower +blue daisy, blue marguerite, Felicia amelloides +gazania +African daisy +male orchis, early purple orchid, Orchis mascula +butterfly orchid, butterfly orchis, Orchis papilionaceae +aerides +brassavola +spider orchid, Brassia lawrenceana +grass pink, Calopogon pulchellum, Calopogon tuberosum +calypso, fairy-slipper, Calypso bulbosa +cattleya +red helleborine, Cephalanthera rubra +coelogyne +cymbid, cymbidium +lady's slipper, lady-slipper, ladies' slipper, slipper orchid +marsh orchid +dendrobium +disa +helleborine +fragrant orchid, Gymnadenia conopsea +fringed orchis, fringed orchid +lizard orchid, Himantoglossum hircinum +laelia +masdevallia +odontoglossum +oncidium, dancing lady orchid, butterfly plant, butterfly orchid +bee orchid, Ophrys apifera +fly orchid, Ophrys insectifera, Ophrys muscifera +spider orchid +phaius +moth orchid, moth plant +ladies' tresses, lady's tresses +stanhopea +stelis +vanda +cyclamen, Cyclamen purpurascens +centaury +gentian +begonia +commelina +scabious, scabiosa +achimenes, hot water plant +African violet, Saintpaulia ionantha +streptocarpus +scorpionweed, scorpion weed, phacelia +calceolaria, slipperwort +toadflax, butter-and-eggs, wild snapdragon, devil's flax, Linaria vulgaris +veronica, speedwell +bonsai +star anise, Chinese anise, Illicium verum +wattle +huisache, cassie, mimosa bush, sweet wattle, sweet acacia, scented wattle, flame tree, Acacia farnesiana +silk tree, Albizia julibrissin, Albizzia julibrissin +rain tree, saman, monkeypod, monkey pod, zaman, zamang, Albizia saman +dita, dita bark, devil tree, Alstonia scholaris +pandanus, screw pine +linden, linden tree, basswood, lime, lime tree +American beech, white beech, red beech, Fagus grandifolia, Fagus americana +New Zealand beech +live oak +shingle oak, laurel oak, Quercus imbricaria +pin oak, swamp oak, Quercus palustris +cork oak, Quercus suber +yellow birch, Betula alleghaniensis, Betula leutea +American white birch, paper birch, paperbark birch, canoe birch, Betula cordifolia, Betula papyrifera +downy birch, white birch, Betula pubescens +alder, alder tree +fringe tree +European ash, common European ash, Fraxinus excelsior +fig, common fig, common fig tree, Ficus carica +witch elm, wych elm, Ulmus glabra +Dutch elm, Ulmus hollandica +cabbage tree, grass tree, Cordyline australis +golden shower tree, drumstick tree, purging cassia, pudding pipe tree, canafistola, canafistula, Cassia fistula +honey locust, Gleditsia triacanthos +Kentucky coffee tree, bonduc, chicot, Gymnocladus dioica +Brazilian rosewood, caviuna wood, jacaranda, Dalbergia nigra +logwood, logwood tree, campeachy, bloodwood tree, Haematoxylum campechianum +coral tree, erythrina +Japanese pagoda tree, Chinese scholartree, Chinese scholar tree, Sophora japonica, Sophora sinensis +kowhai, Sophora tetraptera +palm, palm tree +Arabian coffee, Coffea arabica +cork tree, Phellodendron amurense +weeping willow, Babylonian weeping willow, Salix babylonica +pussy willow, Salix discolor +goat willow, florist's willow, pussy willow, Salix caprea +China tree, false dogwood, jaboncillo, chinaberry, Sapindus saponaria +pepper tree, molle, 
Peruvian mastic tree, Schinus molle +balata, balata tree, beefwood, bully tree, Manilkara bidentata +teak, Tectona grandis +ginkgo, gingko, maidenhair tree, Ginkgo biloba +pine, pine tree, true pine +ilang-ilang, ylang-ylang, Cananga odorata +laurel +magnolia +tulip tree, tulip poplar, yellow poplar, canary whitewood, Liriodendron tulipifera +baobab, monkey-bread tree, Adansonia digitata +kapok, ceiba tree, silk-cotton tree, white silk-cotton tree, Bombay ceiba, God tree, Ceiba pentandra +red beech, brown oak, booyong, crow's foot, stave wood, silky elm, Heritiera trifoliolata, Terrietia trifoliolata +cacao, cacao tree, chocolate tree, Theobroma cacao +sorrel tree, sourwood, titi, Oxydendrum arboreum +iron tree, iron-tree, ironwood, ironwood tree +mangrove, Rhizophora mangle +paper mulberry, Broussonetia papyrifera +Judas tree, love tree, Circis siliquastrum +redbud, Cercis canadensis +mountain ash +ailanthus +silver maple, Acer saccharinum +Oregon maple, big-leaf maple, Acer macrophyllum +sycamore, great maple, scottish maple, Acer pseudoplatanus +box elder, ash-leaved maple, Acer negundo +Japanese maple, full moon maple, Acer japonicum +holly +dogwood, dogwood tree, cornel +truffle, earthnut, earth-ball +shiitake, shiitake mushroom, Chinese black mushroom, golden oak mushroom, Oriental black mushroom, Lentinus edodes +lichen +hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +jelly fungus +dead-man's-fingers, dead-men's-fingers, Xylaria polymorpha +earthstar +coral fungus +stinkhorn, carrion fungus +puffball, true puffball +gyromitra +bolete +polypore, pore fungus, pore mushroom +gill fungus +morel +agaric +trilobite +harvestman, daddy longlegs, Phalangium opilio +scorpion +black and gold garden spider, Argiope aurantia +barn spider, Araneus cavaticus +garden spider, Aranea diademata +black widow, Latrodectus mactans +tarantula +wolf spider, hunting spider +tick +mite +centipede +millipede, millepede, milliped +horseshoe crab, king crab, Limulus polyphemus, Xiphosurus polyphemus +isopod +Dungeness crab, Cancer magister +rock crab, Cancer irroratus +fiddler crab +king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +American lobster, Northern lobster, Maine lobster, Homarus americanus +spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +crayfish, crawfish, crawdad, crawdaddy +hermit crab +shrimp +barnacle, cirriped, cirripede +tiger beetle +ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +ground beetle, carabid beetle +long-horned beetle, longicorn, longicorn beetle +leaf beetle, chrysomelid +weevil +fly +mosquito +ant, emmet, pismire +bee +grasshopper, hopper +cricket +walking stick, walkingstick, stick insect +cockroach, roach +mantis, mantid +cicada, cicala +leafhopper +mayfly, dayfly, shadfly +lacewing, lacewing fly +dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +damselfly +nymphalid, nymphalid butterfly, brush-footed butterfly, four-footed butterfly +ringlet, ringlet butterfly +monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +cabbage butterfly +sulphur butterfly, sulfur butterfly +lycaenid, lycaenid butterfly +moth +polyp +jellyfish +sea anemone, anemone +coral +flatworm, platyhelminth +nematode, nematode worm, roundworm +earthworm, angleworm, fishworm, fishing worm, wiggler, nightwalker, nightcrawler, crawler, dew worm, red worm +conch +snail +slug +sea slug, nudibranch +cowrie, cowry +chiton, 
coat-of-mail shell, sea cradle, polyplacophore +clam +mussel +chambered nautilus, pearly nautilus, nautilus +starfish, sea star +sea urchin +sea cucumber, holothurian +Egyptian cat +Persian cat +tiger cat +Siamese cat, Siamese +tabby, tabby cat +vizsla, Hungarian pointer +English setter +Gordon setter +Irish setter, red setter +Brittany spaniel +golden retriever +flat-coated retriever +Australian terrier +Yorkshire terrier +Staffordshire bullterrier, Staffordshire bull terrier +Scottish deerhound, deerhound +greyhound +Rhodesian ridgeback +dalmatian, coach dog, carriage dog +corgi, Welsh corgi +Newfoundland, Newfoundland dog +miniature poodle +standard poodle +basenji +griffon, Brussels griffon, Belgian griffon +Great Dane +Bouvier des Flandres, Bouviers des Flandres +Border collie +German shepherd, German shepherd dog, German police dog, alsatian +Doberman, Doberman pinscher +boxer +Eskimo dog, husky +Saint Bernard, St Bernard +Tibetan mastiff +Pekinese, Pekingese, Peke +Chihuahua +black grouse +ptarmigan +ruffed grouse, partridge, Bonasa umbellus +prairie chicken, prairie grouse, prairie fowl +pheasant +quail +partridge +hare +Angora, Angora rabbit +wood rabbit, cottontail, cottontail rabbit +indri, indris, Indri indri, Indri brevicaudatus +Madagascar cat, ring-tailed lemur, Lemur catta +orangutan, orang, orangutang, Pongo pygmaeus +chimpanzee, chimp, Pan troglodytes +gorilla, Gorilla gorilla +siamang, Hylobates syndactylus, Symphalangus syndactylus +gibbon, Hylobates lar +titi, titi monkey +squirrel monkey, Saimiri sciureus +spider monkey, Ateles geoffroyi +howler monkey, howler +marmoset +colobus, colobus monkey +guenon, guenon monkey +proboscis monkey, Nasalis larvatus +patas, hussar monkey, Erythrocebus patas +baboon +macaque +langur +African elephant, Loxodonta africana +Indian elephant, Elephas maximus +brown bear, bruin, Ursus arctos +ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +American black bear, black bear, Ursus americanus, Euarctos americanus +sloth bear, Melursus ursinus, Ursus ursinus +badger +mink +black-footed ferret, ferret, Mustela nigripes +skunk, polecat, wood pussy +weasel +polecat, fitch, foulmart, foumart, Mustela putorius +otter +lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +raccoon, racoon +meerkat, mierkat +mongoose +dingo, warrigal, warragal, Canis dingo +African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +hyena, hyaena +kit fox, Vulpes macrotis +red fox, Vulpes vulpes +grey fox, gray fox, Urocyon cinereoargenteus +Arctic fox, white fox, Alopex lagopus +red wolf, maned wolf, Canis rufus, Canis niger +coyote, prairie wolf, brush wolf, Canis latrans +white wolf, Arctic wolf, Canis lupus tundrarum +timber wolf, grey wolf, gray wolf, Canis lupus +cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +lynx, catamount +jaguar, panther, Panthera onca, Felis onca +tiger, Panthera tigris +leopard, Panthera pardus +snow leopard, ounce, Panthera uncia +lion, king of beasts, Panthera leo +cheetah, chetah, Acinonyx jubatus +loggerhead, loggerhead turtle, Caretta caretta +leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +mud turtle +terrapin +box turtle, box tortoise +banded gecko +common iguana, iguana, Iguana iguana +American chameleon, anole, Anolis carolinensis +whiptail, whiptail lizard +agama +frilled lizard, Chlamydosaurus kingi +alligator lizard +Gila monster, Heloderma suspectum +green lizard, Lacerta 
viridis +African chameleon, Chamaeleo chamaeleon +Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +thunder snake, worm snake, Carphophis amoenus +ringneck snake, ring-necked snake, ring snake +hognose snake, puff adder, sand viper +green snake, grass snake +king snake, kingsnake +garter snake, grass snake +water snake +vine snake +night snake, Hypsiglena torquata +boa constrictor, Constrictor constrictor +Indian cobra, Naja naja +green mamba +horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +wood-frog, wood frog, Rana sylvatica +leopard frog, spring frog, Rana pipiens +bullfrog, Rana catesbeiana +tree frog, tree-frog +tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +European fire salamander, Salamandra salamandra +common newt, Triturus vulgaris +axolotl, mud puppy, Ambystoma mexicanum +great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +tiger shark, Galeocerdo cuvieri +hammerhead, hammerhead shark +electric ray, crampfish, numbfish, torpedo +stingray +barracouta, snoek +coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +rainbow trout, Salmo gairdneri +striped bass, striper, Roccus saxatilis, rockfish +sturgeon +gar, garfish, garpike, billfish, Lepisosteus osseus +euphonium +accordion, piano accordion, squeeze box +grand piano, grand +upright, upright piano +chime, bell, gong +drum, membranophone, tympan +gong, tam-tam +maraca +marimba, xylophone +banjo +cello, violoncello +violin, fiddle +harp +acoustic guitar +electric guitar +cornet, horn, trumpet, trump +French horn, horn +trombone +harmonica, mouth organ, harp, mouth harp +organ pipe, pipe, pipework +panpipe, pandean pipe, syrinx +bassoon +oboe, hautboy, hautbois +sax, saxophone +flute, transverse flute +bell +whistle +paintbrush +hand blower, blow dryer, blow drier, hair dryer, hair drier +oxygen mask +snorkel +earphone, earpiece, headphone, phone +loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +microphone, mike +mouse, computer mouse +scanner, digital scanner, image scanner +trackball +electric fan, blower +guillotine +barometer +rule, ruler +odometer, hodometer, mileometer, milometer +scale, weighing machine +thermometer +watch, ticker +analog clock +digital clock +pendulum clock +hourglass +sundial +stethoscope +syringe +binoculars, field glasses, opera glasses +projector +hand glass, simple microscope, magnifying glass +optical telescope +radio telescope, radio reflector +missile +bow +cannon +machine gun +revolver, six-gun, six-shooter +rifle +sword, blade, brand, steel +computer keyboard, keypad +crane +lighter, light, igniter, ignitor +chain saw, chainsaw +circular saw, buzz saw +abacus +hand calculator, pocket calculator +cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +desktop computer +laptop, laptop computer +hand-held computer, hand-held microcomputer +web site, website, internet site, site +harvester, reaper +thresher, thrasher, threshing machine +printer +slot, one-armed bandit +vending machine +sewing machine +joystick +gas pump, gasoline pump, petrol pump, island dispenser +carousel, carrousel, merry-go-round, roundabout, whirligig +Ferris wheel +roller coaster, big dipper, chute-the-chute +hard disc, hard disk, fixed disk +flash memory +car mirror +solar dish, solar collector, solar furnace +remote control, remote +buckle +button +hair slide +paper clip, paperclip, gem clip +knot +combination lock 
+padlock +nail +pin +screw +seat belt, seatbelt +shredder +candle, taper, wax light +flash, photoflash, flash lamp, flashgun, flashbulb, flash bulb +Chinese lantern +jack-o'-lantern +mousetrap +spider web, spider's web +comb +ax, axe +chisel +cleaver, meat cleaver, chopper +pocketknife, pocket knife +plane, carpenter's plane, woodworking plane +safety razor +shaver, electric shaver, electric razor +straight razor +scissors, pair of scissors +drill +lawn mower, mower +hammer +cap opener +corkscrew, bottle screw +can opener, tin opener +plunger, plumber's helper +saw +screwdriver +shovel +trowel +adjustable wrench, adjustable spanner +plow, plough +rake +ceramic ware +frying pan, frypan, skillet +wok +caldron, cauldron +coffeepot +teapot +spatula +eggbeater, eggwhisk +swab, swob, mop +broom +toothbrush +rubber eraser, rubber, pencil eraser +pencil sharpener +fishing rod, fishing pole +quill, quill pen +ballpoint, ballpoint pen, ballpen, Biro +fountain pen +pencil +needle +drumstick +crutch +flagpole, flagstaff +matchstick +bicycle-built-for-two, tandem bicycle, tandem +mountain bike, all-terrain bike, off-roader +freight car +passenger car, coach, carriage +barrow, garden cart, lawn cart, wheelbarrow +shopping cart +motor scooter, scooter +tank, army tank, armored combat vehicle, armoured combat vehicle +forklift +electric locomotive +steam locomotive +amphibian, amphibious vehicle +ambulance +beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +cab, hack, taxi, taxicab +convertible +jeep, landrover +limousine, limo +minivan +Model T +racer, race car, racing car +go-kart +golfcart, golf cart +moped +snowplow, snowplough +fire engine, fire truck +garbage truck, dustcart +pickup, pickup truck +trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +recreational vehicle, RV, R.V. 
+half track +snowmobile +tractor +tricycle, trike, velocipede +unicycle, monocycle +horse cart, horse-cart +jinrikisha, ricksha, rickshaw +oxcart +covered wagon, Conestoga wagon, Conestoga, prairie wagon, prairie schooner +hamper +shopping basket +thimble +mailbag, postbag +backpack, back pack, knapsack, packsack, rucksack, haversack +sleeping bag +ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +cocktail shaker +saltshaker, salt shaker +washbasin, handbasin, washbowl, lavabo, wash-hand basin +vase +beaker +bucket, pail +mortar +beer bottle +ink bottle, inkpot +canteen +hipflask, pocket flask +water tower +rain barrel +bathtub, bathing tub, bath, tub +ladle +barrel, cask +pitcher, ewer +mug +envelope +beer glass +goblet +carton +chest +safe +crate +mailbox, letter box +measuring cup +wallet, billfold, notecase, pocketbook +bag, traveling bag, travelling bag, grip, suitcase +golf ball +baseball +balance beam, beam +horizontal bar, high bar +parallel bars, bars +basketball +croquet ball +barbell +dumbbell +in-line skate +ice skate +pool table, billiard table, snooker table +crossword puzzle, crossword +jigsaw puzzle +bowling pin, pin +chessman, chess piece +soccer ball +pool ball +rugby ball +volleyball +tennis ball +point-and-shoot camera +Polaroid camera, Polaroid Land camera +reflex camera +carpenter's kit, tool kit +drilling platform, offshore rig +iPod +oscilloscope, scope, cathode-ray oscilloscope, CRO +camcorder +monitor +cassette player +cellular telephone, cellular phone, cellphone, cell, mobile phone +pay-phone, pay-station +dial telephone, dial phone +tape player +CD player +photocopier +metronome +parachute, chute +brick +lipstick, lip rouge +nail polish, nail enamel, nail varnish +lotion +hair spray +bumper car, Dodgem +airliner +warplane, military plane +airship, dirigible +balloon +fireboat +gondola +speedboat +lifeboat +canoe +schooner +catamaran +trimaran +container ship, containership, container vessel +liner, ocean liner +aircraft carrier, carrier, flattop, attack aircraft carrier +submarine, pigboat, sub, U-boat +bobsled, bobsleigh, bob +dogsled, dog sled, dog sleigh +subway train +bullet train, bullet +trolleybus, trolley coach, trackless trolley +school bus +bassinet +crib, cot +four-poster +trundle bed, trundle, truckle bed, truckle +bookcase +china cabinet, china closet +medicine chest, medicine cabinet +lamp +file, file cabinet, filing cabinet +flat bench +park bench +pew, church bench +armchair +barber chair +folding chair +swivel chair +wheelchair +desk +table-tennis table, ping-pong table, pingpong table +dining table, board +entertainment center +wardrobe, closet, press +rug, carpet, carpeting +shower curtain +theater curtain, theatre curtain +newspaper, paper +magazine, mag +radio, wireless +television, television system +maze, labyrinth +triumphal arch +bridge, span +barn +greenhouse, nursery, glasshouse +library +garage +apiary, bee house +boathouse +church, church building +mosque +stupa, tope +planetarium +restaurant, eating house, eating place, eatery +skyscraper +cinema, movie theater, movie theatre, movie house, picture palace +home theater, home theatre +lumbermill, sawmill +coil, spiral, volute, whorl, helix +colonnade +obelisk +totem pole +castle +prison, prison house +grocery store, grocery, food market, market +barbershop +bookshop, bookstore, bookstall +butcher shop, meat market +confectionery, confectionary, candy store +shoe shop, shoe-shop, shoe store +tobacco shop, tobacconist shop, 
tobacconist +toyshop +fountain +house of cards, cardhouse, card-house, cardcastle +cliff dwelling +yurt +dock, dockage, docking facility +brass, memorial tablet, plaque +megalith, megalithic structure +bannister, banister, balustrade, balusters, handrail +breakwater, groin, groyne, mole, bulwark, seawall, jetty +dam, dike, dyke +chainlink fence +picket fence, paling +stone wall +grille, radiator grille +turnstile +tent, collapsible shelter +stadium, bowl, arena, sports stadium +honeycomb +buttress, buttressing +beacon, lighthouse, beacon light, pharos +silo +handkerchief, hankie, hanky, hankey +dishrag, dishcloth +paper towel +bath towel +bib +mask +shoji +sandal +running shoe +Loafer +boot +clog, geta, patten, sabot +sling, scarf bandage, triangular bandage +Band Aid +eyepatch, patch +quilt, comforter, comfort, puff +doormat, welcome mat +fancy dress, masquerade, masquerade costume +wig +collar +shin guard, shinpad +knee pad +apron +vestment +academic gown, academic robe, judge's robe +bulletproof vest +swimming trunks, bathing trunks +maillot, tank suit +bikini, two-piece +suit, suit of clothes +jean, blue jean, denim +pajama, pyjama +short pants, shorts, trunks +feather boa, boa +headscarf +stole +bow tie, bow-tie, bowtie +Windsor tie +bolo tie, bolo, bola tie, bola +hoopskirt, crinoline +sarong +poncho +fur coat +oilskin, slicker +lab coat, laboratory coat +pullover, slipover +cardigan +kimono +abaya +bathrobe +polo shirt, sport shirt +jersey, T-shirt, tee shirt +brassiere, bra, bandeau +diaper, nappy, napkin +mitten +crash helmet +football helmet +hard hat, tin hat, safety hat +bathing cap, swimming cap +mortarboard +shower cap +garrison cap, overseas cap +bonnet, poke bonnet +bearskin, busby, shako +cowboy hat, ten-gallon hat +sombrero +turban +maillot +sock +Christmas stocking +military uniform +pajama, pyjama, pj's, jammies +brace, suspender, gallus +jumper, pinafore, pinny +gown +pickelhaube +cannon +breastplate, aegis, egis +lens cap, lens cover +tile roof +dome +thatch, thatched roof +vault +Venetian blind +window shade +binder, ring-binder +lampshade, lamp shade +mosquito net +window screen +fire screen, fireguard +shield, buckler +cuirass +chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +gasmask, respirator, gas helmet +ski mask +holster +scabbard +birdhouse +bell cote, bell cot +umbrella +manhole cover +bottlecap +nipple +tumble-dryer, tumble drier +curling iron +iron, smoothing iron +espresso maker +hot plate, hotplate +ice maker +microwave, microwave oven +Dutch oven +rotisserie +electric range +Primus stove, Primus +toaster +waffle iron +vacuum, vacuum cleaner +dishwasher, dish washer, dishwashing machine +refrigerator, icebox +washer, automatic washer, washing machine diff --git a/samples/image-net-2012.sqlite3 b/samples/image-net-2012.sqlite3 new file mode 100644 index 000000000..67005c8c1 Binary files /dev/null and b/samples/image-net-2012.sqlite3 differ diff --git a/samples/image-net-2012.wnid b/samples/image-net-2012.wnid new file mode 100644 index 000000000..cc2d32614 --- /dev/null +++ b/samples/image-net-2012.wnid @@ -0,0 +1,1000 @@ +n02119789 +n02100735 +n02110185 +n02096294 +n02102040 +n02066245 +n02509815 +n02124075 +n02417914 +n02123394 +n02125311 +n02423022 +n02346627 +n02077923 +n02110063 +n02447366 +n02109047 +n02089867 +n02102177 +n02091134 +n02092002 +n02071294 +n02442845 +n02504458 +n02092339 +n02098105 +n02096437 +n02114712 +n02105641 +n02128925 +n02091635 +n02088466 +n02096051 +n02117135 +n02138441 +n02097130 +n02493509 
+n02457408 +n02389026 +n02443484 +n02110341 +n02089078 +n02086910 +n02445715 +n02093256 +n02113978 +n02106382 +n02441942 +n02113712 +n02113186 +n02105162 +n02415577 +n02356798 +n02488702 +n02123159 +n02098413 +n02422699 +n02114855 +n02094433 +n02111277 +n02132136 +n02119022 +n02091467 +n02106550 +n02422106 +n02091831 +n02120505 +n02104365 +n02086079 +n02112706 +n02098286 +n02095889 +n02484975 +n02137549 +n02500267 +n02129604 +n02090721 +n02396427 +n02108000 +n02391049 +n02412080 +n02108915 +n02480495 +n02110806 +n02128385 +n02107683 +n02085936 +n02094114 +n02087046 +n02100583 +n02096177 +n02494079 +n02105056 +n02101556 +n02123597 +n02481823 +n02105505 +n02088094 +n02085782 +n02489166 +n02364673 +n02114548 +n02134084 +n02480855 +n02090622 +n02113624 +n02093859 +n02403003 +n02097298 +n02108551 +n02493793 +n02107142 +n02096585 +n02107574 +n02107908 +n02086240 +n02102973 +n02112018 +n02093647 +n02397096 +n02437312 +n02483708 +n02097047 +n02106030 +n02099601 +n02093991 +n02110627 +n02106166 +n02326432 +n02108089 +n02097658 +n02088364 +n02111129 +n02100236 +n02486261 +n02115913 +n02486410 +n02487347 +n02099849 +n02108422 +n02104029 +n02492035 +n02110958 +n02099429 +n02094258 +n02099267 +n02395406 +n02112350 +n02109961 +n02101388 +n02113799 +n02095570 +n02128757 +n02101006 +n02115641 +n02097209 +n02342885 +n02097474 +n02120079 +n02095314 +n02088238 +n02408429 +n02133161 +n02328150 +n02410509 +n02492660 +n02398521 +n02112137 +n02510455 +n02093428 +n02105855 +n02111500 +n02085620 +n02123045 +n02490219 +n02099712 +n02109525 +n02454379 +n02111889 +n02088632 +n02090379 +n02443114 +n02361337 +n02105412 +n02483362 +n02437616 +n02107312 +n02325366 +n02091032 +n02129165 +n02102318 +n02100877 +n02074367 +n02504013 +n02363005 +n02102480 +n02113023 +n02086646 +n02497673 +n02087394 +n02127052 +n02116738 +n02488291 +n02091244 +n02114367 +n02130308 +n02089973 +n02105251 +n02134418 +n02093754 +n02106662 +n02444819 +n01882714 +n01871265 +n01872401 +n01877812 +n01873310 +n01883070 +n04086273 +n04507155 +n04147183 +n04254680 +n02672831 +n02219486 +n02317335 +n01968897 +n03452741 +n03642806 +n07745940 +n02690373 +n04552348 +n02692877 +n02782093 +n04266014 +n03344393 +n03447447 +n04273569 +n03662601 +n02951358 +n04612504 +n02981792 +n04483307 +n03095699 +n03673027 +n03947888 +n02687172 +n04347754 +n04606251 +n03478589 +n04389033 +n03773504 +n02860847 +n03218198 +n02835271 +n03792782 +n03393912 +n03895866 +n02797295 +n04204347 +n03791053 +n03384352 +n03272562 +n04310018 +n02704792 +n02701002 +n02814533 +n02930766 +n03100240 +n03594945 +n03670208 +n03770679 +n03777568 +n04037443 +n04285008 +n03444034 +n03445924 +n03785016 +n04252225 +n03345487 +n03417042 +n03930630 +n04461696 +n04467665 +n03796401 +n03977966 +n04065272 +n04335435 +n04252077 +n04465501 +n03776460 +n04482393 +n04509417 +n03538406 +n03599486 +n03868242 +n02804414 +n03125729 +n03131574 +n03388549 +n02870880 +n03018349 +n03742115 +n03016953 +n04380533 +n03337140 +n03891251 +n02791124 +n04429376 +n03376595 +n04099969 +n04344873 +n04447861 +n03179701 +n03982430 +n03201208 +n03290653 +n04550184 +n07742313 +n07747607 +n07749582 +n07753113 +n07753275 +n07753592 +n07754684 +n07760859 +n07768694 +n12267677 +n12620546 +n13133613 +n11879895 +n12144580 +n12768682 +n03854065 +n04515003 +n03017168 +n03249569 +n03447721 +n03720891 +n03721384 +n04311174 +n02787622 +n02992211 +n04536866 +n03495258 +n02676566 +n03272010 +n03110669 +n03394916 +n04487394 +n03494278 +n03840681 +n03884397 +n02804610 +n03838899 +n04141076 +n03372029 +n11939491 +n12057211 +n09246464 +n09468604 
+n09193705 +n09472597 +n09399592 +n09421951 +n09256479 +n09332890 +n09428293 +n09288635 +n03498962 +n03041632 +n03658185 +n03954731 +n03995372 +n03649909 +n03481172 +n03109150 +n02951585 +n03970156 +n04154565 +n04208210 +n03967562 +n03000684 +n01514668 +n01514859 +n01518878 +n01530575 +n01531178 +n01532829 +n01534433 +n01537544 +n01558993 +n01560419 +n01580077 +n01582220 +n01592084 +n01601694 +n01608432 +n01614925 +n01616318 +n01622779 +n01795545 +n01796340 +n01797886 +n01798484 +n01806143 +n01806567 +n01807496 +n01817953 +n01818515 +n01819313 +n01820546 +n01824575 +n01828970 +n01829413 +n01833805 +n01843065 +n01843383 +n01847000 +n01855032 +n01855672 +n01860187 +n02002556 +n02002724 +n02006656 +n02007558 +n02009912 +n02009229 +n02011460 +n02012849 +n02013706 +n02018207 +n02018795 +n02025239 +n02027492 +n02028035 +n02033041 +n02037110 +n02017213 +n02051845 +n02056570 +n02058221 +n01484850 +n01491361 +n01494475 +n01496331 +n01498041 +n02514041 +n02536864 +n01440764 +n01443537 +n02526121 +n02606052 +n02607072 +n02643566 +n02655020 +n02640242 +n02641379 +n01664065 +n01665541 +n01667114 +n01667778 +n01669191 +n01675722 +n01677366 +n01682714 +n01685808 +n01687978 +n01688243 +n01689811 +n01692333 +n01693334 +n01694178 +n01695060 +n01704323 +n01697457 +n01698640 +n01728572 +n01728920 +n01729322 +n01729977 +n01734418 +n01735189 +n01737021 +n01739381 +n01740131 +n01742172 +n01744401 +n01748264 +n01749939 +n01751748 +n01753488 +n01755581 +n01756291 +n01629819 +n01630670 +n01631663 +n01632458 +n01632777 +n01641577 +n01644373 +n01644900 +n04579432 +n04592741 +n03876231 +n03483316 +n03868863 +n04251144 +n03691459 +n03759954 +n04152593 +n03793489 +n03271574 +n03843555 +n04332243 +n04265275 +n04330267 +n03467068 +n02794156 +n04118776 +n03841143 +n04141975 +n02708093 +n03196217 +n04548280 +n03544143 +n04355338 +n03891332 +n04328186 +n03197337 +n04317175 +n04376876 +n03706229 +n02841315 +n04009552 +n04356056 +n03692522 +n04044716 +n02879718 +n02950826 +n02749479 +n04090263 +n04008634 +n03085013 +n04505470 +n03126707 +n03666591 +n02666196 +n02977058 +n04238763 +n03180011 +n03485407 +n03832673 +n06359193 +n03496892 +n04428191 +n04004767 +n04243546 +n04525305 +n04179913 +n03602883 +n04372370 +n03532672 +n02974003 +n03874293 +n03944341 +n03992509 +n03425413 +n02966193 +n04371774 +n04067472 +n04040759 +n04019541 +n03492542 +n04355933 +n03929660 +n02965783 +n04258138 +n04074963 +n03208938 +n02910353 +n03476684 +n03627232 +n03075370 +n03874599 +n03804744 +n04127249 +n04153751 +n03803284 +n04162706 +n04228054 +n02948072 +n03590841 +n04286575 +n04456115 +n03814639 +n03933933 +n04485082 +n03733131 +n03794056 +n04275548 +n01768244 +n01770081 +n01770393 +n01773157 +n01773549 +n01773797 +n01774384 +n01774750 +n01775062 +n01776313 +n01784675 +n01990800 +n01978287 +n01978455 +n01980166 +n01981276 +n01983481 +n01984695 +n01985128 +n01986214 +n02165105 +n02165456 +n02167151 +n02168699 +n02169497 +n02172182 +n02174001 +n02177972 +n02190166 +n02206856 +n02226429 +n02229544 +n02231487 +n02233338 +n02236044 +n02256656 +n02259212 +n02264363 +n02268443 +n02268853 +n02276258 +n02277742 +n02279972 +n02280649 +n02281406 +n02281787 +n01910747 +n01914609 +n01917289 +n01924916 +n01930112 +n01943899 +n01944390 +n01945685 +n01950731 +n01955084 +n02319095 +n02321529 +n03584829 +n03297495 +n03761084 +n03259280 +n04111531 +n04442312 +n04542943 +n04517823 +n03207941 +n04070727 +n04554684 +n03133878 +n03400231 +n04596742 +n02939185 +n03063689 +n04398044 +n04270147 +n02699494 +n04486054 +n03899768 +n04311004 +n04366367 +n04532670 +n02793495 
+n03457902 +n03877845 +n03781244 +n03661043 +n02727426 +n02859443 +n03028079 +n03788195 +n04346328 +n03956157 +n04081281 +n03032252 +n03529860 +n03697007 +n03065424 +n03837869 +n04458633 +n02980441 +n04005630 +n03461385 +n02776631 +n02791270 +n02871525 +n02927161 +n03089624 +n04200800 +n04443257 +n04462240 +n03388043 +n03042490 +n04613696 +n03216828 +n02892201 +n03743016 +n02788148 +n02894605 +n03160309 +n03000134 +n03930313 +n04604644 +n04326547 +n03459775 +n04239074 +n04501370 +n03792972 +n04149813 +n03530642 +n03961711 +n03903868 +n02814860 +n07711569 +n07720875 +n07714571 +n07714990 +n07715103 +n07716358 +n07716906 +n07717410 +n07717556 +n07718472 +n07718747 +n07730033 +n07734744 +n04209239 +n03594734 +n02971356 +n03485794 +n04133789 +n02747177 +n04125021 +n07579787 +n03814906 +n03134739 +n03404251 +n04423845 +n03877472 +n04120489 +n03062245 +n03014705 +n03717622 +n03777754 +n04493381 +n04476259 +n02777292 +n07693725 +n03998194 +n03617480 +n07590611 +n04579145 +n03623198 +n07248320 +n04277352 +n04229816 +n02823428 +n03127747 +n02877765 +n04435653 +n03724870 +n03710637 +n03920288 +n03379051 +n02807133 +n04399382 +n03527444 +n03983396 +n03924679 +n04532106 +n06785654 +n03445777 +n07613480 +n04350905 +n04562935 +n03325584 +n03045698 +n07892512 +n03250847 +n04192698 +n03026506 +n03534580 +n07565083 +n04296562 +n02869837 +n07871810 +n02799071 +n03314780 +n04141327 +n04357314 +n02823750 +n13052670 +n07583066 +n03637318 +n04599235 +n07802026 +n02883205 +n03709823 +n04560804 +n02909870 +n03207743 +n04263257 +n07932039 +n03786901 +n04479046 +n03873416 +n02999410 +n04367480 +n03775546 +n07875152 +n04591713 +n04201297 +n02916936 +n03240683 +n02840245 +n02963159 +n04370456 +n03991062 +n02843684 +n03482405 +n03942813 +n03908618 +n03902125 +n07584110 +n02730930 +n04023962 +n02769748 +n10148035 +n02817516 +n03908714 +n02906734 +n03788365 +n02667093 +n03787032 +n03980874 +n03141823 +n03976467 +n04264628 +n07930864 +n04039381 +n06874185 +n04033901 +n04041544 +n07860988 +n03146219 +n03763968 +n03676483 +n04209133 +n03782006 +n03857828 +n03775071 +n02892767 +n07684084 +n04522168 +n03764736 +n04118538 +n03887697 +n13044778 +n03291819 +n03770439 +n03124170 +n04487081 +n03916031 +n02808440 +n07697537 +n12985857 +n02917067 +n03938244 +n15075141 +n02978881 +n02966687 +n03633091 +n13040303 +n03690938 +n03476991 +n02669723 +n03220513 +n03127925 +n04584207 +n07880968 +n03937543 +n03000247 +n04418357 +n04590129 +n02795169 +n04553703 +n02783161 +n02802426 +n02808304 +n03124043 +n03450230 +n04589890 +n12998815 +n02992529 +n03825788 +n02790996 +n03710193 +n03630383 +n03347037 +n03769881 +n03871628 +n03733281 +n03976657 +n03535780 +n04259630 +n03929855 +n04049303 +n04548362 +n02979186 +n06596364 +n03935335 +n06794110 +n02825657 +n03388183 +n04591157 +n04540053 +n03866082 +n04136333 +n04026417 +n02865351 +n02834397 +n03888257 +n04235860 +n04404412 +n04371430 +n03733805 +n07920052 +n07873807 +n02895154 +n04204238 +n04597913 +n04131690 +n07836838 +n09835506 +n03443371 +n13037406 +n04336792 +n04557648 +n03187595 +n04254120 +n03595614 +n04146614 +n03598930 +n03958227 +n04069434 +n03188531 +n02786058 +n07615774 +n04525038 +n04409515 +n03424325 +n03223299 +n03680355 +n07614500 +n07695742 +n04033995 +n03710721 +n04392985 +n03047690 +n03584254 +n13054560 +n10565667 +n03950228 +n03729826 +n02837789 +n04254777 +n02988304 +n03657121 +n04417672 +n04523525 +n02815834 +n09229709 +n07697313 +n03888605 +n03355925 +n03063599 +n04116512 +n04325704 +n07831146 +n03255030 diff --git a/samples/image-net-2012.words 
b/samples/image-net-2012.words new file mode 100644 index 000000000..e23efa24a --- /dev/null +++ b/samples/image-net-2012.words @@ -0,0 +1,1000 @@ +kit fox, Vulpes macrotis +English setter +Siberian husky +Australian terrier +English springer, English springer spaniel +grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +Egyptian cat +ibex, Capra ibex +Persian cat +cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +gazelle +porcupine, hedgehog +sea lion +malamute, malemute, Alaskan malamute +badger +Great Dane +Walker hound, Walker foxhound +Welsh springer spaniel +whippet +Scottish deerhound, deerhound +killer whale, killer, orca, grampus, sea wolf, Orcinus orca +mink +African elephant, Loxodonta africana +Weimaraner +soft-coated wheaten terrier +Dandie Dinmont, Dandie Dinmont terrier +red wolf, maned wolf, Canis rufus, Canis niger +Old English sheepdog, bobtail +jaguar, panther, Panthera onca, Felis onca +otterhound, otter hound +bloodhound, sleuthhound +Airedale, Airedale terrier +hyena, hyaena +meerkat, mierkat +giant schnauzer +titi, titi monkey +three-toed sloth, ai, Bradypus tridactylus +sorrel +black-footed ferret, ferret, Mustela nigripes +dalmatian, coach dog, carriage dog +black-and-tan coonhound +papillon +skunk, polecat, wood pussy +Staffordshire bullterrier, Staffordshire bull terrier +Mexican hairless +Bouvier des Flandres, Bouviers des Flandres +weasel +miniature poodle +Cardigan, Cardigan Welsh corgi +malinois +bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +fox squirrel, eastern fox squirrel, Sciurus niger +colobus, colobus monkey +tiger cat +Lhasa, Lhasa apso +impala, Aepyceros melampus +coyote, prairie wolf, brush wolf, Canis latrans +Yorkshire terrier +Newfoundland, Newfoundland dog +brown bear, bruin, Ursus arctos +red fox, Vulpes vulpes +Norwegian elkhound, elkhound +Rottweiler +hartebeest +Saluki, gazelle hound +grey fox, gray fox, Urocyon cinereoargenteus +schipperke +Pekinese, Pekingese, Peke +Brabancon griffon +West Highland white terrier +Sealyham terrier, Sealyham +guenon, guenon monkey +mongoose +indri, indris, Indri indri, Indri brevicaudatus +tiger, Panthera tigris +Irish wolfhound +wild boar, boar, Sus scrofa +EntleBucher +zebra +ram, tup +French bulldog +orangutan, orang, orangutang, Pongo pygmaeus +basenji +leopard, Panthera pardus +Bernese mountain dog +Maltese dog, Maltese terrier, Maltese +Norfolk terrier +toy terrier +vizsla, Hungarian pointer +cairn, cairn terrier +squirrel monkey, Saimiri sciureus +groenendael +clumber, clumber spaniel +Siamese cat, Siamese +chimpanzee, chimp, Pan troglodytes +komondor +Afghan hound, Afghan +Japanese spaniel +proboscis monkey, Nasalis larvatus +guinea pig, Cavia cobaya +white wolf, Arctic wolf, Canis lupus tundrarum +ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +gorilla, Gorilla gorilla +borzoi, Russian wolfhound +toy poodle +Kerry blue terrier +ox +Scotch terrier, Scottish terrier, Scottie +Tibetan mastiff +spider monkey, Ateles geoffroyi +Doberman, Doberman pinscher +Boston bull, Boston terrier +Greater Swiss Mountain dog +Appenzeller +Shih-Tzu +Irish water spaniel +Pomeranian +Bedlington terrier +warthog +Arabian camel, dromedary, Camelus dromedarius +siamang, Hylobates syndactylus, Symphalangus syndactylus +miniature schnauzer +collie +golden retriever +Irish terrier +affenpinscher, monkey pinscher, monkey dog +Border collie +hare 
+boxer +silky terrier, Sydney silky +beagle +Leonberg +German short-haired pointer +patas, hussar monkey, Erythrocebus patas +dhole, Cuon alpinus +baboon +macaque +Chesapeake Bay retriever +bull mastiff +kuvasz +capuchin, ringtail, Cebus capucinus +pug, pug-dog +curly-coated retriever +Norwich terrier +flat-coated retriever +hog, pig, grunter, squealer, Sus scrofa +keeshond +Eskimo dog, husky +Brittany spaniel +standard poodle +Lakeland terrier +snow leopard, ounce, Panthera uncia +Gordon setter +dingo, warrigal, warragal, Canis dingo +standard schnauzer +hamster +Tibetan terrier, chrysanthemum dog +Arctic fox, white fox, Alopex lagopus +wire-haired fox terrier +basset, basset hound +water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +American black bear, black bear, Ursus americanus, Euarctos americanus +Angora, Angora rabbit +bison +howler monkey, howler +hippopotamus, hippo, river horse, Hippopotamus amphibius +chow, chow chow +giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +Shetland sheepdog, Shetland sheep dog, Shetland +Great Pyrenees +Chihuahua +tabby, tabby cat +marmoset +Labrador retriever +Saint Bernard, St Bernard +armadillo +Samoyed, Samoyede +bluetick +redbone +polecat, fitch, foulmart, foumart, Mustela putorius +marmot +kelpie +gibbon, Hylobates lar +llama +miniature pinscher +wood rabbit, cottontail, cottontail rabbit +Italian greyhound +lion, king of beasts, Panthera leo +cocker spaniel, English cocker spaniel, cocker +Irish setter, red setter +dugong, Dugong dugon +Indian elephant, Elephas maximus +beaver +Sussex spaniel +Pembroke, Pembroke Welsh corgi +Blenheim spaniel +Madagascar cat, ring-tailed lemur, Lemur catta +Rhodesian ridgeback +lynx, catamount +African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +langur +Ibizan hound, Ibizan Podenco +timber wolf, grey wolf, gray wolf, Canis lupus +cheetah, chetah, Acinonyx jubatus +English foxhound +briard +sloth bear, Melursus ursinus, Ursus ursinus +Border terrier +German shepherd, German shepherd dog, German police dog, alsatian +otter +koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +tusker +echidna, spiny anteater, anteater +wallaby, brush kangaroo +platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +wombat +revolver, six-gun, six-shooter +umbrella +schooner +soccer ball +accordion, piano accordion, squeeze box +ant, emmet, pismire +starfish, sea star +chambered nautilus, pearly nautilus, nautilus +grand piano, grand +laptop, laptop computer +strawberry +airliner +warplane, military plane +airship, dirigible +balloon +space shuttle +fireboat +gondola +speedboat +lifeboat +canoe +yawl +catamaran +trimaran +container ship, containership, container vessel +liner, ocean liner +pirate, pirate ship +aircraft carrier, carrier, flattop, attack aircraft carrier +submarine, pigboat, sub, U-boat +wreck +half track +tank, army tank, armored combat vehicle, armoured combat vehicle +missile +bobsled, bobsleigh, bob +dogsled, dog sled, dog sleigh +bicycle-built-for-two, tandem bicycle, tandem +mountain bike, all-terrain bike, off-roader +freight car +passenger car, coach, carriage +barrow, garden cart, lawn cart, wheelbarrow +shopping cart +motor scooter, scooter +forklift +electric locomotive +steam locomotive +amphibian, amphibious vehicle +ambulance +beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +cab, 
hack, taxi, taxicab +convertible +jeep, landrover +limousine, limo +minivan +Model T +racer, race car, racing car +sports car, sport car +go-kart +golfcart, golf cart +moped +snowplow, snowplough +fire engine, fire truck +garbage truck, dustcart +pickup, pickup truck +tow truck, tow car, wrecker +trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +moving van +police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +recreational vehicle, RV, R.V. +streetcar, tram, tramcar, trolley, trolley car +snowmobile +tractor +mobile home, manufactured home +tricycle, trike, velocipede +unicycle, monocycle +horse cart, horse-cart +jinrikisha, ricksha, rickshaw +oxcart +bassinet +cradle +crib, cot +four-poster +bookcase +china cabinet, china closet +medicine chest, medicine cabinet +chiffonier, commode +table lamp +file, file cabinet, filing cabinet +park bench +barber chair +throne +folding chair +rocking chair, rocker +studio couch, day bed +toilet seat +desk +pool table, billiard table, snooker table +dining table, board +entertainment center +wardrobe, closet, press +Granny Smith +orange +lemon +fig +pineapple, ananas +banana +jackfruit, jak, jack +custard apple +pomegranate +acorn +hip, rose hip, rosehip +ear, spike, capitulum +rapeseed +corn +buckeye, horse chestnut, conker +organ, pipe organ +upright, upright piano +chime, bell, gong +drum, membranophone, tympan +gong, tam-tam +maraca +marimba, xylophone +steel drum +banjo +cello, violoncello +violin, fiddle +harp +acoustic guitar +electric guitar +cornet, horn, trumpet, trump +French horn, horn +trombone +harmonica, mouth organ, harp, mouth harp +ocarina, sweet potato +panpipe, pandean pipe, syrinx +bassoon +oboe, hautboy, hautbois +sax, saxophone +flute, transverse flute +daisy +yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +cliff, drop, drop-off +valley, vale +alp +volcano +promontory, headland, head, foreland +sandbar, sand bar +coral reef +lakeside, lakeshore +seashore, coast, seacoast, sea-coast +geyser +hatchet +cleaver, meat cleaver, chopper +letter opener, paper knife, paperknife +plane, carpenter's plane, woodworking plane +power drill +lawn mower, mower +hammer +corkscrew, bottle screw +can opener, tin opener +plunger, plumber's helper +screwdriver +shovel +plow, plough +chain saw, chainsaw +cock +hen +ostrich, Struthio camelus +brambling, Fringilla montifringilla +goldfinch, Carduelis carduelis +house finch, linnet, Carpodacus mexicanus +junco, snowbird +indigo bunting, indigo finch, indigo bird, Passerina cyanea +robin, American robin, Turdus migratorius +bulbul +jay +magpie +chickadee +water ouzel, dipper +kite +bald eagle, American eagle, Haliaeetus leucocephalus +vulture +great grey owl, great gray owl, Strix nebulosa +black grouse +ptarmigan +ruffed grouse, partridge, Bonasa umbellus +prairie chicken, prairie grouse, prairie fowl +peacock +quail +partridge +African grey, African gray, Psittacus erithacus +macaw +sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +lorikeet +coucal +bee eater +hornbill +hummingbird +jacamar +toucan +drake +red-breasted merganser, Mergus serrator +goose +black swan, Cygnus atratus +white stork, Ciconia ciconia +black stork, Ciconia nigra +spoonbill +flamingo +American egret, great white heron, Egretta albus +little blue heron, Egretta caerulea +bittern +crane +limpkin, Aramus pictus +American coot, marsh hen, mud hen, water hen, Fulica americana +bustard +ruddy turnstone, Arenaria interpres +red-backed 
sandpiper, dunlin, Erolia alpina +redshank, Tringa totanus +dowitcher +oystercatcher, oyster catcher +European gallinule, Porphyrio porphyrio +pelican +king penguin, Aptenodytes patagonica +albatross, mollymawk +great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +tiger shark, Galeocerdo cuvieri +hammerhead, hammerhead shark +electric ray, crampfish, numbfish, torpedo +stingray +barracouta, snoek +coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +tench, Tinca tinca +goldfish, Carassius auratus +eel +rock beauty, Holocanthus tricolor +anemone fish +lionfish +puffer, pufferfish, blowfish, globefish +sturgeon +gar, garfish, garpike, billfish, Lepisosteus osseus +loggerhead, loggerhead turtle, Caretta caretta +leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +mud turtle +terrapin +box turtle, box tortoise +banded gecko +common iguana, iguana, Iguana iguana +American chameleon, anole, Anolis carolinensis +whiptail, whiptail lizard +agama +frilled lizard, Chlamydosaurus kingi +alligator lizard +Gila monster, Heloderma suspectum +green lizard, Lacerta viridis +African chameleon, Chamaeleo chamaeleon +Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +triceratops +African crocodile, Nile crocodile, Crocodylus niloticus +American alligator, Alligator mississipiensis +thunder snake, worm snake, Carphophis amoenus +ringneck snake, ring-necked snake, ring snake +hognose snake, puff adder, sand viper +green snake, grass snake +king snake, kingsnake +garter snake, grass snake +water snake +vine snake +night snake, Hypsiglena torquata +boa constrictor, Constrictor constrictor +rock python, rock snake, Python sebae +Indian cobra, Naja naja +green mamba +sea snake +horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +diamondback, diamondback rattlesnake, Crotalus adamanteus +sidewinder, horned rattlesnake, Crotalus cerastes +European fire salamander, Salamandra salamandra +common newt, Triturus vulgaris +eft +spotted salamander, Ambystoma maculatum +axolotl, mud puppy, Ambystoma mexicanum +bullfrog, Rana catesbeiana +tree frog, tree-frog +tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +whistle +wing +paintbrush +hand blower, blow dryer, blow drier, hair dryer, hair drier +oxygen mask +snorkel +loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +microphone, mike +screen, CRT screen +mouse, computer mouse +electric fan, blower +oil filter +strainer +space heater +stove +guillotine +barometer +rule, ruler +odometer, hodometer, mileometer, milometer +scale, weighing machine +analog clock +digital clock +wall clock +hourglass +sundial +parking meter +stopwatch, stop watch +digital watch +stethoscope +syringe +magnetic compass +binoculars, field glasses, opera glasses +projector +sunglasses, dark glasses, shades +loupe, jeweler's loupe +radio telescope, radio reflector +bow +cannon +assault rifle, assault gun +rifle +projectile, missile +computer keyboard, keypad +typewriter keyboard +crane +lighter, light, igniter, ignitor +abacus +cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +slide rule, slipstick +desktop computer +hand-held computer, hand-held microcomputer +notebook, notebook computer +web site, website, internet site, site +harvester, reaper +thresher, thrasher, threshing machine +printer +slot, one-armed bandit +vending machine +sewing machine +joystick +switch, electric 
switch, electrical switch +hook, claw +car wheel +paddlewheel, paddle wheel +pinwheel +potter's wheel +gas pump, gasoline pump, petrol pump, island dispenser +carousel, carrousel, merry-go-round, roundabout, whirligig +swing +reel +radiator +puck, hockey puck +hard disc, hard disk, fixed disk +sunglass +pick, plectrum, plectron +car mirror +solar dish, solar collector, solar furnace +remote control, remote +disk brake, disc brake +buckle +hair slide +knot +combination lock +padlock +nail +safety pin +screw +muzzle +seat belt, seatbelt +ski +candle, taper, wax light +jack-o'-lantern +spotlight, spot +torch +neck brace +pier +tripod +maypole +mousetrap +spider web, spider's web +trilobite +harvestman, daddy longlegs, Phalangium opilio +scorpion +black and gold garden spider, Argiope aurantia +barn spider, Araneus cavaticus +garden spider, Aranea diademata +black widow, Latrodectus mactans +tarantula +wolf spider, hunting spider +tick +centipede +isopod +Dungeness crab, Cancer magister +rock crab, Cancer irroratus +fiddler crab +king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +American lobster, Northern lobster, Maine lobster, Homarus americanus +spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +crayfish, crawfish, crawdad, crawdaddy +hermit crab +tiger beetle +ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +ground beetle, carabid beetle +long-horned beetle, longicorn, longicorn beetle +leaf beetle, chrysomelid +dung beetle +rhinoceros beetle +weevil +fly +bee +grasshopper, hopper +cricket +walking stick, walkingstick, stick insect +cockroach, roach +mantis, mantid +cicada, cicala +leafhopper +lacewing, lacewing fly +dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +damselfly +admiral +ringlet, ringlet butterfly +monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +cabbage butterfly +sulphur butterfly, sulfur butterfly +lycaenid, lycaenid butterfly +jellyfish +sea anemone, anemone +brain coral +flatworm, platyhelminth +nematode, nematode worm, roundworm +conch +snail +slug +sea slug, nudibranch +chiton, coat-of-mail shell, sea cradle, polyplacophore +sea urchin +sea cucumber, holothurian +iron, smoothing iron +espresso maker +microwave, microwave oven +Dutch oven +rotisserie +toaster +waffle iron +vacuum, vacuum cleaner +dishwasher, dish washer, dishwashing machine +refrigerator, icebox +washer, automatic washer, washing machine +Crock Pot +frying pan, frypan, skillet +wok +caldron, cauldron +coffeepot +teapot +spatula +altar +triumphal arch +patio, terrace +steel arch bridge +suspension bridge +viaduct +barn +greenhouse, nursery, glasshouse +palace +monastery +library +apiary, bee house +boathouse +church, church building +mosque +stupa, tope +planetarium +restaurant, eating house, eating place, eatery +cinema, movie theater, movie theatre, movie house, picture palace +home theater, home theatre +lumbermill, sawmill +coil, spiral, volute, whorl, helix +obelisk +totem pole +castle +prison, prison house +grocery store, grocery, food market, market +bakery, bakeshop, bakehouse +barbershop +bookshop, bookstore, bookstall +butcher shop, meat market +confectionery, confectionary, candy store +shoe shop, shoe-shop, shoe store +tobacco shop, tobacconist shop, tobacconist +toyshop +fountain +cliff dwelling +yurt +dock, dockage, docking facility +brass, memorial tablet, plaque +megalith, megalithic structure +bannister, banister, balustrade, 
balusters, handrail +breakwater, groin, groyne, mole, bulwark, seawall, jetty +dam, dike, dyke +chainlink fence +picket fence, paling +worm fence, snake fence, snake-rail fence, Virginia fence +stone wall +grille, radiator grille +sliding door +turnstile +mountain tent +scoreboard +honeycomb +plate rack +pedestal, plinth, footstall +beacon, lighthouse, beacon light, pharos +mashed potato +bell pepper +head cabbage +broccoli +cauliflower +zucchini, courgette +spaghetti squash +acorn squash +butternut squash +cucumber, cuke +artichoke, globe artichoke +cardoon +mushroom +shower curtain +jean, blue jean, denim +carton +handkerchief, hankie, hanky, hankey +sandal +ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +safe +plate +necklace +croquet ball +fur coat +thimble +pajama, pyjama, pj's, jammies +running shoe +cocktail shaker +chest +manhole cover +modem +tub, vat +tray +balance beam, beam +bagel, beigel +prayer rug, prayer mat +kimono +hot pot, hotpot +whiskey jug +knee pad +book jacket, dust cover, dust jacket, dust wrapper +spindle +ski mask +beer bottle +crash helmet +bottlecap +tile roof +mask +maillot +Petri dish +football helmet +bathing cap, swimming cap +teddy, teddy bear +holster +pop bottle, soda bottle +photocopier +vestment +crossword puzzle, crossword +golf ball +trifle +suit, suit of clothes +water tower +feather boa, boa +cloak +red wine +drumstick +shield, buckler +Christmas stocking +hoopskirt, crinoline +menu +stage +bonnet, poke bonnet +meat loaf, meatloaf +baseball +face powder +scabbard +sunscreen, sunblock, sun blocker +beer glass +hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +guacamole +lampshade, lamp shade +wool, woolen, woollen +hay +bow tie, bow-tie, bowtie +mailbag, postbag +water jug +bucket, pail +dishrag, dishcloth +soup bowl +eggnog +mortar +trench coat +paddle, boat paddle +chain +swab, swob, mop +mixing bowl +potpie +wine bottle +shoji +bulletproof vest +drilling platform, offshore rig +binder, ring-binder +cardigan +sweatshirt +pot, flowerpot +birdhouse +hamper +ping-pong ball +pencil box, pencil case +pay-phone, pay-station +consomme +apron +punching bag, punch bag, punching ball, punchball +backpack, back pack, knapsack, packsack, rucksack, haversack +groom, bridegroom +bearskin, busby, shako +pencil sharpener +broom +mosquito net +abaya +mortarboard +poncho +crutch +Polaroid camera, Polaroid Land camera +space bar +cup +racket, racquet +traffic light, traffic signal, stoplight +quill, quill pen +radio, wireless +dough +cuirass +military uniform +lipstick, lip rouge +shower cap +monitor +oscilloscope, scope, cathode-ray oscilloscope, CRO +mitten +brassiere, bra, bandeau +French loaf +vase +milk can +rugby ball +paper towel +earthstar +envelope +miniskirt, mini +cowboy hat, ten-gallon hat +trolleybus, trolley coach, trackless trolley +perfume, essence +bathtub, bathing tub, bath, tub +hotdog, hot dog, red hot +coral fungus +bullet train, bullet +pillow +toilet tissue, toilet paper, bathroom tissue +cassette +carpenter's kit, tool kit +ladle +stinkhorn, carrion fungus +lotion +hair spray +academic gown, academic robe, judge's robe +dome +crate +wig +burrito +pill bottle +chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +theater curtain, theatre curtain +window shade +barrel, cask +washbasin, handbasin, washbowl, lavabo, wash-hand basin +ballpoint, ballpoint pen, ballpen, Biro +basketball +bath towel +cowboy boot +gown +window screen +agaric 
+cellular telephone, cellular phone, cellphone, cell, mobile phone +nipple +barbell +mailbox, letter box +lab coat, laboratory coat +fire screen, fireguard +minibus +packet +maze, labyrinth +pole +horizontal bar, high bar +sombrero +pickelhaube +rain barrel +wallet, billfold, notecase, pocketbook +cassette player +comic book +piggy bank, penny bank +street sign +bell cote, bell cot +fountain pen +Windsor tie +volleyball +overskirt +sarong +purse +bolo tie, bolo, bola tie, bola +bib +parachute, chute +sleeping bag +television, television system +swimming trunks, bathing trunks +measuring cup +espresso +pizza, pizza pie +breastplate, aegis, egis +shopping basket +wooden spoon +saltshaker, salt shaker +chocolate sauce, chocolate syrup +ballplayer, baseball player +goblet +gyromitra +stretcher +water bottle +dial telephone, dial phone +soap dispenser +jersey, T-shirt, tee shirt +school bus +jigsaw puzzle +plastic bag +reflex camera +diaper, nappy, napkin +Band Aid +ice lolly, lolly, lollipop, popsicle +velvet +tennis ball +gasmask, respirator, gas helmet +doormat, welcome mat +Loafer +ice cream, icecream +pretzel +quilt, comforter, comfort, puff +maillot, tank suit +tape player +clog, geta, patten, sabot +iPod +bolete +scuba diver +pitcher, ewer +matchstick +bikini, two-piece +sock +CD player +lens cap, lens cover +thatch, thatched roof +vault +beaker +bubble +cheeseburger +parallel bars, bars +flagpole, flagstaff +coffee mug +rubber eraser, rubber, pencil eraser +stole +carbonara +dumbbell diff --git a/serve/bbf.c b/serve/bbf.c index b739617b2..e4c311724 100644 --- a/serve/bbf.c +++ b/serve/bbf.c @@ -7,21 +7,31 @@ static void uri_bbf_on_model_string(void* context, char* string); static void uri_bbf_on_source_blob(void* context, ebb_buf data); +typedef struct { + ccv_bbf_param_t params; + int max_dimension; +} ccv_bbf_uri_param_t; + static const param_dispatch_t param_map[] = { { .property = "accurate", .type = PARAM_TYPE_BOOL, - .offset = offsetof(ccv_bbf_param_t, accurate), + .offset = offsetof(ccv_bbf_uri_param_t, params) + offsetof(ccv_bbf_param_t, accurate), }, { .property = "interval", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_bbf_param_t, interval), + .offset = offsetof(ccv_bbf_uri_param_t, params) + offsetof(ccv_bbf_param_t, interval), + }, + { + .property = "max_dimension", + .type = PARAM_TYPE_INT, + .offset = offsetof(ccv_bbf_uri_param_t, max_dimension), }, { .property = "min_neighbors", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_bbf_param_t, min_neighbors), + .offset = offsetof(ccv_bbf_uri_param_t, params) + offsetof(ccv_bbf_param_t, min_neighbors), }, { .property = "model", @@ -29,11 +39,6 @@ static const param_dispatch_t param_map[] = { .on_string = uri_bbf_on_model_string, .offset = 0, }, - { - .property = "size", - .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_bbf_param_t, min_neighbors), - }, { .property = "source", .type = PARAM_TYPE_BODY, @@ -50,7 +55,7 @@ typedef struct { typedef struct { param_parser_t param_parser; bbf_context_t* context; - ccv_bbf_param_t params; + ccv_bbf_uri_param_t params; ccv_bbf_classifier_cascade_t* cascade; ebb_buf source; } bbf_param_parser_t; @@ -58,7 +63,8 @@ typedef struct { static void uri_bbf_param_parser_init(bbf_param_parser_t* parser) { param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->params, parser); - parser->params = ccv_bbf_default_params; + parser->params.params = ccv_bbf_default_params; + parser->params.max_dimension = 0; parser->cascade = 0; 
parser->source.data = 0; } @@ -108,10 +114,10 @@ void* uri_bbf_detect_objects_init(void) assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0); context->desc = param_parser_map_http_body(param_map, sizeof(param_map) / sizeof(param_dispatch_t), "[{" - "\"x\":\"integer\"," - "\"y\":\"integer\"," - "\"width\":\"integer\"," - "\"height\":\"integer\"," + "\"x\":\"number\"," + "\"y\":\"number\"," + "\"width\":\"number\"," + "\"height\":\"number\"," "\"confidence\":\"number\"" "}]"); return context; @@ -158,8 +164,16 @@ int uri_bbf_detect_objects(const void* context, const void* parsed, ebb_buf* buf free(parser); return -1; } - ccv_array_t* seq = ccv_bbf_detect_objects(image, &parser->cascade, 1, parser->params); - ccv_matrix_free(image); + ccv_dense_matrix_t* resize = 0; + if (parser->params.max_dimension > 0 && (image->rows > parser->params.max_dimension || image->cols > parser->params.max_dimension)) + { + ccv_resample(image, &resize, 0, ccv_min(parser->params.max_dimension, (int)(image->rows * (float)parser->params.max_dimension / image->cols + 0.5)), ccv_min(parser->params.max_dimension, (int)(image->cols * (float)parser->params.max_dimension / image->rows + 0.5)), CCV_INTER_AREA); + ccv_matrix_free(image); + } else + resize = image; + ccv_array_t* seq = ccv_bbf_detect_objects(resize, &parser->cascade, 1, parser->params.params); + float width = resize->cols, height = resize->rows; + ccv_matrix_free(resize); if (seq == 0) { free(parser); @@ -176,7 +190,7 @@ int uri_bbf_detect_objects(const void* context, const void* parsed, ebb_buf* buf { char cell[128]; ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - snprintf(cell, 128, "{\"x\":%d,\"y\":%d,\"width\":%d,\"height\":%d,\"confidence\":%f}", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + snprintf(cell, 128, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"confidence\":%f}", comp->rect.x / width, comp->rect.y / height, comp->rect.width / width, comp->rect.height / height, comp->classification.confidence); size_t len = strnlen(cell, 128); while (buf->written + len + 1 >= buf->len) { @@ -188,7 +202,7 @@ int uri_bbf_detect_objects(const void* context, const void* parsed, ebb_buf* buf data[buf->written - 1] = (i == seq->rnum - 1) ? 
']' : ','; } char http_header[192]; - snprintf(http_header, 192, ebb_http_header, buf->written); + snprintf(http_header, 192, ebb_http_header, buf->written + 1); size_t len = strnlen(http_header, 192); if (buf->written + len + 1 >= buf->len) { diff --git a/serve/convnet.c b/serve/convnet.c new file mode 100644 index 000000000..6d515912d --- /dev/null +++ b/serve/convnet.c @@ -0,0 +1,246 @@ +#include "uri.h" +#include "ccv.h" +#include <string.h> +#include <ctype.h> +#include <stdlib.h> + +static void uri_convnet_on_model_string(void* context, char* string); +static void uri_convnet_on_source_blob(void* context, ebb_buf data); + +static const param_dispatch_t param_map[] = { + { + .property = "model", + .type = PARAM_TYPE_STRING, + .on_string = uri_convnet_on_model_string, + .offset = 0, + }, + { + .property = "source", + .type = PARAM_TYPE_BODY, + .on_blob = uri_convnet_on_source_blob, + .offset = 0, + }, + { + .property = "top", + .type = PARAM_TYPE_INT, + .offset = 0, + }, +}; + +typedef struct { + ccv_convnet_t* convnet; + ccv_array_t* words; +} convnet_and_words_t; + +typedef struct { + ebb_buf desc; + convnet_and_words_t image_net[2]; +} convnet_context_t; + +typedef struct { + param_parser_t param_parser; + convnet_context_t* context; + int top; + convnet_and_words_t* convnet_and_words; + ebb_buf source; +} convnet_param_parser_t; + +static void uri_convnet_param_parser_init(convnet_param_parser_t* parser) +{ + param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->top, parser); + parser->top = 5; + parser->convnet_and_words = 0; + parser->source.data = 0; +} + +static void uri_convnet_on_model_string(void* context, char* string) +{ + convnet_param_parser_t* parser = (convnet_param_parser_t*)context; + if (strcmp(string, "image-net-2010") == 0) + parser->convnet_and_words = &parser->context->image_net[0]; + else if (strcmp(string, "image-net-2012") == 0) + parser->convnet_and_words = &parser->context->image_net[1]; +} + +static void uri_convnet_on_source_blob(void* context, ebb_buf data) +{ + convnet_param_parser_t* parser = (convnet_param_parser_t*)context; + parser->source = data; +} + +// read class labels from a words file, one label per line, into an array of char* +static ccv_array_t* uri_convnet_words_read(char* filename) +{ + FILE* r = fopen(filename, "rt"); + if(r) + { + ccv_array_t* words = ccv_array_new(sizeof(char*), 32, 0); + size_t len = 1024; + char* word = (char*)malloc(len); + ssize_t read; + while((read = getline(&word, &len, r)) != -1) + { + while(read > 1 && isspace(word[read - 1])) + read--; + word[read] = 0; + char* new_word = (char*)malloc(sizeof(char) * (read + 1)); + memcpy(new_word, word, sizeof(char) * (read + 1)); + ccv_array_push(words, &new_word); + } + free(word); + return words; + } + return 0; +} + +void* uri_convnet_classify_init(void) +{ + convnet_context_t* context = (convnet_context_t*)malloc(sizeof(convnet_context_t)); + context->image_net[0].convnet = ccv_convnet_read(0, "../samples/image-net-2010.sqlite3"); + assert(context->image_net[0].convnet); + context->image_net[0].words = uri_convnet_words_read("../samples/image-net-2010.words"); + assert(context->image_net[0].words); + context->image_net[1].convnet = ccv_convnet_read(0, "../samples/image-net-2012.sqlite3"); + assert(context->image_net[1].convnet); + context->image_net[1].words = uri_convnet_words_read("../samples/image-net-2012.words"); + assert(context->image_net[1].words); + assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0); + context->desc = param_parser_map_http_body(param_map, sizeof(param_map) /
sizeof(param_dispatch_t), + "[{" + "\"word\":\"string\"," + "\"confidence\":\"number\"" + "}]"); + return context; +} + +void uri_convnet_classify_destroy(void* context) +{ + convnet_context_t* convnet_context = (convnet_context_t*)context; + int i, j; + for (i = 0; i < 2; i++) + { + ccv_convnet_free(convnet_context->image_net[i].convnet); + for (j = 0; j < convnet_context->image_net[i].words->rnum; j++) + { + // each array element is a char* pointer; free the string it points to + char* word = *(char**)ccv_array_get(convnet_context->image_net[i].words, j); + free(word); + } + ccv_array_free(convnet_context->image_net[i].words); + } + free(convnet_context->desc.data); + free(convnet_context); +} + +void* uri_convnet_classify_parse(const void* context, void* parsed, int resource_id, const char* buf, size_t len, uri_parse_state_t state, int header_index) +{ + convnet_param_parser_t* parser; + if (parsed) + parser = (convnet_param_parser_t*)parsed; + else { + parser = (convnet_param_parser_t*)malloc(sizeof(convnet_param_parser_t)); + uri_convnet_param_parser_init(parser); + parser->context = (convnet_context_t*)context; + } + switch (state) + { + case URI_QUERY_STRING: + case URI_CONTENT_BODY: + case URI_PARSE_TERMINATE: + case URI_MULTIPART_HEADER_FIELD: + case URI_MULTIPART_HEADER_VALUE: + case URI_MULTIPART_DATA: + param_parser_execute(&parser->param_parser, resource_id, buf, len, state, header_index); + break; + } + return parser; +} + +int uri_convnet_classify_intro(const void* context, const void* parsed, ebb_buf* buf) +{ + convnet_context_t* convnet_context = (convnet_context_t*)context; + buf->data = convnet_context->desc.data; + buf->len = convnet_context->desc.len; + return 0; +} + +int uri_convnet_classify(const void* context, const void* parsed, ebb_buf* buf) +{ + if (!parsed) + return -1; + convnet_param_parser_t* parser = (convnet_param_parser_t*)parsed; + param_parser_terminate(&parser->param_parser); + if (parser->source.data == 0) + { + free(parser); + return -1; + } + if (parser->convnet_and_words == 0) + { + free(parser->source.data); + free(parser); + return -1; + } + if (parser->top <= 0 || parser->top > parser->convnet_and_words->words->rnum) + { + free(parser->source.data); + free(parser); + return -1; + } + ccv_dense_matrix_t* image = 0; + ccv_read(parser->source.data, &image, CCV_IO_ANY_STREAM | CCV_IO_RGB_COLOR, parser->source.written); + free(parser->source.data); + if (image == 0) + { + free(parser); + return -1; + } + ccv_convnet_t* convnet = parser->convnet_and_words->convnet; + ccv_dense_matrix_t* input = 0; + ccv_convnet_input_formation(convnet, image, &input); + ccv_matrix_free(image); + ccv_array_t* rank = 0; + ccv_convnet_classify(convnet, &input, 1, &rank, parser->top, 1); + // serialize the top-n classifications as a JSON array + buf->len = 192 + rank->rnum * 30 + 2; + char* data = (char*)malloc(buf->len); + data[0] = '['; + buf->written = 1; + int i; + for (i = 0; i < rank->rnum; i++) + { + char cell[1024]; + ccv_classification_t* classification = (ccv_classification_t*)ccv_array_get(rank, i); + char* word = *(char**)ccv_array_get(parser->convnet_and_words->words, classification->id); + snprintf(cell, 1024, "{\"word\":\"%s\",\"confidence\":%f}", word, classification->confidence); + size_t len = strnlen(cell, 1024); + while (buf->written + len + 1 >= buf->len) + { + buf->len = (buf->len * 3 + 1) / 2; + data = (char*)realloc(data, buf->len); + } + memcpy(data + buf->written, cell, len); + buf->written += len + 1; + data[buf->written - 1] = (i == rank->rnum - 1) ?
']' : ','; + } + // copy the http header + char http_header[192]; + snprintf(http_header, 192, ebb_http_header, buf->written + 1); + size_t len = strnlen(http_header, 192); + if (buf->written + len + 1 >= buf->len) + { + buf->len = buf->written + len + 1; + data = (char*)realloc(data, buf->len); + } + memmove(data + len, data, buf->written); + memcpy(data, http_header, len); + buf->written += len + 1; + data[buf->written - 1] = '\n'; + buf->data = data; + buf->len = buf->written; + buf->on_release = uri_ebb_buf_free; + ccv_array_free(rank); + ccv_matrix_free(input); + free(parser); + return 0; +} diff --git a/serve/dpm.c b/serve/dpm.c index 6e53d33f4..7765253ec 100644 --- a/serve/dpm.c +++ b/serve/dpm.c @@ -7,16 +7,26 @@ static void uri_dpm_on_model_string(void* context, char* string); static void uri_dpm_on_source_blob(void* context, ebb_buf data); +typedef struct { + ccv_dpm_param_t params; + int max_dimension; +} ccv_dpm_uri_param_t; + static const param_dispatch_t param_map[] = { { .property = "interval", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_dpm_param_t, interval), + .offset = offsetof(ccv_dpm_uri_param_t, params) + offsetof(ccv_dpm_param_t, interval), + }, + { + .property = "max_dimension", + .type = PARAM_TYPE_INT, + .offset = offsetof(ccv_dpm_uri_param_t, max_dimension), }, { .property = "min_neighbors", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_dpm_param_t, min_neighbors), + .offset = offsetof(ccv_dpm_uri_param_t, params) + offsetof(ccv_dpm_param_t, min_neighbors), }, { .property = "model", @@ -33,7 +43,7 @@ static const param_dispatch_t param_map[] = { { .property = "threshold", .type = PARAM_TYPE_FLOAT, - .offset = offsetof(ccv_dpm_param_t, threshold), + .offset = offsetof(ccv_dpm_uri_param_t, params) + offsetof(ccv_dpm_param_t, threshold), }, }; @@ -46,7 +56,7 @@ typedef struct { typedef struct { param_parser_t param_parser; dpm_context_t* context; - ccv_dpm_param_t params; + ccv_dpm_uri_param_t params; ccv_dpm_mixture_model_t* mixture_model; ebb_buf source; } dpm_param_parser_t; @@ -54,7 +64,8 @@ typedef struct { static void uri_dpm_param_parser_init(dpm_param_parser_t* parser) { param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->params, parser); - parser->params = ccv_dpm_default_params; + parser->params.params = ccv_dpm_default_params; + parser->params.max_dimension = 0; parser->mixture_model = 0; parser->source.data = 0; } @@ -107,16 +118,16 @@ void* uri_dpm_detect_objects_init(void) assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0); context->desc = param_parser_map_http_body(param_map, sizeof(param_map) / sizeof(param_dispatch_t), "[{" - "\"x\":\"integer\"," - "\"y\":\"integer\"," - "\"width\":\"integer\"," - "\"height\":\"integer\"," + "\"x\":\"number\"," + "\"y\":\"number\"," + "\"width\":\"number\"," + "\"height\":\"number\"," "\"confidence\":\"number\"," "\"parts\":[{" - "\"x\":\"integer\"," - "\"y\":\"integer\"," - "\"width\":\"integer\"," - "\"height\":\"integer\"," + "\"x\":\"number\"," + "\"y\":\"number\"," + "\"width\":\"number\"," + "\"height\":\"number\"," "\"confidence\":\"number\"" "}]" "}]"); @@ -165,8 +176,16 @@ int uri_dpm_detect_objects(const void* context, const void* parsed, ebb_buf* buf free(parser); return -1; } - ccv_array_t* seq = ccv_dpm_detect_objects(image, &parser->mixture_model, 1, parser->params); - ccv_matrix_free(image); + ccv_dense_matrix_t* resize = 0; + if (parser->params.max_dimension > 0 && (image->rows > 
parser->params.max_dimension || image->cols > parser->params.max_dimension)) + { + ccv_resample(image, &resize, 0, ccv_min(parser->params.max_dimension, (int)(image->rows * (float)parser->params.max_dimension / image->cols + 0.5)), ccv_min(parser->params.max_dimension, (int)(image->cols * (float)parser->params.max_dimension / image->rows + 0.5)), CCV_INTER_AREA); + ccv_matrix_free(image); + } else + resize = image; + ccv_array_t* seq = ccv_dpm_detect_objects(resize, &parser->mixture_model, 1, parser->params.params); + float width = resize->cols, height = resize->rows; + ccv_matrix_free(resize); if (seq == 0) { free(parser); @@ -183,7 +202,7 @@ int uri_dpm_detect_objects(const void* context, const void* parsed, ebb_buf* buf { char cell[128]; ccv_root_comp_t* comp = (ccv_root_comp_t*)ccv_array_get(seq, i); - snprintf(cell, 128, "{\"x\":%d,\"y\":%d,\"width\":%d,\"height\":%d,\"confidence\":%f,\"parts\":[", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + snprintf(cell, 128, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"confidence\":%f,\"parts\":[", comp->rect.x / width, comp->rect.y / height, comp->rect.width / width, comp->rect.height / height, comp->classification.confidence); size_t len = strnlen(cell, 128); while (buf->written + len >= buf->len) { @@ -194,7 +213,7 @@ int uri_dpm_detect_objects(const void* context, const void* parsed, ebb_buf* buf buf->written += len; for (j = 0; j < comp->pnum; j++) { - snprintf(cell, 128, "{\"x\":%d,\"y\":%d,\"width\":%d,\"height\":%d,\"confidence\":%f}", comp->part[j].rect.x, comp->part[j].rect.y, comp->part[j].rect.width, comp->part[j].rect.height, comp->part[j].confidence); + snprintf(cell, 128, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"confidence\":%f}", comp->part[j].rect.x / width, comp->part[j].rect.y / height, comp->part[j].rect.width / width, comp->part[j].rect.height / height, comp->part[j].classification.confidence); len = strnlen(cell, 128); while (buf->written + len + 3 >= buf->len) { @@ -210,7 +229,7 @@ int uri_dpm_detect_objects(const void* context, const void* parsed, ebb_buf* buf data[buf->written - 1] = (i == seq->rnum - 1) ? 
']' : ','; } char http_header[192]; - snprintf(http_header, 192, ebb_http_header, buf->written); + snprintf(http_header, 192, ebb_http_header, buf->written + 1); size_t len = strnlen(http_header, 192); if (buf->written + len + 1 >= buf->len) { diff --git a/serve/icf.c b/serve/icf.c index 1e7ba12c7..c95de431f 100644 --- a/serve/icf.c +++ b/serve/icf.c @@ -7,16 +7,26 @@ static void uri_icf_on_model_string(void* context, char* string); static void uri_icf_on_source_blob(void* context, ebb_buf data); +typedef struct { + ccv_icf_param_t params; + int max_dimension; +} ccv_icf_uri_param_t; + static const param_dispatch_t param_map[] = { { .property = "interval", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_icf_param_t, interval), + .offset = offsetof(ccv_icf_uri_param_t, params) + offsetof(ccv_icf_param_t, interval), + }, + { + .property = "max_dimension", + .type = PARAM_TYPE_INT, + .offset = offsetof(ccv_icf_uri_param_t, max_dimension), }, { .property = "min_neighbors", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_icf_param_t, min_neighbors), + .offset = offsetof(ccv_icf_uri_param_t, params) + offsetof(ccv_icf_param_t, min_neighbors), }, { .property = "model", @@ -33,7 +43,7 @@ static const param_dispatch_t param_map[] = { { .property = "step_through", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_icf_param_t, step_through), + .offset = offsetof(ccv_icf_uri_param_t, params) + offsetof(ccv_icf_param_t, step_through), }, }; @@ -45,7 +55,7 @@ typedef struct { typedef struct { param_parser_t param_parser; icf_context_t* context; - ccv_icf_param_t params; + ccv_icf_uri_param_t params; ccv_icf_classifier_cascade_t* cascade; ebb_buf source; } icf_param_parser_t; @@ -53,7 +63,8 @@ typedef struct { static void uri_icf_param_parser_init(icf_param_parser_t* parser) { param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->params, parser); - parser->params = ccv_icf_default_params; + parser->params.params = ccv_icf_default_params; + parser->params.max_dimension = 0; parser->cascade = 0; parser->source.data = 0; } @@ -103,10 +114,10 @@ void* uri_icf_detect_objects_init(void) assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0); context->desc = param_parser_map_http_body(param_map, sizeof(param_map) / sizeof(param_dispatch_t), "[{" - "\"x\":\"integer\"," - "\"y\":\"integer\"," - "\"width\":\"integer\"," - "\"height\":\"integer\"," + "\"x\":\"number\"," + "\"y\":\"number\"," + "\"width\":\"number\"," + "\"height\":\"number\"," "\"confidence\":\"number\"" "}]"); return context; @@ -153,8 +164,16 @@ int uri_icf_detect_objects(const void* context, const void* parsed, ebb_buf* buf free(parser); return -1; } - ccv_array_t* seq = ccv_icf_detect_objects(image, &parser->cascade, 1, parser->params); - ccv_matrix_free(image); + ccv_dense_matrix_t* resize = 0; + if (parser->params.max_dimension > 0 && (image->rows > parser->params.max_dimension || image->cols > parser->params.max_dimension)) + { + ccv_resample(image, &resize, 0, ccv_min(parser->params.max_dimension, (int)(image->rows * (float)parser->params.max_dimension / image->cols + 0.5)), ccv_min(parser->params.max_dimension, (int)(image->cols * (float)parser->params.max_dimension / image->rows + 0.5)), CCV_INTER_AREA); + ccv_matrix_free(image); + } else + resize = image; + ccv_array_t* seq = ccv_icf_detect_objects(resize, &parser->cascade, 1, parser->params.params); + float width = resize->cols, height = resize->rows; + ccv_matrix_free(resize); if (seq == 0) { 
free(parser); @@ -171,7 +190,7 @@ int uri_icf_detect_objects(const void* context, const void* parsed, ebb_buf* buf { char cell[128]; ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i); - snprintf(cell, 128, "{\"x\":%d,\"y\":%d,\"width\":%d,\"height\":%d,\"confidence\":%f}", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->confidence); + snprintf(cell, 128, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"confidence\":%f}", comp->rect.x / width, comp->rect.y / height, comp->rect.width / width, comp->rect.height / height, comp->classification.confidence); size_t len = strnlen(cell, 128); while (buf->written + len + 1 >= buf->len) { @@ -183,7 +202,7 @@ int uri_icf_detect_objects(const void* context, const void* parsed, ebb_buf* buf data[buf->written - 1] = (i == seq->rnum - 1) ? ']' : ','; } char http_header[192]; - snprintf(http_header, 192, ebb_http_header, buf->written); + snprintf(http_header, 192, ebb_http_header, buf->written + 1); size_t len = strnlen(http_header, 192); if (buf->written + len + 1 >= buf->len) { diff --git a/serve/makefile b/serve/makefile index f7212c6d3..0b99993d1 100644 --- a/serve/makefile +++ b/serve/makefile @@ -10,8 +10,8 @@ all: libccv.a $(TARGETS) clean: ${MAKE} clean -C ../lib ; rm -f *.o $(TARGETS) -f -$(TARGETS): serve.o uri.o parsers.o bbf.o dpm.o icf.o sift.o swt.o tld.o async.o ebb.o ebb_request_parser.o libccv.a - $(CC) -o $@ serve.o uri.o parsers.o bbf.o dpm.o icf.o sift.o swt.o tld.o async.o ebb.o ebb_request_parser.o $(LDFLAGS) +$(TARGETS): serve.o uri.o parsers.o bbf.o dpm.o icf.o sift.o swt.o tld.o convnet.o async.o ebb.o ebb_request_parser.o libccv.a + $(CC) -o $@ serve.o uri.o parsers.o bbf.o dpm.o icf.o sift.o swt.o tld.o convnet.o async.o ebb.o ebb_request_parser.o $(LDFLAGS) libccv.a: ${MAKE} -C ../lib diff --git a/serve/sift.c b/serve/sift.c index f05dd2db8..fe1fb61ae 100644 --- a/serve/sift.c +++ b/serve/sift.c @@ -188,7 +188,7 @@ int uri_sift(const void* context, const void* parsed, ebb_buf* buf) f32 += 128; } char http_header[192]; - snprintf(http_header, 192, ebb_http_header, buf->written); + snprintf(http_header, 192, ebb_http_header, buf->written + 1); size_t len = strnlen(http_header, 192); if (buf->written + len + 1 >= buf->len) { diff --git a/serve/swt.c b/serve/swt.c index 230b403f3..a9c1c3c5c 100644 --- a/serve/swt.c +++ b/serve/swt.c @@ -3,104 +3,117 @@ #include #include #include +#ifdef HAVE_TESSERACT +#include +#endif static void uri_swt_on_source_blob(void* context, ebb_buf data); +typedef struct { + ccv_swt_param_t params; + int max_dimension; +} ccv_swt_uri_param_t; + static const param_dispatch_t param_map[] = { { .property = "aspect_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, aspect_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, aspect_ratio), }, { .property = "breakdown", .type = PARAM_TYPE_BOOL, - .offset = offsetof(ccv_swt_param_t, breakdown), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, breakdown), }, { .property = "breakdown_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, breakdown_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, breakdown_ratio), }, { .property = "distance_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, distance_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, distance_ratio), }, { .property = "elongate_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = 
offsetof(ccv_swt_param_t, elongate_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, elongate_ratio), }, { .property = "height_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, height_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, height_ratio), }, { .property = "high_thresh", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, high_thresh), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, high_thresh), }, { .property = "intensity_thresh", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, intensity_thresh), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, intensity_thresh), }, { .property = "intersect_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, intersect_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, intersect_ratio), }, { .property = "interval", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, interval), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, interval), }, { .property = "letter_occlude_thresh", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, letter_occlude_thresh), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, letter_occlude_thresh), }, { .property = "letter_thresh", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, letter_thresh), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, letter_thresh), }, { .property = "low_thresh", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, low_thresh), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, low_thresh), + }, + { + .property = "max_dimension", + .type = PARAM_TYPE_INT, + .offset = offsetof(ccv_swt_uri_param_t, max_dimension), }, { .property = "max_height", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, max_height), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, max_height), }, { .property = "min_area", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, min_area), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_area), }, { .property = "min_height", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, min_height), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_height), }, { .property = "min_neighbors", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, min_neighbors), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, min_neighbors), }, { .property = "scale_invariant", .type = PARAM_TYPE_BOOL, - .offset = offsetof(ccv_swt_param_t, scale_invariant), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, scale_invariant), }, { .property = "size", .type = PARAM_TYPE_INT, - .offset = offsetof(ccv_swt_param_t, size), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, size), }, { .property = "source", @@ -111,35 +124,44 @@ static const param_dispatch_t param_map[] = { { .property = "std_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, std_ratio), + .offset = offsetof(ccv_swt_uri_param_t, params) + offsetof(ccv_swt_param_t, std_ratio), }, { .property = "thickness_ratio", .type = PARAM_TYPE_DOUBLE, - .offset = offsetof(ccv_swt_param_t, thickness_ratio), + .offset = offsetof(ccv_swt_uri_param_t, 
params) + offsetof(ccv_swt_param_t, thickness_ratio), }, }; typedef struct { ebb_buf desc; +#ifdef HAVE_TESSERACT + TessBaseAPI* tesseract; +#endif } swt_context_t; typedef struct { param_parser_t param_parser; - ccv_swt_param_t params; + ccv_swt_uri_param_t params; ebb_buf source; + swt_context_t* context; } swt_param_parser_t; void* uri_swt_detect_words_init(void) { assert(param_parser_map_alphabet(param_map, sizeof(param_map) / sizeof(param_dispatch_t)) == 0); swt_context_t* context = (swt_context_t*)malloc(sizeof(swt_context_t)); +#ifdef HAVE_TESSERACT + context->tesseract = TessBaseAPICreate(); + if (TessBaseAPIInit3(context->tesseract, 0, "eng") != 0) + context->tesseract = 0; +#endif context->desc = param_parser_map_http_body(param_map, sizeof(param_map) / sizeof(param_dispatch_t), "[{" - "\"x\":\"integer\"," - "\"y\":\"integer\"," - "\"width\":\"integer\"," - "\"height\":\"integer\"" + "\"x\":\"number\"," + "\"y\":\"number\"," + "\"width\":\"number\"," + "\"height\":\"number\"" "}]"); return context; } @@ -147,6 +169,9 @@ void* uri_swt_detect_words_init(void) void uri_swt_detect_words_destroy(void* context) { swt_context_t* swt_context = (swt_context_t*)context; +#ifdef HAVE_TESSERACT + TessBaseAPIDelete(swt_context->tesseract); +#endif free(swt_context->desc.data); free(swt_context); } @@ -154,7 +179,8 @@ void uri_swt_detect_words_destroy(void* context) static void uri_swt_param_parser_init(swt_param_parser_t* parser) { param_parser_init(&parser->param_parser, param_map, sizeof(param_map) / sizeof(param_dispatch_t), &parser->params, parser); - parser->params = ccv_swt_default_params; + parser->params.params = ccv_swt_default_params; + parser->params.max_dimension = 0; parser->source.data = 0; } @@ -171,6 +197,7 @@ void* uri_swt_detect_words_parse(const void* context, void* parsed, int resource parser = (swt_param_parser_t*)parsed; else { parser = (swt_param_parser_t*)malloc(sizeof(swt_param_parser_t)); + parser->context = (swt_context_t*)context; uri_swt_param_parser_init(parser); } switch (state) @@ -214,10 +241,18 @@ int uri_swt_detect_words(const void* context, const void* parsed, ebb_buf* buf) free(parser); return -1; } - ccv_array_t* seq = ccv_swt_detect_words(image, parser->params); - ccv_matrix_free(image); + ccv_dense_matrix_t* resize = 0; + if (parser->params.max_dimension > 0 && (image->rows > parser->params.max_dimension || image->cols > parser->params.max_dimension)) + { + ccv_resample(image, &resize, 0, ccv_min(parser->params.max_dimension, (int)(image->rows * (float)parser->params.max_dimension / image->cols + 0.5)), ccv_min(parser->params.max_dimension, (int)(image->cols * (float)parser->params.max_dimension / image->rows + 0.5)), CCV_INTER_AREA); + ccv_matrix_free(image); + } else + resize = image; + ccv_array_t* seq = ccv_swt_detect_words(resize, parser->params.params); + float width = resize->cols, height = resize->rows; if (seq == 0) { + ccv_matrix_free(resize); free(parser); return -1; } @@ -230,10 +265,39 @@ int uri_swt_detect_words(const void* context, const void* parsed, ebb_buf* buf) buf->written = 1; for (i = 0; i < seq->rnum; i++) { - char cell[96]; + char cell[1024]; ccv_rect_t* rect = (ccv_rect_t*)ccv_array_get(seq, i); - snprintf(cell, 96, "{\"x\":%d,\"y\":%d,\"width\":%d,\"height\":%d}", rect->x, rect->y, rect->width, rect->height); - size_t len = strnlen(cell, 96); +#ifdef HAVE_TESSERACT + if (parser->context->tesseract) + { + char empty[] = ""; + char* word = TessBaseAPIRect(parser->context->tesseract, resize->data.u8, 1, resize->step, rect->x, 
rect->y, rect->width, rect->height); + if (!word) + word = empty; + int wordlen = strlen(word); // trust tesseract to return correct thing + int j; + for (j = 0; j < wordlen; j++) + if (!((word[j] >= 'a' && word[j] <= 'z') || + (word[j] >= 'A' && word[j] <= 'Z') || + (word[j] >= '0' && word[j] <= '9') || + word[j] == ' ' || + word[j] == '-')) // replace unsupported char to whitespace + word[j] = ' '; + for (j = wordlen - 1; j >= 0 && word[j] == ' '; j--); // remove trailing whitespace + word[j + 1] = 0, wordlen = j + 1; + for (j = 0; j < wordlen && word[j] == ' '; j++); // remove leading whitespace + wordlen -= j; + memmove(word, word + j, wordlen + 1); + if (wordlen > 512) // if the wordlen is greater than 512, trim it + word[512] = 0; + snprintf(cell, 1024, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f,\"word\":\"%s\"}", rect->x / width, rect->y / height, rect->width / width, rect->height / height, word); + } else { +#endif + snprintf(cell, 1024, "{\"x\":%f,\"y\":%f,\"width\":%f,\"height\":%f}", rect->x / width, rect->y / height, rect->width / width, rect->height / height); +#ifdef HAVE_TESSERACT + } +#endif + size_t len = strnlen(cell, 1024); while (buf->written + len + 1 >= buf->len) { buf->len = (buf->len * 3 + 1) / 2; @@ -244,7 +308,7 @@ int uri_swt_detect_words(const void* context, const void* parsed, ebb_buf* buf) data[buf->written - 1] = (i == seq->rnum - 1) ? ']' : ','; } char http_header[192]; - snprintf(http_header, 192, ebb_http_header, buf->written); + snprintf(http_header, 192, ebb_http_header, buf->written + 1); size_t len = strnlen(http_header, 192); if (buf->written + len + 1 >= buf->len) { @@ -263,6 +327,7 @@ int uri_swt_detect_words(const void* context, const void* parsed, ebb_buf* buf) buf->len = sizeof(ebb_http_empty_array); buf->on_release = 0; } + ccv_matrix_free(resize); ccv_array_free(seq); free(parser); return 0; diff --git a/serve/tld.c b/serve/tld.c index b9bff22e8..7e974a2e9 100644 --- a/serve/tld.c +++ b/serve/tld.c @@ -454,7 +454,7 @@ int uri_tld_track_object(const void* context, const void* parsed, ebb_buf* buf) "\"close_matches\":%d" "}}\n", parser->uri_params.tld, - box.rect.x, box.rect.y, box.rect.width, box.rect.height, box.confidence, + box.rect.x, box.rect.y, box.rect.width, box.rect.height, box.classification.confidence, info.perform_track ? "true" : "false", info.perform_learn ? "true" : "false", info.track_success ? 
"true" : "false", diff --git a/serve/uri.c b/serve/uri.c index 6bc53ff6e..18b2b2baa 100644 --- a/serve/uri.c +++ b/serve/uri.c @@ -4,7 +4,7 @@ #include #include -const char ebb_http_header[] = "HTTP/1.1 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: %zd\r\n\r\n"; +const char ebb_http_header[] = "HTTP/1.0 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: %zd\r\n\r\n"; void uri_ebb_buf_free(ebb_buf* buf) { @@ -33,6 +33,15 @@ static uri_dispatch_t uri_map[] = { .delete = 0, .destroy = uri_bbf_detect_objects_destroy, }, + { + .uri = "/convnet/classify", + .init = uri_convnet_classify_init, + .parse = uri_convnet_classify_parse, + .get = uri_convnet_classify_intro, + .post = uri_convnet_classify, + .delete = 0, + .destroy = uri_convnet_classify_destroy, + }, { .uri = "/dpm/detect.objects", .init = uri_dpm_detect_objects_init, diff --git a/serve/uri.h b/serve/uri.h index 1c0d2750a..f931e402f 100644 --- a/serve/uri.h +++ b/serve/uri.h @@ -5,10 +5,10 @@ #include /* have to be static const char so that can use sizeof */ -static const char ebb_http_404[] = "HTTP/1.1 404 Not Found\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 6\r\n\r\nfalse\n"; -static const char ebb_http_empty_object[] = "HTTP/1.1 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 3\r\n\r\n{}\n"; -static const char ebb_http_empty_array[] = "HTTP/1.1 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 3\r\n\r\n[]\n"; -static const char ebb_http_ok_true[] = "HTTP/1.1 200 OK\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 5\r\n\r\ntrue\n"; +static const char ebb_http_404[] = "HTTP/1.0 404 Not Found\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 6\r\n\r\nfalse\n"; +static const char ebb_http_empty_object[] = "HTTP/1.0 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 3\r\n\r\n{}\n"; +static const char ebb_http_empty_array[] = "HTTP/1.0 201 Created\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 3\r\n\r\n[]\n"; +static const char ebb_http_ok_true[] = "HTTP/1.0 200 OK\r\nCache-Control: no-cache\r\nContent-Type: application/json; charset=utf-8\r\nContent-Length: 5\r\n\r\ntrue\n"; /* we should never sizeof ebb_http_header */ extern const char ebb_http_header[]; @@ -247,4 +247,10 @@ int uri_tld_track_object_intro(const void* context, const void* parsed, ebb_buf* int uri_tld_track_object(const void* context, const void* parsed, ebb_buf* buf); int uri_tld_track_object_free(const void* context, const void* parsed, ebb_buf* buf); +void* uri_convnet_classify_init(void); +void uri_convnet_classify_destroy(void* context); +void* uri_convnet_classify_parse(const void* context, void* parsed, int resource_id, const char* buf, size_t len, uri_parse_state_t state, int header_index); +int uri_convnet_classify_intro(const void* context, const void* parsed, ebb_buf* buf); +int uri_convnet_classify(const void* context, const void* parsed, ebb_buf* buf); + #endif diff --git a/site/CNAME b/site/CNAME new file mode 100644 index 000000000..b80cbee3f --- /dev/null +++ b/site/CNAME @@ -0,0 +1 @@ +libccv.org diff --git a/site/_config.yml b/site/_config.yml new file mode 100644 index 
000000000..882fa93ac --- /dev/null +++ b/site/_config.yml @@ -0,0 +1,33 @@ +# This is the default format. +# For more see: https://github.com/mojombo/jekyll/wiki/Permalinks +permalink: /:categories/:title + +exclude: [".rbenv-version", "README.md", "Rakefile"] +auto: false +pygments: false +markdown: kramdown + +kramdown: + use_coderay: true + +coderay: + coderay_css: class + +# Themes are encouraged to use these universal variables +# so be sure to set them if your theme uses them. +# +title : ccv +subtitle : a modern computer vision library +author : + name : Liu Liu + email : i@liuliu.me + +# The production_url is only used when full-domain names are needed +# such as sitemap.txt +# Most places will/should use BASE_PATH to make the urls +# +# If you have set a CNAME (pages.github.com) set your custom domain here. +# Else if you are pushing to username.github.com, replace with your username. +# Finally if you are pushing to a GitHub project page, include the project name at the end. +# +production_url : http://libccv.org diff --git a/site/_layouts/default.html b/site/_layouts/default.html new file mode 100644 index 000000000..65789e3fc --- /dev/null +++ b/site/_layouts/default.html @@ -0,0 +1,40 @@ + + +{% if page.title %}{{ page.title }}{% else %}ccv - a modern computer vision library{% endif %} + + + + + + + +
+

ccv

+

A Modern Computer Vision Library

+

View the Project on GitHub liuliu/ccv

+ +
+
{{ content }}
+ +
+ + + diff --git a/site/_layouts/page.html b/site/_layouts/page.html new file mode 100644 index 000000000..ec54421ec --- /dev/null +++ b/site/_layouts/page.html @@ -0,0 +1,17 @@ +--- +layout: default +--- +

{{ page.title }}

+{{ page.content | markdownify }} +

‹  back 

+
+ +comments powered by Disqus + diff --git a/site/_layouts/post.html b/site/_layouts/post.html new file mode 100644 index 000000000..c186daeb4 --- /dev/null +++ b/site/_layouts/post.html @@ -0,0 +1,17 @@ +--- +layout: default +--- +

{{ page.title }}

+

{{ page.date | date_to_human_string }}

+{{ page.content | markdownify }} +

‹  back 

+
comments powered by Disqus diff --git a/site/_plugins/archive.rb new file mode 100644 index 000000000..a4f7ba3e4 --- /dev/null +++ b/site/_plugins/archive.rb @@ -0,0 +1,132 @@ +module Jekyll + + class ArchiveIndex < Page + def initialize(site, base, dir, type) + @site = site + @base = base + @dir = dir + @name = 'index.html' + + self.process(@name) + self.read_yaml(File.join(base, '_layouts'), type + '.html') + + year, month, day = dir.split('/') + self.data['year'] = year.to_i + self.data['month'] = month.to_i if month + self.data['day'] = day.to_i if day + end + + def collect(collated_posts, older, newer) + self.data['collated_posts'] = collated_posts + self.data['previous'] = older + self.data['next'] = newer + end + + end + + class ArchiveGenerator < Generator + safe true + attr_accessor :collated_posts + attr_accessor :lbyear + attr_accessor :ubyear + + def generate(site) + self.collated_posts = collate(site) + + self.collated_posts.keys.each do |y| + if site.layouts.key? 'archive_yearly' + previous_yearly = nil + y.downto(self.lbyear) do |py| + if self.collated_posts.key? py + previous_yearly = Date.civil(py.to_i) + break + end + end + next_yearly = nil + y.upto(self.ubyear) do |ny| + if self.collated_posts.key? ny + next_yearly = Date.civil(ny.to_i) + break + end + end + write_archive_index(site, y.to_s, 'archive_yearly', self.collated_posts, previous_yearly, next_yearly) + end + self.collated_posts[ y ].keys.each do |m| + if site.layouts.key? 'archive_monthly' + previous_monthly = nil + py, pm = y, m + while py >= self.lbyear do + pm = pm - 1 + py, pm = py - 1, 12 if pm < 1 + if self.collated_posts.key? py and self.collated_posts[ py ].key? pm + previous_monthly = Date.civil(py.to_i, pm.to_i) + break + end + end + next_monthly = nil + ny, nm = y, m + while ny <= self.ubyear do + nm = nm + 1 + ny, nm = ny + 1, 1 if nm > 12 + if self.collated_posts.key? ny and self.collated_posts[ ny ].key? nm + next_monthly = Date.civil(ny.to_i, nm.to_i) + break + end + end + write_archive_index(site, "%04d/%02d" % [ y, m ], 'archive_monthly', self.collated_posts, previous_monthly, next_monthly) + end + if site.layouts.key? 'archive_daily' + self.collated_posts[ y ][ m ].keys.each do |d| + previous_daily = nil + py, pm, pd = y, m, d + while py >= self.lbyear do + pd = pd - 1 + pm, pd = pm - 1, 31 if pd < 1 + py, pm = py - 1, 12 if pm < 1 + if self.collated_posts.key? py and self.collated_posts[ py ].key? pm and self.collated_posts[ py ][ pm ].size > 0 + previous_daily = Date.civil(py.to_i, pm.to_i, pd.to_i) + break + end + end + next_daily = nil + ny, nm, nd = y, m, d + while ny <= self.ubyear do + nd = nd + 1 + nm, nd = nm + 1, 1 if nd > 31 + ny, nm = ny + 1, 1 if nm > 12 + if self.collated_posts.key? ny and self.collated_posts[ ny ].key?
nm and self.collated_posts[ ny ][ nm ].size > 0 + next_daily = Date.civil(ny.to_i, nm.to_i, nd.to_i) + break + end + end + write_archive_index(site, "%04d/%02d/%02d" % [ y, m, d ], 'archive_daily', self.collated_posts, previous_daily, next_daily) + end + end + end + end + end + + def write_archive_index(site, dir, type, collated_posts, newer, older) + archive = ArchiveIndex.new(site, site.source, dir, type) + archive.collect(collated_posts, newer, older) + archive.render(site.layouts, site.site_payload) + archive.write(site.dest) + site.static_files << archive + end + + def collate(site) + collated_posts = {} + self.ubyear = self.lbyear = nil + site.posts.reverse.each do |post| + y, m, d = post.date.year, post.date.month, post.date.day + self.lbyear = y if self.lbyear == nil or y < self.lbyear + self.ubyear = y if self.ubyear == nil or y > self.ubyear + collated_posts[ y ] = {} unless collated_posts.key? y + collated_posts[ y ][ m ] = {} unless collated_posts[y].key? m + collated_posts[ y ][ m ][ d ] = [] unless collated_posts[ y ][ m ].key? d + collated_posts[ y ][ m ][ d ].push(post) unless collated_posts[ y ][ m ][ d ].include?(post) + end + return collated_posts + end + end +end diff --git a/site/_plugins/debug.rb new file mode 100644 index 000000000..e1dde3979 --- /dev/null +++ b/site/_plugins/debug.rb @@ -0,0 +1,38 @@ +# A simple way to inspect liquid template variables. +# Usage: +# Can be used anywhere liquid syntax is parsed (templates, includes, posts/pages) +# {{ site | debug }} +# {{ site.posts | debug }} +# +require 'pp' +module Jekyll + # Need to overwrite the inspect method here because the original + # uses < > to encapsulate the pseudo post/page objects in which case + # the output is taken for HTML tags and hidden from view. + # + class Post + def inspect + "#Jekyll:Post @id=#{self.id.inspect}" + end + end + + class Page + def inspect + "#Jekyll:Page @name=#{self.name.inspect}" + end + end + +end # Jekyll + +module Jekyll + module DebugFilter + + def debug(obj, stdout=false) + puts obj.pretty_inspect if stdout + "
<pre>#{obj.class}\n#{obj.pretty_inspect}</pre>
" + end + + end # DebugFilter +end # Jekyll + +Liquid::Template.register_filter(Jekyll::DebugFilter) \ No newline at end of file diff --git a/site/_plugins/helpers.rb b/site/_plugins/helpers.rb new file mode 100644 index 000000000..58638df9c --- /dev/null +++ b/site/_plugins/helpers.rb @@ -0,0 +1,43 @@ +require 'uri' + +module Liquid + + module ExtendedFilters + + def date_to_month(input) + Date::MONTHNAMES[input] + end + + def date_to_month_abbr(input) + Date::ABBR_MONTHNAMES[input] + end + + def url_utf8_escape(input) + multi = input.split('/') + escaped = multi.collect do |x| + begin + URI.escape(URI.unescape(x)) + rescue StandardError + URI.escape(x) + end + end + escaped.join('/') + end + + def date_to_human_string(input) + Date::MONTHNAMES[input.month.to_i] + " " + input.day.to_s + case input.day.to_i % 10 + when 1; "st, " + when 2; "nd, " + when 3; "rd, " + else "th, " + end + input.year.to_s + end + + def date_to_utc(input) + input.getutc + end + + end + + Liquid::Template.register_filter(ExtendedFilters) +end diff --git a/site/_posts/0000-01-01-ccv-algebra.markdown b/site/_posts/0000-01-01-ccv-algebra.markdown new file mode 100644 index 000000000..66f3d3e04 --- /dev/null +++ b/site/_posts/0000-01-01-ccv-algebra.markdown @@ -0,0 +1,90 @@ +--- +layout: page +lib: ccv +slug: ccv-algebra +status: publish +title: lib/ccv_algebra.c +desc: linear algebra +categories: +- lib +--- + +ccv_normalize +------------- + + double ccv_normalize(ccv_matrix_t* a, ccv_matrix_t** b, int btype, int flag) + +Normalize a matrix and return the normalize factor. + + * **a**: the input matrix. + * **b**: the output matrix. + * **btype**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **flag**: CCV\_L1 or CCV\_L2, for L1 or L2 normalization. + +ccv_sat +------- + + void ccv_sat(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int padding_pattern) + +Generate the [Summed Area Table](https://en.wikipedia.org/wiki/Summed_area_table). + + * **a**: the input matrix. + * **b**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **padding_pattern**: CCV\_NO\_PADDING - the first row and the first column in the output matrix is the same as the input matrix. CCV\_PADDING\_ZERO - the first row and the first column in the output matrix is zero, thus, the output matrix size is 1 larger than the input matrix. + +ccv_sum +------- + + double ccv_sum(ccv_matrix_t* mat, int flag) + +Return the sum of all elements in the matrix. + + * **mat**: the input matrix. + * **flag**: CCV\_UNSIGNED - compute fabs(x) of the elements first and then sum up. CCV\_SIGNED - compute the sum normally. + +ccv_multiply +------------ + + void ccv_multiply(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** c, int type) + +Do element-wise matrix multiplication. + + * **a**: the input matrix. + * **b**: the input matrix. + * **c**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + +ccv_subtract +------------- + + void ccv_subtract(ccv_matrix_t* a, ccv_matrix_t* b, ccv_matrix_t** c, int type) + +Matrix subtraction. + + * **a**: the input matrix. + * **b**: the input matrix. + * **c**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. 
+ +ccv_gemm +-------- + + void ccv_gemm(ccv_matrix_t* a, ccv_matrix_t* b, double alpha, ccv_matrix_t* c, double beta, int transpose, ccv_matrix_t** d, int type) + +General purpose matrix multiplication. This function has a hard dependency on the [cblas](http://www.netlib.org/blas/) library. + +As general as it is, it computes: + + alpha * A * B + beta * C + +where A, B and C are matrices, and alpha and beta are scalars. + + * **a**: the input matrix. + * **b**: the input matrix. + * **alpha**: the multiplication factor. + * **c**: the input matrix. + * **beta**: the multiplication factor. + * **transpose**: CCV\_A\_TRANSPOSE, CCV\_B\_TRANSPOSE to indicate if matrix A or B needs to be transposed first before multiplication. + * **d**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. \ No newline at end of file diff --git a/site/_posts/0000-01-01-ccv-basic.markdown new file mode 100644 index 000000000..1f40c1ac6 --- /dev/null +++ b/site/_posts/0000-01-01-ccv-basic.markdown @@ -0,0 +1,62 @@ +--- +layout: page +lib: ccv +slug: ccv-basic +status: publish +title: lib/ccv_basic.c +desc: basic image processing utilities +categories: +- lib +--- + +ccv_sobel +--------- + + void ccv_sobel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int dx, int dy) + +Compute the response of an image to the [Sobel operator](https://en.wikipedia.org/wiki/Sobel_operator). + + * **a**: the input matrix. + * **b**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **dx**: the window size of the Sobel operator on the x-axis, specially optimized for 1, 3 + * **dy**: the window size of the Sobel operator on the y-axis, specially optimized for 1, 3 + +ccv_gradient +------------ + + void ccv_gradient(ccv_dense_matrix_t* a, ccv_dense_matrix_t** theta, int ttype, ccv_dense_matrix_t** m, int mtype, int dx, int dy) + +Compute the gradient (angle and magnitude) at each pixel. + + * **a**: the input matrix. + * **theta**: the output matrix of angle at each pixel. + * **ttype**: the type of output matrix, if 0, ccv will default to CCV\_32F. + * **m**: the output matrix of magnitude at each pixel. + * **mtype**: the type of output matrix, if 0, ccv will default to CCV\_32F. + * **dx**: the window size of the underlying Sobel operator used on the x-axis, specially optimized for 1, 3 + * **dy**: the window size of the underlying Sobel operator used on the y-axis, specially optimized for 1, 3 + +ccv_flip +-------- + + void ccv_flip(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int btype, int type) + +Flip the matrix around the x-axis, the y-axis or both. + + * **a**: the input matrix. + * **b**: the output matrix (it is in-place safe). + * **btype**: the type of output matrix, if 0, ccv will use the same type as the input matrix. + * **type**: CCV\_FLIP\_X - flip around x-axis, CCV\_FLIP\_Y - flip around y-axis. + +ccv_blur +-------- + + void ccv_blur(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, double sigma) + +Apply [Gaussian blur](https://en.wikipedia.org/wiki/Gaussian_blur) to a given matrix. It implements an O(n * sqrt(m)) algorithm, where n is the size of the input matrix and m is the size of the Gaussian filtering kernel. + + * **a**: the input matrix. + * **b**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **sigma**: the sigma factor in the Gaussian filtering kernel.
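A minimal usage sketch of the routines above (the input file name is a placeholder and error handling is omitted):

    #include <ccv.h>

    int main(void)
    {
        ccv_dense_matrix_t* image = 0;
        ccv_read("photo.png", &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
        /* smooth with sigma = 1.5, then take the x-direction Sobel response */
        ccv_dense_matrix_t* smooth = 0;
        ccv_blur(image, &smooth, 0, 1.5);
        ccv_dense_matrix_t* dx = 0;
        ccv_sobel(smooth, &dx, 0, 1, 0);
        ccv_matrix_free(image);
        ccv_matrix_free(smooth);
        ccv_matrix_free(dx);
        return 0;
    }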
\ No newline at end of file diff --git a/site/_posts/0000-01-01-ccv-bbf.markdown new file mode 100644 index 000000000..6907b5acc --- /dev/null +++ b/site/_posts/0000-01-01-ccv-bbf.markdown @@ -0,0 +1,98 @@ +--- +layout: page +lib: ccv +slug: ccv-bbf +status: publish +title: lib/ccv_bbf.c +desc: binary brightness feature +categories: +- lib +--- + +ccv_bbf_classifier_cascade_new +------------------------------ + + void ccv_bbf_classifier_cascade_new(ccv_dense_matrix_t** posimg, int posnum, char** bgfiles, int bgnum, int negnum, ccv_size_t size, const char* dir, ccv_bbf_new_param_t params) + +Create a new BBF classifier cascade from given positive examples and background images. This function has a hard dependency on [GSL](http://www.gnu.org/software/gsl/). + + * **posimg**: an array of positive examples. + * **posnum**: number of positive examples. + * **bgfiles**: an array of background images. + * **bgnum**: number of background images. + * **negnum**: number of negative examples that are harvested from background images. + * **size**: the image size of positive examples. + * **dir**: the working directory to store/retrieve intermediate data. + * **params**: a **ccv\_bbf\_new\_param\_t** structure that defines various aspects of the training function. + +ccv_bbf_new_param_t +------------------- + + * **pos\_crit**: the positive criterion, or the targeted recall ratio; the BBF classifier tries to adjust the constant to meet this criterion. + * **neg\_crit**: the negative criterion, or the targeted reject ratio; the BBF classifier tries to include more weak features until it meets this criterion. + * **balance\_k**: weight positive examples differently from negative examples. + * **layer**: the maximum layer trained for the classifier cascade. + * **feature\_number**: the maximum feature number for each classifier. + * **optimizer**: CCV\_BBF\_GENETIC\_OPT, using a genetic algorithm to search for the best weak feature; CCV\_BBF\_FLOAT\_OPT, using float search to improve the found best weak feature. + * **detector**: a **ccv\_bbf\_params\_t** structure that will be used to search negative examples from background images. + +ccv_bbf_detect_objects +---------------------- + + ccv_array_t* ccv_bbf_detect_objects(ccv_dense_matrix_t* a, ccv_bbf_classifier_cascade_t** _cascade, int count, ccv_bbf_param_t params) + +Use a BBF classifier cascade to detect objects in a given image. If you have several classifier cascades, it is better to use them in one method call. In this way, ccv will try to optimize the overall performance. + + * **a**: the input image. + * **cascade**: an array of classifier cascades. + * **count**: how many classifier cascades you've passed in. + * **params**: a **ccv\_bbf\_param\_t** structure that defines various aspects of the detector. + +ccv_bbf_param_t +---------------- + + * **interval**: interval images between the full size image and the half size one. e.g. 2 will generate 2 images in between full size image and half size one: image with full size, image with 5/6 size, image with 2/3 size, image with 1/2 size. + * **min\_neighbors**: 0: no grouping afterwards. 1: group objects that intersect each other. > 1: group objects that intersect each other, and only pass those that have at least **min\_neighbors** intersected objects. + * **flags**: CCV\_BBF\_NO\_NESTED, if one class of object is inside another class of object, this flag will reject the first object. + * **accurate**: BBF generates 4 spatial scale variations for better accuracy.
Setting this parameter to 0 reduces this to 1 scale variation, which is thus 3 times faster but lowers the general accuracy of the detector. + * **size**: the smallest object size that will be interesting to us. + +ccv_bbf_read_classifier_cascade +------------------------------- + + ccv_bbf_classifier_cascade_t* ccv_bbf_read_classifier_cascade(const char* directory) + +Read a BBF classifier cascade from a working directory. + + * **directory**: the working directory in which a BBF classifier cascade was trained. + +ccv_bbf_classifier_cascade_read_binary +-------------------------------------- + + ccv_bbf_classifier_cascade_t* ccv_bbf_classifier_cascade_read_binary(char* s) + +Load a BBF classifier cascade from a memory region. + + * **s**: the memory region of the binarized BBF classifier cascade. + +ccv_bbf_classifier_cascade_write_binary +--------------------------------------- + + int ccv_bbf_classifier_cascade_write_binary(ccv_bbf_classifier_cascade_t* cascade, char* s, int slen) + +Write a BBF classifier cascade to a memory region. + + * **cascade**: the BBF classifier cascade. + * **s**: the designated memory region. + * **slen**: the size of the designated memory region. + +It returns the actual size of the binarized BBF classifier cascade; if this size is larger than **slen**, reallocate the memory region and try again. + +ccv_bbf_classifier_cascade_free +------------------------------- + + void ccv_bbf_classifier_cascade_free(ccv_bbf_classifier_cascade_t* cascade) + +Free up the memory of the BBF classifier cascade. + + * **cascade**: the BBF classifier cascade. diff --git a/site/_posts/0000-01-01-ccv-cache.markdown new file mode 100644 index 000000000..a8680c3fb --- /dev/null +++ b/site/_posts/0000-01-01-ccv-cache.markdown @@ -0,0 +1,71 @@ +--- +layout: page +lib: ccv +slug: ccv-cache +status: publish +title: lib/ccv_cache.c +desc: cache mechanism +categories: +- lib +--- + +This class implements a trie-based LRU cache that is then used as the ccv application-wide cache in [ccv\_memory.c](/lib/ccv-memory). + +ccv_cache_init +-------------- + + void ccv_cache_init(ccv_cache_t* cache, ccv_cache_index_free_f ffree, size_t up) + +Set up a cache structure to work with. + + * **cache**: the allocated cache. + * **ffree**: the function that will be used to free cached objects. + * **up**: the upper limit of the cache size in bytes. + +ccv_cache_get +------------- + + void* ccv_cache_get(ccv_cache_t* cache, uint64_t sign) + +Get an object from the cache by its signature. Returns 0 if the object cannot be found. + + * **cache**: the cache. + * **sign**: the signature. + +ccv_cache_out +------------- + + void* ccv_cache_out(ccv_cache_t* cache, uint64_t sign) + +Get an object from the cache by its signature and then remove that object from the cache. Returns 0 if the object cannot be found. + + * **cache**: the cache. + * **sign**: the signature. + +ccv_cache_delete +---------------- + + int ccv_cache_delete(ccv_cache_t* cache, uint64_t sign) + +Delete an object from the cache by its signature and free it. Returns -1 if the object cannot be found, otherwise 0. + + * **cache**: the cache. + * **sign**: the signature. + +ccv_cache_cleanup +----------------- + + void ccv_cache_cleanup(ccv_cache_t* cache) + +Clean up the cache, freeing all objects inside and any other memory space occupied. + + * **cache**: the cache. + +ccv_cache_close +--------------- + + void ccv_cache_close(ccv_cache_t* cache) + +For the current implementation (a trie-based LRU cache), this is an alias for ccv\_cache\_cleanup. + + * **cache**: the cache.
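Going only by the signatures documented above, a sketch of the cache lifecycle (the 64MiB cap, the signature value and the use of the standard free function are illustrative assumptions):

    #include <ccv.h>
    #include <stdlib.h>

    int main(void)
    {
        ccv_cache_t cache;
        /* cap the cache at 64MiB; cached objects are released with free() */
        ccv_cache_init(&cache, free, 64 * 1024 * 1024);
        uint64_t sign = 0x1234; /* a signature computed elsewhere */
        void* object = ccv_cache_get(&cache, sign); /* borrow: the cache still owns it */
        if (object == 0)
        {
            /* miss: compute the object here */
        }
        object = ccv_cache_out(&cache, sign); /* take ownership out of the cache */
        if (object)
            free(object); /* now ours to release */
        ccv_cache_close(&cache); /* frees anything still cached */
        return 0;
    }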
\ No newline at end of file diff --git a/site/_posts/0000-01-01-ccv-classic.markdown new file mode 100644 index 000000000..c0125f4cc --- /dev/null +++ b/site/_posts/0000-01-01-ccv-classic.markdown @@ -0,0 +1,65 @@ +--- +layout: page +lib: ccv +slug: ccv-classic +status: publish +title: lib/ccv_classic.c +desc: classic computer vision algorithms +categories: +- lib +--- + +ccv_hog +------- + + void ccv_hog(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int b_type, int sbin, int size) + +[Histogram-of-Oriented-Gradients](https://en.wikipedia.org/wiki/Histogram_of_oriented_gradients) implementation; specifically, it implements the HOG described in *Object Detection with Discriminatively Trained Part-Based Models, Pedro F. Felzenszwalb, Ross B. Girshick, David McAllester and Deva Ramanan*. + + * **a**: the input matrix. + * **b**: the output matrix. + * **b_type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **sbin**: the number of bins for orientation (defaults to 9; thus, **b** will have 9 * 2 + 9 + 4 = 31 channels). + * **size**: the window size for HOG (defaults to 8) + +ccv_canny +--------- + + void ccv_canny(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int size, double low_thresh, double high_thresh) + +[Canny edge detector](https://en.wikipedia.org/wiki/Canny_edge_detector) implementation. For performance reasons, this is a clean reimplementation of OpenCV's Canny edge detector; it has very similar performance characteristics to the OpenCV one. As of today, ccv's Canny edge detector only works with the CCV\_8U or CCV\_32S dense matrix types. + + * **a**: the input matrix. + * **b**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will create a CCV\_8U \| CCV\_C1 matrix. + * **size**: the underlying Sobel filter size. + * **low_thresh**: the low threshold that makes the point interesting. + * **high_thresh**: the high threshold that makes the point acceptable. + +ccv_otsu +-------- + + int ccv_otsu(ccv_dense_matrix_t* a, double* outvar, int range) + +[OTSU](https://en.wikipedia.org/wiki/Otsu%27s_method) implementation. + + * **a**: the input matrix. + * **outvar**: the inter-class variance. + * **range**: the maximum range of data in the input matrix. + +It returns the threshold, inclusively. e.g. 5 means 0~5 is in the background, and 6~255 is in the foreground. + +ccv_optical_flow_lucas_kanade +----------------------------- + + void ccv_optical_flow_lucas_kanade(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_array_t* point_a, ccv_array_t** point_b, ccv_size_t win_size, int level, double min_eigen) + +[Lucas Kanade](https://en.wikipedia.org/wiki/Lucas%E2%80%93Kanade_Optical_Flow_Method) optical flow implementation with image pyramid extension.
+ + * **a**: the first frame + * **b**: the next frame + * **point_a**: the points in the first frame, of **ccv\_decimal\_point\_t** type + * **point_b**: the output points in the next frame, of **ccv\_decimal\_point\_with\_status\_t** type + * **win_size**: the window size to compute each optical flow, it must be an odd number + * **level**: how many image pyramid levels to use for the computation + * **min_eigen**: the minimal eigenvalue required to pass the optical flow computation \ No newline at end of file diff --git a/site/_posts/0000-01-01-ccv-convnet.markdown new file mode 100644 index 000000000..908bb6c56 --- /dev/null +++ b/site/_posts/0000-01-01-ccv-convnet.markdown @@ -0,0 +1,177 @@ +--- +layout: page +lib: ccv +slug: ccv-convnet +status: publish +title: lib/ccv_convnet.c +desc: Deep Convolutional Networks +categories: +- lib +--- + +ccv_convnet_new +--------------- + + ccv_convnet_t* ccv_convnet_new(int use_cwc_accel, ccv_size_t input, ccv_convnet_layer_param_t params[], int count); + +Create a new (deep) convolutional network with specified parameters. ccv only supports the convolutional layer (shared weights), max pooling layer, average pooling layer, full connect layer and local response normalization layer. + + + * **use_cwc_accel**: whether to use a CUDA-enabled GPU to accelerate various computations for the convolutional network. + * **input**: the input size of the image, it is not necessarily the input size of the first convolutional layer. + * **params[]**: the C-array of **ccv\_convnet\_layer\_param\_t** that specifies the parameters for each layer. + * **count**: the size of the params[] C-array. + +ccv_convnet_layer_param_t +------------------------- + + * **type**: one of the following values to specify the network layer type, **CCV\_CONVNET\_CONVOLUTIONAL**, **CCV\_CONVNET\_FULL\_CONNECT**, **CCV\_CONVNET\_MAX\_POOL**, **CCV\_CONVNET\_AVERAGE\_POOL**, **CCV\_CONVNET\_LOCAL\_RESPONSE\_NORM**. + * **bias**: the initialization value for bias if applicable (for convolutional layer and full connect layer). + * **sigma**: the normal distribution variance for weights if applicable (for convolutional layer and full connect layer). + * **input**: a **ccv\_convnet\_input\_t** that specifies the input structure. + * **output**: a **ccv\_convnet\_type\_t** that specifies the output parameters and structure. + +ccv_convnet_input_t +------------------- + + * **matrix.rows**: the number of rows of the input matrix. + * **matrix.cols**: the number of columns of the input matrix. + * **matrix.channels**: the number of channels of the input matrix. + * **matrix.partition**: the number of partitions of the input matrix, it must be divisible by the number of channels (it is partitioned by channels). + * **node.count**: the number of nodes. You should either use **node** or **matrix** to specify the input structure. + +ccv_convnet_type_t +------------------ + + * **convolutional.count**: the number of filters for the convolutional layer. + * **convolutional.strides**: the strides for the convolutional filter. + * **convolutional.border**: the padding border size for the input matrix. + * **convolutional.rows**: the number of rows for the convolutional filter. + * **convolutional.cols**: the number of columns for the convolutional filter. + * **convolutional.channels**: the number of channels for the convolutional filter. + * **convolutional.partition**: the number of partitions for the convolutional filter. + * **pool.strides**: the strides for the pooling layer. + * **pool.size**: the size for the pooling layer.
+ * **pool.border**: the padding border size for the input matrix. + * **rnorm.size**: the size of the local response normalization layer. + * **rnorm.kappa**: as in b[i] = a[i] / (rnorm.kappa + rnorm.alpha * sum(a, i - rnorm.size / 2, i + rnorm.size / 2)) ^ rnorm.beta + * **rnorm.alpha**: see **rnorm.kappa**. + * **rnorm.beta**: see **rnorm.kappa**. + * **full\_connect.count**: the number of output nodes for the full connect layer. + +ccv_convnet_verify +------------------ + + int ccv_convnet_verify(ccv_convnet_t* convnet, int output); + +Verify that the specified parameters make sense as a deep convolutional network. Returns 0 if the given deep convolutional network makes sense. + + * **convnet**: A deep convolutional network to verify. + * **output**: The output number of nodes (for the last full connect layer). + +ccv_convnet_supervised_train +---------------------------- + + void ccv_convnet_supervised_train(ccv_convnet_t* convnet, ccv_array_t* categorizeds, ccv_array_t* tests, const char* filename, ccv_convnet_train_param_t params); + +Start to train a deep convolutional network with given parameters and data. + + * **convnet**: A deep convolutional network that is initialized. + * **categorizeds**: An array of images with their category information for training. + * **tests**: An array of images with their category information for validating. + * **filename**: The working file to save progress and the trained convolutional network. + * **params**: The training parameters. + +ccv_convnet_train_param_t +------------------------- + + * **max\_epoch**: The number of epochs (an epoch sweeps through all the examples) to go through before ending the training. + * **mini\_batch**: The number of examples for a batch in stochastic gradient descent. + * **iterations**: The number of iterations (an iteration is for one batch) before saving the progress. + * **symmetric**: Whether to exploit the symmetric property of the provided examples. + * **color\_gain**: The color variance for data augmentation (0 means no such augmentation). + * **layer\_params**: A C-array of **ccv\_convnet\_layer\_train\_param\_t** training parameters for each layer. + +ccv_convnet_layer_train_param_t +------------------------------- + + * **dor**: The dropout rate for this layer, it is only applicable for the full connect layer. + * **w**: A **ccv\_convnet\_layer\_sgd\_param\_t** that specifies the stochastic gradient descent update rule for weight, it is only applicable for the full connect layer and convolutional layer. + * **bias**: A **ccv\_convnet\_layer\_sgd\_param\_t** that specifies the stochastic gradient descent update rule for bias, it is only applicable for the full connect layer and convolutional layer. + +ccv_convnet_layer_sgd_param_t +----------------------------- + + * **learn\_rate**: new velocity = **momentum** * old velocity - **decay** * **learn\_rate** * old value + **learn\_rate** * delta, new value = old value + new velocity + * **decay**: see **learn\_rate**. + * **momentum**: see **learn\_rate**. + +ccv_convnet_encode +------------------ + + void ccv_convnet_encode(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, ccv_dense_matrix_t** b, int batch); + +Use a convolutional network to encode an image into a compact representation. + + * **convnet**: The given convolutional network. + * **a**: A C-array of input images. + * **b**: A C-array of output matrices of compact representations. + * **batch**: The number of input images.
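A sketch of the encode path under the signatures above (the network file name is a placeholder, and the input image is assumed to already match the network's expected input size):

    #include <ccv.h>

    int main(void)
    {
        /* 0: run on CPU rather than a CUDA-enabled GPU */
        ccv_convnet_t* convnet = ccv_convnet_read(0, "network.sqlite3");
        ccv_dense_matrix_t* image = 0;
        ccv_read("photo.png", &image, CCV_IO_RGB_COLOR | CCV_IO_ANY_FILE);
        ccv_dense_matrix_t* b = 0;
        /* batch of one: encode a single image into its compact representation */
        ccv_convnet_encode(convnet, &image, &b, 1);
        ccv_matrix_free(image);
        ccv_matrix_free(b);
        ccv_convnet_free(convnet);
        return 0;
    }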
+ +ccv_convnet_classify +-------------------- + + void ccv_convnet_classify(ccv_convnet_t* convnet, ccv_dense_matrix_t** a, int symmetric, ccv_array_t** ranks, int tops, int batch); + +Use a convolutional network to classify an image into categories. + + * **convnet**: The given convolutional network. + * **a**: A C-array of input images. + * **symmetric**: Whether the input is symmetric. + * **ranks**: A C-array of **ccv\_array\_t** that contains the top categories ranked by the convolutional network. + * **tops**: The number of top categories returned for each image. + * **batch**: The number of input images. + +ccv_convnet_read +---------------- + + ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename); + +Read a convolutional network that is persisted on disk. + + * **use_cwc_accel**: Use CUDA-enabled GPU acceleration. + * **filename**: The file on the disk. + +ccv_convnet_write +----------------- + + void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet_write_param_t params); + +Write a convolutional network to disk. + + * **convnet**: A given convolutional network. + * **filename**: The file on the disk. + * **params**: a **ccv\_convnet\_write\_param\_t** to specify the write parameters. + +ccv_convnet_write_param_t +------------------------- + + * **half\_precision**: Use half precision floating point to represent network parameters. + +ccv_convnet_compact +------------------- + + void ccv_convnet_compact(ccv_convnet_t* convnet); + +Free up temporary resources of a given convolutional network. + + * **convnet**: A convolutional network. + +ccv_convnet_free +---------------- + + void ccv_convnet_free(ccv_convnet_t* convnet); + +Free up the memory of a given convolutional network. + + * **convnet**: A convolutional network. diff --git a/site/_posts/0000-01-01-ccv-daisy.markdown new file mode 100644 index 000000000..5d524f44d --- /dev/null +++ b/site/_posts/0000-01-01-ccv-daisy.markdown @@ -0,0 +1,32 @@ +--- +layout: page +lib: ccv +slug: ccv-daisy +status: publish +title: lib/ccv_daisy.c +desc: DAISY +categories: +- lib +--- + +ccv_daisy +--------- + + void ccv_daisy(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_daisy_param_t params) + +[DAISY](http://cvlab.epfl.ch/publications/publications/2010/TolaLF10a.pdf) implementation. For more details, see DAISY: An Efficient Dense Descriptor Applied to Wide-Baseline Stereo. + + * **a**: the input matrix. + * **b**: the output matrix. + * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type. + * **params**: a **ccv\_daisy\_param\_t** structure that defines various aspects of the feature extractor. + +ccv_daisy_param_t +----------------- + + * **radius**: the Gaussian radius.
+ * **rad_q_no**: + * **th_q_no**: + * **hist_th_q_no**: + * **normalize_threshold**: + * **normalize_method**: \ No newline at end of file diff --git a/site/_posts/0000-01-01-ccv-dpm.markdown new file mode 100644 index 000000000..4edcee278 --- /dev/null +++ b/site/_posts/0000-01-01-ccv-dpm.markdown @@ -0,0 +1,85 @@ +--- +layout: page +lib: ccv +slug: ccv-dpm +status: publish +title: lib/ccv_dpm.c +desc: deformable parts model +categories: +- lib +--- + +ccv_dpm_mixture_model_new +------------------------- + + void ccv_dpm_mixture_model_new(char** posfiles, ccv_rect_t* bboxes, int posnum, char** bgfiles, int bgnum, int negnum, const char* dir, ccv_dpm_new_param_t params) + +Create a new DPM mixture model from given positive examples and background images. This function has hard dependencies on [GSL](http://www.gnu.org/software/gsl/) and [LibLinear](http://www.csie.ntu.edu.tw/~cjlin/liblinear/). + + * **posfiles**: an array of positive images. + * **bboxes**: an array of bounding boxes for positive images. + * **posnum**: number of positive examples. + * **bgfiles**: an array of background images. + * **bgnum**: number of background images. + * **negnum**: number of negative examples that are harvested from background images. + * **dir**: the working directory to store/retrieve intermediate data. + * **params**: a **ccv\_dpm\_new\_param\_t** structure that defines various aspects of the training function. + +ccv_dpm_new_param_t +------------------- + + * **components**: the number of root filters in the mixture model. + * **parts**: the number of part filters for each root filter. + * **grayscale**: whether to exploit color in a given image. + * **symmetric**: whether to exploit the symmetric property of the object. + * **min\_area**: the minimum area that one part classifier can occupy. + * **max\_area**: the maximum area that one part classifier can occupy. + * **iterations**: how many iterations are needed for stochastic gradient descent. + * **data\_minings**: how many data mining procedures are needed for discovering hard examples. + * **relabels**: how many relabel procedures are needed. + * **negative\_cache\_size**: the cache size for negative examples. + * **include\_overlap**: the percentage of overlap between the expected bounding box and the bounding box from detection; beyond this threshold, they are ensured to be the same object. + * **alpha**: the step size for stochastic gradient descent. + * **alpha\_ratio**: decrease the step size for each iteration. + * **balance**: to balance the weight of positive examples and negative examples. + * **C**: C in SVM. + * **percentile\_breakdown**: the percentile used for the breakdown threshold. + * **detector**: a **ccv\_dpm\_params\_t** structure that will be used to search positive examples and negative examples from background images. + +ccv_dpm_detect_objects +---------------------- + + ccv_array_t* ccv_dpm_detect_objects(ccv_dense_matrix_t* a, ccv_dpm_mixture_model_t** _model, int count, ccv_dpm_param_t params) + +Use a DPM mixture model to detect objects in a given image. If you have several DPM mixture models, it is better to use them in one method call. In this way, ccv will try to optimize the overall performance. + + * **a**: the input image. + * **model**: an array of mixture models. + * **count**: how many mixture models you've passed in. + * **params**: a **ccv\_dpm\_param\_t** structure that defines various aspects of the detector.
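For illustration, a minimal detection loop under the signatures above (the file names are placeholders; ccv\_dpm\_default\_params and the classification.confidence field appear in the serve/dpm.c changes earlier in this patch):

    #include <ccv.h>
    #include <stdio.h>

    int main(void)
    {
        ccv_dense_matrix_t* image = 0;
        ccv_read("pedestrian.png", &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
        ccv_dpm_mixture_model_t* model = ccv_dpm_read_mixture_model("pedestrian.m");
        ccv_array_t* seq = ccv_dpm_detect_objects(image, &model, 1, ccv_dpm_default_params);
        int i;
        for (i = 0; i < seq->rnum; i++)
        {
            /* each root component carries a bounding box and a confidence score */
            ccv_root_comp_t* comp = (ccv_root_comp_t*)ccv_array_get(seq, i);
            printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence);
        }
        ccv_array_free(seq);
        ccv_matrix_free(image);
        ccv_dpm_mixture_model_free(model);
        return 0;
    }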
+
+ccv_dpm_param_t
+---------------
+
+ * **interval**: the number of interval images between the full-size image and the half-size one. e.g. 2 will generate 2 images in between the full-size image and the half-size one: image with full size, image with 5/6 size, image with 2/3 size, image with 1/2 size.
+ * **min\_neighbors**: 0: no grouping afterwards. 1: group objects that intersect each other. > 1: group objects that intersect each other, and only pass those that have at least **min\_neighbors** intersected objects.
+ * **flags**: CCV\_DPM\_NO\_NESTED, if one class of object is inside another class of object, this flag will reject the first object.
+ * **threshold**: the threshold that determines the acceptance of an object.
+
+ccv_dpm_read_mixture_model
+--------------------------
+
+    ccv_dpm_mixture_model_t* ccv_dpm_read_mixture_model(const char* directory)
+
+Read a DPM mixture model from a model file.
+
+ * **directory**: the model file for the DPM mixture model.
+
+ccv_dpm_mixture_model_free
+--------------------------
+
+    void ccv_dpm_mixture_model_free(ccv_dpm_mixture_model_t* model)
+
+Free up the memory of the DPM mixture model.
+
+* **model**: the DPM mixture model.
diff --git a/site/_posts/0000-01-01-ccv-icf.markdown b/site/_posts/0000-01-01-ccv-icf.markdown
new file mode 100644
index 000000000..22f19cb9d
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-icf.markdown
@@ -0,0 +1,100 @@
+---
+layout: page
+lib: ccv
+slug: ccv-icf
+status: publish
+title: lib/ccv_icf.c
+desc: integral channel features
+categories:
+- lib
+---
+
+ccv_icf_classifier_cascade_new
+------------------------------
+
+    ccv_icf_classifier_cascade_t* ccv_icf_classifier_cascade_new(ccv_array_t* posfiles, int posnum, ccv_array_t* bgfiles, int negnum, ccv_array_t* testfiles, const char* dir, ccv_icf_new_param_t params)
+
+Create a new ICF classifier cascade from given positive examples and background images. This function has a hard dependency on [GSL](http://www.gnu.org/software/gsl/) and is best used with [libdispatch](http://libdispatch.macosforge.org/) for maximum efficiency.
+
+ * **posfiles**: an array of **ccv\_file\_info\_t** that gives the positive examples and their locations.
+ * **posnum**: the number of positive examples that we want to use (with certain random distortions if so chosen).
+ * **bgfiles**: an array of **ccv\_file\_info\_t** that gives the background images.
+ * **negnum**: the number of negative examples that will be collected during bootstrapping / initialization.
+ * **testfiles**: an array of **ccv\_file\_info\_t** that gives the validation examples and their locations.
+ * **dir**: the directory that saves the progress.
+ * **params**: a **ccv\_icf\_new\_param\_t** structure that defines various aspects of the training function.
+
+ccv_icf_new_param_t
+-------------------
+
+ * **grayscale**: whether to exploit color in a given image.
+ * **min_dimension**: the minimal size of an ICF feature region.
+ * **size**: a **ccv\_size\_t** structure that defines the width and height of the classifier.
+ * **margin**: a **ccv\_margin\_t** structure that extends the size so that we can include more information for our classifier.
+ * **feature_size**: the number of ICF features to pool from.
+ * **weak_classifier**: the number of weak classifiers that will be used to construct the strong classifier.
+ * **bootstrap**: the number of bootstrap rounds used to collect negatives.
+ * **deform_angle**: the range of rotations to add distortion, in radians.
+ * **deform_scale**: the range of scale changes to add distortion.
+ * **deform_shift**: the range of translations to add distortion, in pixels.
+ * **acceptance**: the percentage of validation examples that will be accepted when soft cascading the classifiers, which will be used for bootstrapping.
+ * **detector**: a **ccv\_icf\_param\_t** structure that defines various aspects of the detector.
+
+ccv_icf_classifier_cascade_soft
+-------------------------------
+
+    void ccv_icf_classifier_cascade_soft(ccv_icf_classifier_cascade_t* cascade, ccv_array_t* posfiles, double acceptance)
+
+Compute soft cascade thresholds to speed up the classifier cascade performance.
+
+ * **cascade**: the trained classifier that we want to optimize soft cascade thresholds on.
+ * **posfiles**: an array of **ccv\_file\_info\_t** that gives the positive examples and their locations.
+ * **acceptance**: the percentage of positive examples that will be accepted when optimizing the soft cascade thresholds.
+
+ccv_icf_detect_objects
+----------------------
+
+    ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, ccv_icf_classifier_cascade_t** cascades, int count, ccv_icf_param_t params)
+    ccv_array_t* ccv_icf_detect_objects(ccv_dense_matrix_t* a, ccv_icf_multiscale_classifier_cascade_t** cascades, int count, ccv_icf_param_t params)
+
+Using an ICF classifier cascade to detect objects in a given image. If you have several classifier cascades, it is better to use them in one method call. In this way, ccv will try to optimize the overall performance. A complete detection sketch appears at the end of this page.
+
+ * **a**: the input image.
+ * **cascades**: an array of classifier cascades.
+ * **count**: how many classifier cascades you've passed in.
+ * **params**: a **ccv\_icf\_param\_t** structure that defines various aspects of the detector.
+
+ccv_icf_param_t
+---------------
+
+ * **interval**: the number of interval images between the full-size image and the half-size one. e.g. 2 will generate 2 images in between the full-size image and the half-size one: image with full size, image with 5/6 size, image with 2/3 size, image with 1/2 size.
+ * **min\_neighbors**: 0: no grouping afterwards. 1: group objects that intersect each other. > 1: group objects that intersect each other, and only pass those that have at least **min\_neighbors** intersected objects.
+ * **step\_through**: the step size for detection.
+
+ccv_icf_read_classifier_cascade
+-------------------------------
+
+    ccv_icf_classifier_cascade_t* ccv_icf_read_classifier_cascade(const char* filename)
+
+Read an ICF classifier from a file.
+
+ * **filename**: the file path that contains the trained ICF classifier.
+
+ccv_icf_write_classifier_cascade
+--------------------------------
+
+    void ccv_icf_write_classifier_cascade(ccv_icf_classifier_cascade_t* classifier, const char* filename)
+
+Write an ICF classifier to a file.
+
+ * **classifier**: the classifier that we want to write to file.
+ * **filename**: the file path where we want to persist the ICF classifier.
+
+ccv_icf_classifier_cascade_free
+-------------------------------
+
+    void ccv_icf_classifier_cascade_free(ccv_icf_classifier_cascade_t* classifier)
+
+Free up the memory of the ICF classifier cascade.
+
+ * **classifier**: the ICF classifier cascade.
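+
+Putting the pieces together, here is a minimal detection sketch. The cascade and image file names are placeholders, and the library-provided ccv_icf_default_params is assumed:
+
+    #include <ccv.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        ccv_dense_matrix_t* image = 0;
+        ccv_read("street.png", &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+        ccv_icf_classifier_cascade_t* cascade = ccv_icf_read_classifier_cascade("pedestrian.icf");
+        ccv_array_t* objects = ccv_icf_detect_objects(image, &cascade, 1, ccv_icf_default_params);
+        int i;
+        for (i = 0; i < objects->rnum; i++)
+        {
+            ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(objects, i);
+            printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y, comp->rect.width, comp->rect.height, comp->classification.confidence);
+        }
+        ccv_array_free(objects);
+        ccv_matrix_free(image);
+        ccv_icf_classifier_cascade_free(cascade);
+        return 0;
+    }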
diff --git a/site/_posts/0000-01-01-ccv-io.markdown b/site/_posts/0000-01-01-ccv-io.markdown
new file mode 100644
index 000000000..9d7d69710
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-io.markdown
@@ -0,0 +1,60 @@
+---
+layout: page
+lib: ccv
+slug: ccv-io
+status: publish
+title: lib/ccv_io.c
+desc: basic IO utilities
+categories:
+- lib
+---
+
+ccv_read
+--------
+
+    int ccv_read(const char* in, ccv_dense_matrix_t** x, int type)
+
+Read an image from a file. This function has soft dependencies on [LibJPEG](http://libjpeg.sourceforge.net/) and [LibPNG](http://www.libpng.org/pub/png/libpng.html). Without these libraries, there is no JPEG or PNG read support. However, ccv does support BMP read natively (it is a simple format after all).
+
+ * **in**: the file name.
+ * **x**: the output image.
+ * **type**: CCV\_IO\_ANY\_FILE, accept any file format. CCV\_IO\_GRAY, convert to grayscale image. CCV\_IO\_COLOR, convert to color image.
+
+ccv_read
+--------
+
+    int ccv_read(const void* data, ccv_dense_matrix_t** x, int type, int size)
+
+Read an image from a region of memory that conforms to a specific image format. This function has soft dependencies on [LibJPEG](http://libjpeg.sourceforge.net/) and [LibPNG](http://www.libpng.org/pub/png/libpng.html). Without these libraries, there is no JPEG or PNG read support. However, ccv does support BMP read natively (it is a simple format after all).
+
+ * **data**: the data memory.
+ * **x**: the output image.
+ * **type**: CCV\_IO\_ANY\_STREAM, accept any file format. CCV\_IO\_GRAY, convert to grayscale image. CCV\_IO\_COLOR, convert to color image.
+ * **size**: the size of that data memory region.
+
+ccv_read
+--------
+
+    int ccv_read(const void* data, ccv_dense_matrix_t** x, int type, int rows, int cols, int scanline)
+
+Read an image from a region of memory that assumes a specific layout (RGB, GRAY, BGR, RGBA, ARGB, ABGR, BGRA). By default, this method will create a matrix and copy data over to that matrix. With CCV\_IO\_NO\_COPY, it will create a matrix whose data block points to the original data memory region. It is your responsibility to release that data memory at an appropriate time after releasing the matrix.
+
+ * **data**: the data memory.
+ * **x**: the output image.
+ * **type**: CCV\_IO\_ANY\_RAW, CCV\_IO\_RGB\_RAW, CCV\_IO\_BGR\_RAW, CCV\_IO\_RGBA\_RAW, CCV\_IO\_ARGB\_RAW, CCV\_IO\_BGRA\_RAW, CCV\_IO\_ABGR\_RAW, CCV\_IO\_GRAY\_RAW. These can be used in conjunction with CCV\_IO\_NO\_COPY.
+ * **rows**: how many rows are in the given data memory region.
+ * **cols**: how many columns are in the given data memory region.
+ * **scanline**: the size of a single row in the given data memory region (also known as "bytes per row").
+
+ccv_write
+---------
+
+    int ccv_write(ccv_dense_matrix_t* mat, char* out, int* len, int type, void* conf)
+
+Write an image to a file. This function has soft dependencies on [LibJPEG](http://libjpeg.sourceforge.net/) and [LibPNG](http://www.libpng.org/pub/png/libpng.html). Without these libraries, there is no JPEG or PNG write support. A read / write round trip is sketched below.
+
+ * **mat**: the input image.
+ * **out**: the file name.
+ * **len**: the output bytes.
+ * **type**: CCV\_IO\_PNG\_FILE, save to PNG format. CCV\_IO\_JPEG\_FILE, save to JPEG format.
+ * **conf**: configuration.
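+
+As a quick reference, a minimal read / write round trip looks like this (file names are placeholders):
+
+    #include <ccv.h>
+
+    int main(void)
+    {
+        ccv_dense_matrix_t* image = 0;
+        // accept any supported container format, convert to grayscale
+        ccv_read("input.jpg", &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+        // ... process the image ...
+        // persist as PNG; len and conf are optional and can be 0
+        ccv_write(image, "output.png", 0, CCV_IO_PNG_FILE, 0);
+        ccv_matrix_free(image);
+        return 0;
+    }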
diff --git a/site/_posts/0000-01-01-ccv-memory.markdown b/site/_posts/0000-01-01-ccv-memory.markdown
new file mode 100644
index 000000000..2cb80a310
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-memory.markdown
@@ -0,0 +1,140 @@
+---
+layout: page
+lib: ccv
+slug: ccv-memory
+status: publish
+title: lib/ccv_memory.c
+desc: memory alloc/dealloc
+categories:
+- lib
+---
+
+ccv_dense_matrix_new
+--------------------
+
+    ccv_dense_matrix_t* ccv_dense_matrix_new(int rows, int cols, int type, void* data, uint64_t sig)
+
+Create a dense matrix with rows, cols, and type.
+
+ * **rows**: rows of the matrix.
+ * **cols**: columns of the matrix.
+ * **type**: the matrix supports 5 data types - CCV\_8U, CCV\_32S, CCV\_64S, CCV\_32F, CCV\_64F - and up to 255 channels. e.g. CCV\_32F \| 31 will create a matrix with float (32-bit floating point) data type with 31 channels (the default type for ccv\_hog).
+ * **data**: if 0, ccv will create the matrix by allocating memory itself. Otherwise, it will use the memory region referenced by 'data'.
+ * **sig**: the signature, use 0 if you don't know what it is.
+
+ccv_dense_matrix_renew
+----------------------
+
+    ccv_dense_matrix_t* ccv_dense_matrix_renew(ccv_dense_matrix_t* x, int rows, int cols, int types, int prefer_type, uint64_t sig)
+
+Check the input matrix; if it is of an allowed type, return it, otherwise create one with prefer_type.
+
+ * **x**: the matrix to check.
+ * **rows**: rows of the matrix.
+ * **cols**: columns of the matrix.
+ * **types**: allowed types, it can be a mask of multiple types, e.g. CCV\_8U \| CCV\_32S allows both 8-bit unsigned integer type and 32-bit signed integer type.
+ * **prefer_type**: the default type, it can be only one type.
+ * **sig**: the signature, use 0 if you don't know what it is.
+
+ccv_dense_matrix
+----------------
+
+    ccv_dense_matrix_t ccv_dense_matrix(int rows, int cols, int type, void* data, uint64_t sig)
+
+This method will return a dense matrix allocated on the stack, with a data pointer to a custom memory region.
+
+ * **rows**: rows of the matrix.
+ * **cols**: columns of the matrix.
+ * **type**: the type of matrix.
+ * **data**: the data pointer that stores the actual matrix, it cannot be 0.
+ * **sig**: the signature, use 0 if you don't know what it is.
+
+ccv_make_matrix_mutable
+-----------------------
+
+    void ccv_make_matrix_mutable(ccv_matrix_t* mat)
+
+Mark the current matrix as mutable. Under the hood, it will set the matrix signature to 0, and mark the matrix as non-collectable.
+
+ * **mat**: the supplied matrix that will be marked as mutable.
+
+ccv_make_matrix_immutable
+-------------------------
+
+    void ccv_make_matrix_immutable(ccv_matrix_t* mat)
+
+Mark the current matrix as immutable. Under the hood, it will generate a signature for the matrix, and mark it as non-collectable. By convention, if the matrix is marked as immutable, you shouldn't change the content of the matrix, otherwise it will cause surprising behavior. If you want to change the content of the matrix, mark it as mutable first.
+
+ * **mat**: the supplied matrix that will be marked as immutable.
+
+ccv_sparse_matrix_new
+---------------------
+
+    ccv_sparse_matrix_t* ccv_sparse_matrix_new(int rows, int cols, int type, int major, uint64_t sig)
+
+Create a sparse matrix. ccv uses a double hash table for a memory-efficient and quick-access sparse matrix.
+
+ * **rows**: rows of the matrix.
+ * **cols**: columns of the matrix.
+ * **type**: the type of the matrix, the same as the dense matrix.
+ * **major**: either CCV\_SPARSE\_ROW\_MAJOR or CCV\_SPARSE\_COL\_MAJOR; it determines the underlying data structure of the sparse matrix (either using row or column as the first-level hash table).
+ * **sig**: the signature, use 0 if you don't know what it is.
+
+ccv_matrix_free_immediately
+---------------------------
+
+    void ccv_matrix_free_immediately(ccv_matrix_t* mat)
+
+Skip the garbage-collecting process and free the matrix immediately.
+
+ * **mat**: the matrix.
+
+ccv_matrix_free
+---------------
+
+    void ccv_matrix_free(ccv_matrix_t* mat)
+
+In principle, you should always use this method to free a matrix. If you enabled the cache in ccv, this method won't immediately free up the memory space of the matrix. Instead, it will push the matrix to a cache if applicable, so that if you want to create the same matrix again, ccv can shortcut the required matrix/image processing and return it from the cache.
+
+ * **mat**: the matrix.
+
+ccv_enable_cache
+----------------
+
+    void ccv_enable_cache(size_t size)
+
+Enable an application-wide cache for ccv. The cache is bounded by the given memory size.
+
+ * **size**: the upper limit of the cache, in bytes.
+
+ccv_enable_default_cache
+------------------------
+
+    void ccv_enable_default_cache(void)
+
+Enable an application-wide cache for ccv at the default memory bound (64MiB).
+
+ccv_drain_cache
+---------------
+
+    void ccv_drain_cache(void)
+
+Drain the cache.
+
+ccv_disable_cache
+-----------------
+
+    void ccv_disable_cache(void)
+
+Drain and disable the application-wide cache.
+
+ccv_matrix_generate_signature
+-----------------------------
+
+    uint64_t ccv_matrix_generate_signature(const char* msg, int len, uint64_t sig_start, ...)
+
+Generate a matrix signature based on the input message and other signatures. This is the core method for the ccv cache. In short, ccv performs a given image processing operation by first generating an appropriate signature for that operation. It requires 1) an operation-specific message, which can be generated by concatenating the operation name and parameters, and 2) the signature(s) of the input matrix(es). After that, ccv will look up the matrix in the cache with the newly generated signature. If it exists, ccv will return that matrix and skip the whole operation.
+
+ * **msg**: the concatenated message.
+ * **len**: message length.
+ * **sig_start**: the input matrix(es) signature, end the list with 0.
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-numeric.markdown b/site/_posts/0000-01-01-ccv-numeric.markdown
new file mode 100644
index 000000000..ab6d5ccde
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-numeric.markdown
@@ -0,0 +1,78 @@
+---
+layout: page
+lib: ccv
+slug: ccv-numeric
+status: publish
+title: lib/ccv_numeric.c
+desc: numerical algorithms
+categories:
+- lib
+---
+
+ccv_minimize
+------------
+
+    void ccv_minimize(ccv_dense_matrix_t* x, int length, double red, ccv_minimize_f func, ccv_minimize_param_t params, void* data)
+
+Line search to minimize a function with partial derivatives. It is modeled after [minimize.m](http://www.gatsby.ucl.ac.uk/~edward/code/minimize/example.html). A minimal sketch follows the parameter list.
+
+ * **x**: the input vector.
+ * **length**: the length of the line.
+ * **red**: the step size.
+ * **func**: int ccv\_minimize\_f(const ccv\_dense\_matrix\_t* x, double* f, ccv\_dense\_matrix\_t* df, void* data). Computes the function value and its partial derivatives.
+ * **params**: a **ccv\_minimize\_param\_t** structure that defines various aspects of the minimize function.
+ * **data**: any extra user data.
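+
+The following is an illustration only, minimizing f(x) = sum(x_i^2). It assumes a 1xN CCV_64F vector, that the callback returns 0 on success, and parameter values mirroring minimize.m's defaults; consult ccv.h for the authoritative definitions:
+
+    #include <ccv.h>
+
+    // f(x) = sum(x_i^2), df/dx_i = 2 * x_i
+    static int quadratic(const ccv_dense_matrix_t* x, double* f, ccv_dense_matrix_t* df, void* data)
+    {
+        int i;
+        *f = 0;
+        for (i = 0; i < x->cols; i++)
+        {
+            *f += x->data.f64[i] * x->data.f64[i];
+            df->data.f64[i] = 2 * x->data.f64[i];
+        }
+        return 0;
+    }
+
+    int main(void)
+    {
+        ccv_dense_matrix_t* x = ccv_dense_matrix_new(1, 8, CCV_64F | CCV_C1, 0, 0);
+        ccv_zero(x);
+        x->data.f64[0] = 1; // start away from the minimum
+        // assumed values, mirroring minimize.m's defaults
+        ccv_minimize_param_t params = { .interp = 0.1, .extrap = 3.0, .max_iter = 20, .ratio = 10.0, .rho = 0.05, .sig = 0.1 };
+        ccv_minimize(x, 10, 1.0, quadratic, params, 0);
+        ccv_matrix_free(x);
+        return 0;
+    }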
+
+ccv_minimize_param_t
+--------------------
+
+ * **interp**: interpolate value.
+ * **extrap**: extrapolate value.
+ * **max_iter**: maximum iterations.
+ * **ratio**: increase ratio.
+ * **rho**: decrease ratio.
+ * **sig**: sigma.
+
+ccv_filter
+----------
+
+    void ccv_filter(ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_dense_matrix_t** d, int type, int padding_pattern)
+
+Convolve dense matrix a with dense matrix b. This function has a soft dependency on [FFTW3](http://fftw.org/). If FFTW3 is not available, ccv will use [KissFFT](http://sourceforge.net/projects/kissfft/) shipped with it. FFTW3 is about 35% faster than KissFFT.
+
+ * **a**: dense matrix a.
+ * **b**: dense matrix b.
+ * **d**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **padding_pattern**: ccv doesn't support padding patterns for now.
+
+ccv_filter_kernel
+-----------------
+
+    void ccv_filter_kernel(ccv_dense_matrix_t* x, ccv_filter_kernel_f func, void* data)
+
+Fill a given dense matrix with a kernel function.
+
+ * **x**: the matrix to be filled.
+ * **func**: double ccv\_filter\_kernel\_f(double x, double y, void* data), computes the value at the given x, y.
+ * **data**: any extra user data.
+
+ccv_distance_transform
+----------------------
+
+    void ccv_distance_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_dense_matrix_t** x, int x_type, ccv_dense_matrix_t** y, int y_type, double dx, double dy, double dxx, double dyy, int flag)
+
+[Distance transform](https://en.wikipedia.org/wiki/Distance_transform). The current implementation follows [Distance Transforms of Sampled Functions](http://www.cs.cornell.edu/~dph/papers/dt.pdf). The dynamic programming technique has O(n) time complexity.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **x**: the x coordinate offset.
+ * **x_type**: the type of the output x coordinate offset, if 0, ccv will default to CCV\_32S \| CCV\_C1.
+ * **y**: the y coordinate offset.
+ * **y_type**: the type of the output y coordinate offset, if 0, ccv will default to CCV\_32S \| CCV\_C1.
+ * **dx**: the x coefficient.
+ * **dy**: the y coefficient.
+ * **dxx**: the x^2 coefficient.
+ * **dyy**: the y^2 coefficient.
+ * **flag**: CCV\_GSEDT, generalized squared Euclidean distance transform. CCV\_NEGATIVE, negate values in the input matrix for computation; effectively, this enables us to compute the maximum distance transform rather than the minimum (the default).
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-resample.markdown b/site/_posts/0000-01-01-ccv-resample.markdown
new file mode 100644
index 000000000..876a1f415
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-resample.markdown
@@ -0,0 +1,50 @@
+---
+layout: page
+lib: ccv
+slug: ccv-resample
+status: publish
+title: lib/ccv_resample.c
+desc: image resampling utilities
+categories:
+- lib
+---
+
+ccv_resample
+------------
+
+    void ccv_resample(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int btype, int rows, int cols, int type)
+
+Resample a given matrix to a different size. As for now, ccv only supports either downsampling (with CCV\_INTER\_AREA) or upsampling (with CCV\_INTER\_CUBIC).
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **btype**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **rows**: the new row count.
+ * **cols**: the new column count.
+ * **type**: for now, ccv supports CCV\_INTER\_AREA, which is an extension to [bilinear resampling](https://en.wikipedia.org/wiki/Bilinear_filtering) for downsampling, and CCV\_INTER\_CUBIC, [bicubic resampling](https://en.wikipedia.org/wiki/Bicubic_interpolation) for upsampling.
+
+ccv_sample_down
+---------------
+
+    void ccv_sample_down(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int src_x, int src_y)
+
+Downsample a given matrix to exactly half size with a [Gaussian filter](https://en.wikipedia.org/wiki/Gaussian_filter). The half size is approximated by floor(rows * 0.5) x floor(cols * 0.5).
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **src\_x**: shift the start point by src\_x.
+ * **src\_y**: shift the start point by src\_y.
+
+ccv_sample_up
+-------------
+
+    void ccv_sample_up(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, int src_x, int src_y)
+
+Upsample a given matrix to exactly double size with a [Gaussian filter](https://en.wikipedia.org/wiki/Gaussian_filter).
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **src\_x**: shift the start point by src\_x.
+ * **src\_y**: shift the start point by src\_y.
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-serve.markdown b/site/_posts/0000-01-01-ccv-serve.markdown
new file mode 100644
index 000000000..5aa38741b
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-serve.markdown
@@ -0,0 +1,108 @@
+---
+layout: page
+lib: ccv
+slug: ccv-serve
+status: publish
+title: serve/*.c
+desc: HTTP server
+categories:
+- lib
+---
+
+/bbf/detect.objects
+-------------------
+
+Using the [BBF](/doc/doc-bbf) classifier cascade to detect objects in a given image.
+
+ * **'source' or HTTP body**: the image.
+ * **'model'**: what object; for now, it only supports 'face'.
+
+You can look up the rest of the parameters at [ccv_bbf.c](/lib/ccv-bbf/#ccvbbfparamt).
+
+Supported methods: GET, POST
+
+/convnet/classify
+-----------------
+
+Using [ConvNet](/doc/doc-convnet) to categorize a given image.
+
+ * **'source' or HTTP body**: the image.
+ * **'model'**: what category set; for now, it only supports 'image-net'.
+ * **'top'**: the number of results returned, ordered by confidence score.
+
+Supported methods: GET, POST
+
+/dpm/detect.objects
+-------------------
+
+Using a [DPM](/doc/doc-dpm) mixture model to detect objects in a given image.
+
+ * **'source' or HTTP body**: the image.
+ * **'model'**: what object; it now supports 'pedestrian' and 'car'.
+
+You can look up the rest of the parameters at [ccv_dpm.c](/lib/ccv-dpm/#ccvdpmparamt).
+
+Supported methods: GET, POST
+
+/icf/detect.objects
+-------------------
+
+Using the [ICF](/doc/doc-icf) classifier cascade to detect objects in a given image.
+
+ * **'source' or HTTP body**: the image.
+ * **'model'**: what object; for now, it only supports 'pedestrian'.
+
+You can look up the rest of the parameters at [ccv_icf.c](/lib/ccv-icf/#ccvicfparamt).
+
+Supported methods: GET, POST
+
+/swt/detect.words
+-----------------
+
+Using [SWT](/doc/doc-swt) to detect words / texts in a given image.
+
+ * **'source' or HTTP body**: the image.
+
+You can look up the rest of the parameters at [ccv_swt.c](/lib/ccv-swt/#ccvswtparamt).
+
+Supported methods: GET, POST
+
+/tld/track.object
+-----------------
+
+Create a new [TLD](/doc/doc-tld) tracking instance with the initial frame.
+
+ * **'source' or HTTP body**: the initial frame image.
+ * **'x'**: the initial tracking rectangle's top left x coordinate.
+ * **'y'**: the initial tracking rectangle's top left y coordinate.
+ * **'width'**: the initial tracking rectangle's width.
+ * **'height'**: the initial tracking rectangle's height.
+
+You can look up the rest of the parameters at [ccv_tld.c](/lib/ccv-tld/#ccvtldparamt).
+
+Supported methods: GET, POST
+
+On success, it will return the new tracking instance with a 'Location' header; you can also find its ID in response['tld'].
+
+/tld/track.object/[\d+]
+-----------------------
+
+Continue a [TLD](/doc/doc-tld) tracking instance with follow-up frames.
+
+ * **'source' or HTTP body**: the next frame image.
+ * **'previous'**: the previous frame image; please make sure this is an exact copy of the frame you previously provided, otherwise the API will return 'false'.
+
+Supported methods: GET, POST, DELETE
+
+Please make sure that you DELETE the TLD tracking instance once you are done, otherwise the HTTP server cannot reclaim the memory it occupies.
+
+/sift
+-----
+
+Run [SIFT](/doc/doc-sift) feature point extraction on a given image.
+
+ * **'source' or HTTP body**: the image.
+
+You can look up the rest of the parameters at [ccv_sift.c](/lib/ccv-sift/#ccvsiftparamt).
+
+Supported methods: GET, POST
diff --git a/site/_posts/0000-01-01-ccv-sift.markdown b/site/_posts/0000-01-01-ccv-sift.markdown
new file mode 100644
index 000000000..7cb949174
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-sift.markdown
@@ -0,0 +1,33 @@
+---
+layout: page
+lib: ccv
+slug: ccv-sift
+status: publish
+title: lib/ccv_sift.c
+desc: scale invariant feature transform
+categories:
+- lib
+---
+
+ccv_sift
+--------
+
+    void ccv_sift(ccv_dense_matrix_t* a, ccv_array_t** _keypoints, ccv_dense_matrix_t** _desc, int type, ccv_sift_param_t params)
+
+Compute [SIFT](https://en.wikipedia.org/wiki/Scale-invariant_feature_transform) key-points.
+
+ * **a**: the input matrix.
+ * **keypoints**: the array of key-points, a ccv\_keypoint\_t structure.
+ * **desc**: the descriptor for each key-point.
+ * **type**: the type of the descriptor, if 0, ccv will default to CCV\_32F.
+ * **params**: a **ccv\_sift\_param\_t** structure that defines various aspects of the SIFT function.
+
+ccv_sift_param_t
+----------------
+
+ * **up2x**: whether to upscale the image for better SIFT accuracy.
+ * **noctaves**: number of octaves.
+ * **nlevels**: number of levels for each octave.
+ * **edge_threshold**: above this threshold, a point will be recognized as an edge and ignored.
+ * **peak_threshold**: above this threshold, a point will be recognized as a potential feature point.
+ * **norm_threshold**: if the norm of the descriptor is smaller than this threshold, it will be ignored.
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-swt.markdown b/site/_posts/0000-01-01-ccv-swt.markdown
new file mode 100644
index 000000000..6118cca0b
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-swt.markdown
@@ -0,0 +1,54 @@
+---
+layout: page
+lib: ccv
+slug: ccv-swt
+status: publish
+title: lib/ccv_swt.c
+desc: stroke width transform
+categories:
+- lib
+---
+
+ccv_swt
+-------
+
+    void ccv_swt(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, ccv_swt_param_t params)
+
+Compute the Stroke-Width-Transform image.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of the output matrix, if 0, ccv will default to CCV\_32S \| CCV\_C1.
+ * **params**: a **ccv\_swt\_param\_t** structure that defines various aspects of the SWT function.
+
+ccv_swt_param_t
+---------------
+
+ * **up2x**: upscale the image for better accuracy.
+ * **direction**: the SWT direction (black to white or white to black).
+ * **size**: parameter for the [Canny edge detector](/lib/ccv-classic).
+ * **low_thresh**: parameter for the [Canny edge detector](/lib/ccv-classic).
+ * **high_thresh**: parameter for the [Canny edge detector](/lib/ccv-classic).
+ * **max_height**: the maximum height for a letter.
+ * **min_height**: the minimum height for a letter.
+ * **aspect_ratio**: the maximum aspect ratio for a letter.
+ * **variance_ratio**: the inner-class variance when grouping letters.
+ * **thickness_ratio**: the allowable thickness variance when grouping letters.
+ * **height_ratio**: the allowable height variance when grouping letters.
+ * **intensity_thresh**: the allowable intensity variance when grouping letters.
+ * **distance_ratio**: the allowable distance variance when grouping letters.
+ * **intersect_ratio**: the allowable intersect variance when grouping letters.
+ * **elongate_ratio**: the allowable elongate variance when grouping letters.
+ * **letter_thresh**: the allowable letter threshold.
+ * **breakdown**: whether to break text lines down into words.
+ * **breakdown_ratio**: apply [Otsu](/lib/ccv-classic) and, if the inter-class variance is above this threshold, the line will be broken down into words.
+
+ccv_swt_detect_words
+--------------------
+
+    ccv_array_t* ccv_swt_detect_words(ccv_dense_matrix_t* a, ccv_swt_param_t params)
+
+Return an array of regions that are potentially text areas.
+
+ * **a**: the input matrix.
+ * **params**: a **ccv\_swt\_param\_t** structure that defines various aspects of the SWT function.
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-tld.markdown b/site/_posts/0000-01-01-ccv-tld.markdown
new file mode 100644
index 000000000..1410c442c
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-tld.markdown
@@ -0,0 +1,106 @@
+---
+layout: page
+lib: ccv
+slug: ccv-tld
+status: publish
+title: lib/ccv_tld.c
+desc: track learn detect
+categories:
+- lib
+---
+
+ccv_tld_new
+-----------
+
+    ccv_tld_t* ccv_tld_new(ccv_dense_matrix_t* a, ccv_rect_t box, ccv_tld_param_t params)
+
+Create a new TLD tracking instance from a given first frame image and the tracking rectangle. A typical new / track / free loop is sketched below.
+
+ * **a**: the first frame image.
+ * **box**: the initial tracking rectangle.
+ * **params**: a **ccv\_tld\_param\_t** structure that defines various aspects of the TLD tracker.
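+
+A minimal tracking loop might look like this (frame file names and the initial rectangle are hypothetical, the library-provided ccv_tld_default_params is assumed, and error checking is omitted):
+
+    #include <ccv.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        ccv_dense_matrix_t* prev = 0;
+        ccv_read("frame-0.png", &prev, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+        // start tracking whatever occupies the 60x80 rectangle at (140, 90)
+        ccv_tld_t* tld = ccv_tld_new(prev, ccv_rect(140, 90, 60, 80), ccv_tld_default_params);
+        int i;
+        for (i = 1; i < 100; i++)
+        {
+            char name[32];
+            snprintf(name, 32, "frame-%d.png", i);
+            ccv_dense_matrix_t* next = 0;
+            ccv_read(name, &next, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+            ccv_tld_info_t info;
+            // pass the previous frame back in: a TLD instance doesn't retain it
+            ccv_comp_t box = ccv_tld_track_object(tld, prev, next, &info);
+            printf("%d %d %d %d\n", box.rect.x, box.rect.y, box.rect.width, box.rect.height);
+            ccv_matrix_free(prev);
+            prev = next;
+        }
+        ccv_matrix_free(prev);
+        ccv_tld_free(tld);
+        return 0;
+    }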
+
+ccv_tld_param_t
+---------------
+
+TLD is complex enough that I will divide its parameters into sections:
+
+Short-term Lucas-Kanade tracking parameters
+
+ * **win_size**: the window size to compute optical flow
+ * **level**: level of image pyramids
+ * **min_eigen**: the minimal eigenvalue for a valid optical flow computation
+ * **min_forward_backward_error**: the minimal forward-backward error
+
+Image pyramid generation parameters (for scale-invariant object detection)
+
+ * **interval**: how many intermediate images are in between each image pyramid level (from width => width / 2)
+ * **shift**: how many steps the sliding window should move
+
+Sample generation parameters
+
+ * **min_win**: the minimal window size of patches for detection
+ * **include_overlap**: above this threshold, a bounding box will be positively identified as overlapping with the target
+ * **exclude_overlap**: below this threshold, a bounding box will be positively identified as not overlapping with the target
+
+Ferns classifier parameters
+
+ * **structs**: how many ferns are in the classifier
+ * **features**: how many features for each fern
+
+Nearest neighbor classifier parameters
+
+ * **validate_set**: the conservative confidence score will only be computed on a subset of all positive examples; this value gives how large that subset should be
+ * **nnc_same**: above this threshold, a given patch will be identified as the same
+ * **nnc_thres**: the initial threshold for positively recognizing a patch
+ * **nnc_verify**: the threshold for a tracking result from the short-term tracker to be verified as a positive detection
+ * **nnc_beyond**: the upper bound threshold for the adaptively computed threshold
+ * **nnc_collect**: a negative patch above this threshold will be collected as a negative example
+ * **bad_patches**: how many patches should be evaluated at initialization to collect enough negative examples
+
+Deformation parameters to apply perspective transforms on patches for robustness
+
+ * **new_deform**: the number of deformations to apply at initialization
+ * **new_deform_angle**: the maximal angle for x, y and z axis rotation at initialization
+ * **new_deform_scale**: the maximal scale for the deformation at initialization
+ * **new_deform_shift**: the maximal shift for the deformation at initialization
+ * **track_deform**: the number of deformations to apply at running time
+ * **track_deform_angle**: the maximal angle for x, y and z axis rotation at running time
+ * **track_deform_scale**: the maximal scale for the deformation at running time
+ * **track_deform_shift**: the maximal shift for the deformation at running time
+
+Speed-up parameters
+
+ * **top_n**: only keep this many positive detections when applying the ferns classifier
+ * **rotation**: when >= 1, use the "rotation" technique, which only evaluates a subset of sliding windows for each frame, but after rotation + 1 frames, every sliding window will have been evaluated in one of these frames.
+
+ccv_tld_track_object
+--------------------
+
+    ccv_comp_t ccv_tld_track_object(ccv_tld_t* tld, ccv_dense_matrix_t* a, ccv_dense_matrix_t* b, ccv_tld_info_t* info)
+
+ccv doesn't have retain / release semantics. Thus, a TLD instance cannot retain the most recent frame it tracks for future reference; you have to pass that in by yourself.
+
+ * **tld**: the TLD instance for continuous tracking
+ * **a**: the last frame used for tracking (ccv\_tld\_track\_object will check the signature of this against the last frame the TLD instance tracked)
+ * **b**: the new frame that will be tracked
+ * **info**: a **ccv\_tld\_info\_t** structure that will record several aspects of the current tracking
+
+ccv_tld_info_t
+--------------
+
+ * **perform_track**: whether we performed tracking or not this time
+ * **perform_learn**: whether we performed learning or not this time
+ * **track_success**: whether we have a successful tracking (thus, the short-term tracker works)
+ * **ferns_detects**: how many regions passed the ferns classifier
+ * **nnc_detects**: how many regions passed the nearest neighbor classifier
+ * **clustered_detects**: after clustering, how many regions are left
+ * **confident_matches**: how many matches we have outside of the tracking region (may cause a re-initialization of the short-term tracking)
+ * **close_matches**: how many matches we have inside the tracking region (may cause a new learning event)
+
+ccv_tld_free
+------------
+
+    void ccv_tld_free(ccv_tld_t* tld)
+
+ * **tld**: the TLD instance to be freed.
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-transform.markdown b/site/_posts/0000-01-01-ccv-transform.markdown
new file mode 100644
index 000000000..9e43af2f4
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-transform.markdown
@@ -0,0 +1,48 @@
+---
+layout: page
+lib: ccv
+slug: ccv-transform
+status: publish
+title: lib/ccv_transform.c
+desc: image transform utilities
+categories:
+- lib
+---
+
+ccv_decimal_slice
+-----------------
+
+    void ccv_decimal_slice(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, float y, float x, int rows, int cols)
+
+Similar to ccv\_slice, it will slice a given matrix into the required rows / cols, but it will interpolate the value with a bilinear filter if x and y are non-integer.
+
+ * **a**: the given matrix that will be sliced
+ * **b**: the output matrix
+ * **type**: the type of output matrix
+ * **y**: the top point to slice
+ * **x**: the left point to slice
+ * **rows**: the number of rows for the destination matrix
+ * **cols**: the number of cols for the destination matrix
+
+ccv_perspective_transform_apply
+-------------------------------
+
+    ccv_decimal_point_t ccv_perspective_transform_apply(ccv_decimal_point_t point, ccv_size_t size, float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
+
+Apply a [3D transform](https://en.wikipedia.org/wiki/Perspective_transform#Perspective_projection) to the given point in a given image size, assuming the field of view is 60 degrees.
+
+ * **point**: the point to be transformed in decimal
+ * **size**: the image size
+ * **m00, m01, m02, m10, m11, m12, m20, m21, m22**: the transformation matrix
+
+ccv_perspective_transform
+-------------------------
+
+    void ccv_perspective_transform(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type, float m00, float m01, float m02, float m10, float m11, float m12, float m20, float m21, float m22)
+
+Apply a [3D transform](https://en.wikipedia.org/wiki/Perspective_transform#Perspective_projection) to a given matrix, assuming the field of view is 60 degrees.
+
+ * **a**: the given matrix to be transformed
+ * **b**: the output matrix
+ * **type**: the type of output matrix
+ * **m00, m01, m02, m10, m11, m12, m20, m21, m22**: the transformation matrix
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-ccv-util.markdown b/site/_posts/0000-01-01-ccv-util.markdown
new file mode 100644
index 000000000..e0bdea36b
--- /dev/null
+++ b/site/_posts/0000-01-01-ccv-util.markdown
@@ -0,0 +1,271 @@
+---
+layout: page
+lib: ccv
+slug: ccv-util
+status: publish
+title: lib/ccv_util.c
+desc: data structure utilities
+categories:
+- lib
+---
+
+ccv_get_dense_matrix
+--------------------
+
+    ccv_dense_matrix_t* ccv_get_dense_matrix(ccv_matrix_t* mat)
+
+Check and get a dense matrix from a general matrix structure.
+
+ * **mat**: a general matrix.
+
+ccv_get_sparse_matrix
+---------------------
+
+    ccv_sparse_matrix_t* ccv_get_sparse_matrix(ccv_matrix_t* mat)
+
+Check and get a sparse matrix from a general matrix structure.
+
+ * **mat**: a general matrix.
+
+ccv_visualize
+-------------
+
+    void ccv_visualize(ccv_matrix_t* a, ccv_matrix_t** b, int type)
+
+Convert an input matrix into a matrix within the visual range, so that one can output it into a PNG file for inspection.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+
+ccv_zero
+--------
+
+    void ccv_zero(ccv_matrix_t* mat)
+
+Zero out a given matrix.
+
+ * **mat**: the given matrix.
+
+ccv_any_nan
+-----------
+
+    int ccv_any_nan(ccv_matrix_t *a)
+
+Check if there is any NaN value in the given matrix, and return its position.
+
+ * **a**: the given matrix.
+
+ccv_flatten
+-----------
+
+    void ccv_flatten(ccv_matrix_t* a, ccv_matrix_t** b, int type, int flag)
+
+If a given matrix has multiple channels, this function will compute a new matrix in which each cell is the sum of all channels in the same cell of the given matrix.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **flag**: ccv reserves this for now.
+
+ccv_shift
+---------
+
+    void ccv_shift(ccv_matrix_t* a, ccv_matrix_t** b, int type, int lr, int rr)
+
+Compute a new matrix in which each element is first left shifted and then right shifted.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **type**: the type of output matrix, if 0, ccv will try to match the input matrix for appropriate type.
+ * **lr**: left shift amount.
+ * **rr**: right shift amount.
+
+ccv_get_sparse_matrix_vector
+----------------------------
+
+    ccv_dense_vector_t* ccv_get_sparse_matrix_vector(ccv_sparse_matrix_t* mat, int index)
+
+Get a vector for a sparse matrix.
+
+ * **mat**: the sparse matrix.
+ * **index**: the index of that vector.
+
+ccv_get_sparse_matrix_cell
+--------------------------
+
+    ccv_matrix_cell_t ccv_get_sparse_matrix_cell(ccv_sparse_matrix_t* mat, int row, int col)
+
+Get a cell from a sparse matrix.
+
+ * **mat**: the sparse matrix.
+ * **row**: the row index.
+ * **col**: the column index.
+
+ccv_set_sparse_matrix_cell
+--------------------------
+
+    void ccv_set_sparse_matrix_cell(ccv_sparse_matrix_t* mat, int row, int col, void* data)
+
+Set a cell for a sparse matrix. See the sketch below for a set / get round trip.
+
+ * **mat**: the sparse matrix.
+ * **row**: the row index.
+ * **col**: the column index.
+ * **data**: the data pointer.
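+
+For example, a set / get round trip on a 32-bit float sparse matrix; the union member you read from is assumed to match the matrix type:
+
+    #include <ccv.h>
+    #include <stdio.h>
+
+    int main(void)
+    {
+        ccv_sparse_matrix_t* mat = ccv_sparse_matrix_new(1000, 1000, CCV_32F | CCV_C1, CCV_SPARSE_ROW_MAJOR, 0);
+        float v = 0.5;
+        ccv_set_sparse_matrix_cell(mat, 3, 7, &v);
+        ccv_matrix_cell_t cell = ccv_get_sparse_matrix_cell(mat, 3, 7);
+        if (cell.u8) // the pointer is 0 when the cell has never been set
+            printf("%f\n", cell.f32[0]);
+        ccv_matrix_free(mat);
+        return 0;
+    }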
+
+ccv_compress_sparse_matrix
+--------------------------
+
+    void ccv_compress_sparse_matrix(ccv_sparse_matrix_t* mat, ccv_compressed_sparse_matrix_t** csm)
+
+Transform a sparse matrix into a compressed representation.
+
+ * **mat**: the sparse matrix.
+ * **csm**: the compressed matrix.
+
+ccv_decompress_sparse_matrix
+----------------------------
+
+    void ccv_decompress_sparse_matrix(ccv_compressed_sparse_matrix_t* csm, ccv_sparse_matrix_t** smt)
+
+Transform a compressed matrix into a sparse matrix.
+
+ * **csm**: the compressed matrix.
+ * **smt**: the sparse matrix.
+
+ccv_matrix_eq
+-------------
+
+    int ccv_matrix_eq(ccv_matrix_t* a, ccv_matrix_t* b)
+
+Compare whether two matrices are equal (including type). Returns 0 if they are.
+
+ * **a**: the input matrix a.
+ * **b**: the input matrix b.
+
+ccv_slice
+---------
+
+    void ccv_slice(ccv_matrix_t* a, ccv_matrix_t** b, int btype, int y, int x, int rows, int cols)
+
+Slice an input matrix given x, y and row, column size.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **btype**: the type of output matrix, if 0, ccv will use the input matrix type.
+ * **y**: y coordinate.
+ * **x**: x coordinate.
+ * **rows**: row size of the targeted matrix.
+ * **cols**: column size of the targeted matrix.
+
+ccv_border
+----------
+
+    void ccv_border(ccv_matrix_t* a, ccv_matrix_t** b, ccv_margin_t margin)
+
+Add a border to the input matrix.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **margin**: left, top, right, bottom width for the border.
+
+ccv_move
+--------
+
+    void ccv_move(ccv_matrix_t* a, ccv_matrix_t** b, int btype, int y, int x)
+
+Offset the input matrix by x, y.
+
+ * **a**: the input matrix.
+ * **b**: the output matrix.
+ * **btype**: the type of output matrix, if 0, ccv will use the input matrix type.
+ * **y**: b(0, 0) = a(x, y).
+ * **x**: b(0, 0) = a(x, y).
+
+ccv_array_new
+-------------
+
+    ccv_array_t* ccv_array_new(int rnum, int rsize)
+
+Create a new, self-growing array.
+
+ * **rnum**: the initial capacity of the array.
+ * **rsize**: the size of each element in the array.
+
+ccv_array_push
+--------------
+
+    void ccv_array_push(ccv_array_t* array, void* r)
+
+Push a new element into the array.
+
+ * **array**: the array.
+ * **r**: the pointer to the new element; it will then be copied into the array.
+
+ccv_array_zero
+--------------
+
+    void ccv_array_zero(ccv_array_t* array)
+
+Zero out the array; it won't change array->rnum, however.
+
+ * **array**: the array.
+
+ccv_array_clear
+---------------
+
+    void ccv_array_clear(ccv_array_t* array)
+
+Clear the array; it will reset array->rnum to 0.
+
+ * **array**: the array.
+
+ccv_array_free
+--------------
+
+    void ccv_array_free(ccv_array_t* array)
+
+Free up the array.
+
+ * **array**: the array.
+
+ccv_array_group
+---------------
+
+    int ccv_array_group(ccv_array_t* array, ccv_array_t** index, ccv_array_group_f gfunc, void* data)
+
+Group elements in the array by their similarity.
+
+ * **array**: the array.
+ * **index**: the output index; elements in the same group will have the same index.
+ * **gfunc**: int ccv\_array\_group\_f(const void* a, const void* b, void* data). Return 1 if a and b are in the same group.
+ * **data**: any extra user data.
+
+ccv_contour_new
+---------------
+
+    ccv_contour_t* ccv_contour_new(int set)
+
+Create a new contour object.
+
+ * **set**: the initial capacity of the contour.
+
+ccv_contour_push
+----------------
+
+    void ccv_contour_push(ccv_contour_t* contour, ccv_point_t point)
+
+Push a point into the contour object.
+
+ * **contour**: the contour.
+ * **point**: the point.
+
+ccv_contour_free
+----------------
+
+    void ccv_contour_free(ccv_contour_t* contour)
+
+Free up the contour object.
+
+ * **contour**: the contour.
diff --git a/site/_posts/0000-01-01-doc-bbf.markdown b/site/_posts/0000-01-01-doc-bbf.markdown
new file mode 100644
index 000000000..0a87720cf
--- /dev/null
+++ b/site/_posts/0000-01-01-doc-bbf.markdown
@@ -0,0 +1,146 @@
+---
+layout: page
+doc: ccv
+slug: bbf
+status: publish
+title: "BBF: Brightness Binary Feature"
+categories:
+- doc
+---
+
+[Library Reference: ccv_bbf.c](/lib/ccv-bbf/)
+
+What's BBF?
+-----------
+
+The original paper refers to:
+YEF∗ Real-Time Object Detection, Yotam Abramson and Bruno Steux
+
+The improved version refers to:
+High-Performance Rotation Invariant Multiview Face Detection, Chang Huang, Haizhou Ai, Yuan Li and Shihong Lao
+
+How it works?
+-------------
+
+That's a long story, please read the paper. But at least I can show you how to
+use the magic:
+
+    ./bbfdetect <Your Image contains Faces> ../samples/face | ./bbfdraw.rb <Your Image contains Faces> output.png
+
+Check out output.png, now you get the idea.
+
+What about the performance?
+---------------------------
+
+The tests are performed with the MIT+CMU face detection dataset
+(http://vasc.ri.cmu.edu/idb/html/face/frontal_images/index.html)
+
+**Setup**:
+
+Download the tarball, copy out the files in newtest/, test/ and test-low/ to a single
+folder, let's say: all/. Since ccv doesn't support the gif format, you need to do the file
+format conversion on your own. If you have ImageMagick, it is handy:
+
+    for i in *.gif; do convert $i `basename $i .gif`.png; done;
+
+For the ground truth data, you can copy them out from
+http://vasc.ri.cmu.edu/idb/images/face/frontal_images/list.html Only Test Set A,
+B, C are needed.
+
+bbfdetect needs a list of files; you can generate it by running this command in the
+same directory as the bbfdetect binary:
+
+    find <the directory>/*.png > filelist.txt
+
+**Speed-wise**:
+
+run
+
+    time ./bbfdetect filelist.txt ../samples/face > result.txt
+
+On my computer, it reports:
+
+    real    0m9.304s
+    user    0m9.270s
+    sys     0m0.010s
+
+How about OpenCV's face detector? I ran OpenCV with the default setting on the same
+computer, and it reports:
+
+    real    0m27.977s
+    user    0m27.860s
+    sys     0m0.050s
+
+You see the difference.
+
+**Accuracy-wise**:
+
+I wrote a little script called bbfvldr.rb that can check the output of bbfdetect
+against the ground truth. Before running the script, you need to do some house-cleaning
+work on result.txt:
+
+Basically, result.txt will contain the full path to each file, of which we only
+need the filename. Use your favorite editor to remove the directory
+information; for me, it is:
+
+    sed -i "s/\.\.\/test\/faces\///g" result.txt
+
+Suppose you have copied the ground truth to the truth.txt file; run the validator:
+
+    ./bbfvldr.rb truth.txt result.txt
+
+My result for bbfdetect is:
+
+    82.97% (12)
+
+The former number is the detection rate (how many faces are detected), the latter is
+the number of false alarms (how many non-face regions are detected as faces).
+
+The result for OpenCV's default face detector is:
+
+    86.69% (15)
+
+Well, we are a little behind, but you can train the detector yourself, just get
+a better data source!
+
+How to train my own detector?
+-----------------------------
+
+In this chapter, I will go over how I trained the face detector myself. To be
+honest, I lost my face detector training data several years ago. Just like
+everyone else, I have to download it somewhere.
+In the end, I settled with LFW
+(http://vis-www.cs.umass.edu/lfw/). Technically, it is a dataset for face
+recognition, so there are fewer variations. But that's the largest dataset I could
+find to download. I downloaded the aligned data, cropped with random rotation,
+translation and scale variations, and got 13125 faces in 24x24 size.
+
+bbfcreate also requires negative images; as it just so happened, I have about 8000
+natural scene images that contain no faces, downloaded from Flickr. OK, now I
+have all the data, what's next?
+
+First, you need to create a directory called data/ under the same directory as
+bbfcreate. Then, you need to create two file lists of positive data and negative
+images; for me, it is:
+
+    find ../data/faces/*.bmp > faces.dat
+    find ../data/negs/*.jpg > negs.dat
+
+That's all! Just find a computer powerful enough and run the following line for several
+days:
+
+    ./bbfcreate --positive-list faces.dat --background-list negs.dat --negative-count 26250 --working-dir data
+
+The --negative-count parameter denotes how many negative samples are extracted for each round;
+experimentally, it is about twice the number of your positive ones.
+
+If you configure the makefile well, bbfcreate will use OpenMP to speed up, which will
+eat up all the CPUs. My own training process took about one week on an extremely
+powerful desktop PC; you should expect weeks for the result on a modest PC with this many
+samples.
+
+You can stop bbfcreate at any time you want; the most recent result will be saved
+in the data/ directory. Clean up the directory to restart.
+
+I probably will implement MPI support in the near future so that you can run this with
+many computers in parallel, but who nowadays has an OpenMPI setup besides supercomputing
+centers?
diff --git a/site/_posts/0000-01-01-doc-cache.markdown b/site/_posts/0000-01-01-doc-cache.markdown
new file mode 100644
index 000000000..2fe26589c
--- /dev/null
+++ b/site/_posts/0000-01-01-doc-cache.markdown
@@ -0,0 +1,55 @@
+---
+layout: page
+doc: ccv
+slug: cache
+status: publish
+title: "Cache: We are Terrible Magicians"
+categories:
+- doc
+---
+
+ccv uses an application-wide transparent cache to de-duplicate matrix computations.
+In the following chapters, I will try to outline how it works, and expose you
+to the inner workings of ccv's core functionality.
+
+Initial Signature
+-----------------
+
+**ccv_make_matrix_immutable** computes the SHA-1 hash of the matrix raw data, and will
+use the first 64 bits as the signature for that matrix.
+
+Derived Signature
+-----------------
+
+A derived signature is computed from the specific operation that is going to be performed.
+For example, matrix A and matrix B are used to generate matrix C through operation X.
+C's signature is derived from A, B and X.
+
+A Radix-tree LRU Cache
+----------------------
+
+ccv uses a custom radix-tree implementation with generation information. It imposes
+a hard limit on memory usage of 64 MiB; you can adjust this value if you like.
+The custom radix-tree data structure is specifically designed to satisfy our 64-bit
+signature design. If compiled with jemalloc, it can be both fast and memory-efficient.
+
+Garbage Collection
+------------------
+
+The matrix signature is important. For every matrix that is freed with the **ccv_matrix_free**
+directive, ccv will first check the signature. If it is a derived signature,
+**ccv_matrix_free** won't free that matrix to the OS immediately; instead, it will put
+that matrix back into the application-wide cache.
+Sparse matrices, and matrices without a signature / with an initial signature, will be freed
+immediately.
+
+Shortcut
+--------
+
+For an operation X performed with matrices A and B, ccv will first generate the derived
+signature. The signature will be searched for in the application-wide cache in the hope
+of finding a result matrix. If such a matrix C is found, operation X will take
+a shortcut and return that matrix to the user. Otherwise, it will allocate such a matrix,
+set the proper signature on it and perform the operation honestly.
+
+After finishing this, I found that it may not be the most interesting bit of ccv.
+But still, I hope you found it otherwise :-)
\ No newline at end of file
diff --git a/site/_posts/0000-01-01-doc-convnet.markdown b/site/_posts/0000-01-01-doc-convnet.markdown
new file mode 100644
index 000000000..d393c8622
--- /dev/null
+++ b/site/_posts/0000-01-01-doc-convnet.markdown
@@ -0,0 +1,281 @@
+---
+layout: page
+doc: ccv
+slug: convnet
+status: publish
+title: "ConvNet: Deep Convolutional Networks"
+categories:
+- doc
+---
+
+[Library Reference: ccv_convnet.c](/lib/ccv-convnet/)
+
+What's ConvNet?
+---------------
+
+A convolutional neural network is a specific artificial neural network topology that
+is inspired by the biological visual cortex and tailored for computer vision tasks by
+Yann LeCun in the early 1990s. See for introduction.
+
+The convolutional neural network implemented in ccv is based on Alex Krizhevsky's
+ground-breaking work presented in:
+
+ImageNet Classification with Deep Convolutional Neural Networks, Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton, NIPS 2012
+
+The parameters are modified based on Matthew D. Zeiler's work presented in:
+
+Visualizing and Understanding Convolutional Networks, Matthew D. Zeiler, and Rob Fergus, Arxiv 1311.2901 (Nov 2013)
+
+How it works?
+-------------
+
+Long story short, with advances in GPGPU programming, we can have very large neural networks
+(with over 50 million parameters) trained on millions of images. It turns out that once you
+have both, and a bag of tricks (dropout, pooling etc.), the resulting neural networks can achieve
+good image classification results.
+
+    ./cnnclassify ../samples/dex.png ../samples/image-net-2010.sqlite3 | ./cnndraw.rb ../samples/image-net-2010.words ../samples/dex.png output.png
+
+Check output.png, the neural networks suggest a few possible relevant classes in the top
+left chart.
+
+What about the performance?
+---------------------------
+
+ConvNet on a very large scale is not extremely fast. There are a few implementations available
+for ConvNet that focus on speed performance, such as [Caffe from Berkeley](http://caffe.berkeleyvision.org/),
+or [OverFeat from NYU](http://cilvr.nyu.edu/doku.php?id=software:overfeat:start). Although not
+explicitly optimized for speed (ccv chooses correctness over speed in this preliminary implementation),
+the ConvNet implementation presented in ccv is speed-wise in line with other implementations.
+
+Therefore, the analysis related to performance is implemented on the ImageNet dataset and the network
+topology follows the exact specification detailed in the paper.
+
+Accuracy-wise:
+
+The test is performed on the ILSVRC 2010 test dataset and the ILSVRC 2012 validation dataset.
+
+For the ILSVRC 2010 dataset, the training stopped improving at around 60 epochs; at that time, the central
+patch from the test set obtained a 36.56% top-1 missing rate (lower is better) and the training set
+obtained a 32.2% top-1 missing rate.
+In Alex's paper, they reported a 37.5% top-1 missing rate when
+averaging 10 patches, and a 39% top-1 missing rate when using the central patch on the test set.
+
+For the ILSVRC 2012 dataset, the training stopped improving at around 70 epochs; at that time, the central
+patch from the validation set obtained a 41.4% top-1 missing rate (lower is better) and the training set
+obtained a 37.8% top-1 missing rate. In Alex's paper, they reported a 40.5% top-1 missing rate when
+averaging 10 patches. In Matt's paper, they reported a 38.4% top-1 missing rate when using 1 convnet as
+configured in Fig.3 and averaging 10 patches.
+
+Assuming you have the ILSVRC 2012 validation set files ordered in image-net-2012-val.txt, run
+
+    ./cnnclassify image-net-2012-val.txt ../samples/image-net-2012.sqlite3 > image-net-2012-classify.txt
+
+For the complete validation set to finish, this command takes half an hour on GPU, and if you don't have GPU
+enabled, it will take about half a day to run on CPU.
+
+Assuming you have the ILSVRC 2012 validation ground truth data in LSVRC2012_val_ground_truth.txt,
+
+    ./cnnvldtr.rb LSVRC2012_val_ground_truth.txt image-net-2012-classify.txt
+
+will report the top-1 missing rate as well as the top-5 missing rate.
+
+For the 32-bit floating point image-net-2012.sqlite3 on GPU, the top-1 missing rate is 38.17%, 2.33% better
+than Alex's result with 1 convnet, and 0.23% better than Matt's result with 1 convnet configured
+with Fig.3. The top-5 missing rate is 16.22%, 1.98% better than Alex's and 0.28% better than Matt's.
+For the half-precision image-net-2012.sqlite3 (the one included in ./samples/), the top-1 missing rate is
+38.18% and the top-5 missing rate is 16.17%.
+
+See for the current state-of-the-art,
+ccv's implementation is still about 5% behind Clarifai (Matt's commercial implementation, later claimed
+to be 10.7%: ) and 2% behind OverFeat on the top-5 missing rate.
+
+For the 32-bit floating point image-net-2012.sqlite3 on CPU, the top-1 missing rate is 38.51%, and the top-5
+missing rate is 16.57%.
+
+For the 32-bit floating point image-net-2010.sqlite3 on GPU, the top-1 missing rate is 33.91%, and the top-5
+missing rate is 14.08%.
+
+You can download the 32-bit floating point versions with ./samples/download-image-net.sh
+
+Speed-wise:
+
+The experiment was conducted on a computer with a Core i7 3770, an NVIDIA TITAN graphic card at stock
+frequency, and a Samsung MZ-7TE500BW 500GiB SSD, with clang, libdispatch, libatlas and the GNU
+Scientific Library.
+
+The CPU version of the forward pass (from RGB image input to the classification result) takes about
+700ms per image. This is achieved with multi-threaded convolutional kernel computation. Decaf (
+the CPU counterpart of Caffe) reported their forward pass at around 0.5s per image with
+unspecified hardware over 10 patches (the same as ccv's cnnclassify implementation). I cannot
+get a sensible number off OverFeat on my machine (it reports about 1.4s for the forward pass, which
+makes little sense). Their reported number is 1s per image on an unspecified configuration with
+unspecified hardware (I suspect that their unspecified configuration does much more than the
+averaging of 10 patches that ccv or Decaf does).
+
+For AlexNet, the GPU version does a forward pass + backward error propagation for a batch size of 256
+in about 1.6s. Thus, training the ImageNet convolutional network takes about 9 days with 100 epochs.
+Caffe reported their forward pass + backward error propagation for a batch size of 256 in about 1.8s
+on a Tesla K20 (known to be about 30% slower across the board than TITAN).
In the paper, Alex
+reported 90 epochs within 6 days on two GeForce 580s. In "Multi-GPU Training of ConvNets" (Omry Yadan,
+Keith Adams, Yaniv Taigman, and Marc'Aurelio Ranzato, arXiv:1312.5853), Omry mentioned that they did
+100 epochs of AlexNet in 10.5 days on 1 GPU, which suggests my time is in line with these
+implementations.
+
+For MattNet, the GPU version does the forward pass + backward error propagation for a batch size of 128
+in about 1.0s.
+
+As a preliminary implementation, I haven't spent much time optimizing these operations in ccv, if
+any at all. For example, [cuda-convnet](http://code.google.com/p/cuda-convnet/) implements its
+functionality in about 10,000 lines of code, and Caffe in about 14,000 lines of code; as of
+this release, ccv does it in about 4,300 lines of code. For the future, the low-hanging
+optimization opportunities include using SIMD instructions, doing FFT in densely convolved layers,
+etc.
+
+How to train my own image classifier?
+-------------------------------------
+
+First, you need to figure out your network topology. For all intents and purposes, I will walk you
+through how to train with the ImageNet LSVRC 2010 data.
+
+You need three things: the actual ImageNet dataset (and metadata), a CUDA GPU with no less than 6GiB
+of on-board memory, and a sufficiently large SSD to hold the ImageNet dataset (otherwise loading data
+from your rotational disk will take more time than the actual computation).
+
+I downloaded the ImageNet dataset from this torrent:
+
+Assuming you've downloaded / bought all these and installed them on your computer, get a hot tea; it will
+take a while to get all the puzzles and riddles in place before the training starts.
+
+The ImageNet metadata for the 2010 challenge can be downloaded from
+
+
+Unfortunately, the metadata is stored in Matlab's proprietary format, so there is some conversion work
+to be done. Here I will demonstrate how to use Octave to do this. Installing Octave on a Linux-like system
+is easy; for me on Ubuntu, it is about one line:
+
+    sudo apt-get install octave
+
+Assuming you've downloaded devkit-1.0 from the above link and found the meta.mat file somewhere in that
+tarball, launch the Octave interactive environment and run:
+
+    file = fopen('meta.txt', 'w+')
+    for i = 1:1000
+        fprintf(file, "%d %s %d\n", synsets(i).ILSVRC2010_ID, synsets(i).WNID, synsets(i).num_train_images)
+    endfor
+    fclose(file)
+
+The newly created meta.txt file will give us the class id, the WordNet id, and the number of training
+images available for each class.
+
+The ImageNet data downloaded from the torrent puts the training images into directories named by the
+WordNet ids.
+
+    find /train/ -name "*.JPEG" > train-file.txt
+
+I use this script to generate the format that ccv understands:
+
+The test dataset is ordered numerically; thus,
+
+    find /test/ -name "*.JPEG" > test-file.txt
+
+will generate a file list corresponding to ILSVRC2010_test_ground_truth.txt for class ids.
+
+This script: will generate the plain text that ccv understands
+for tests.
+
+These images first need to be pre-processed to the correct size for training.
+
+I partially replaced ./bin/image-net.c with this snippet: to
+generate files suffixed with ".resize.png".
Compile and run:
+
+    ./image-net --train-list ~/Fast/imageNet/train-file.txt --test-list ~/Fast/imageNet/test-file.txt --base-dir ~/Fast/imageNet --working-dir image-net.sqlite3
+
+The resize will take about 3 hours, and after that, train.txt and test.txt are generated from
+train-file.txt and test-file.txt by suffixing .resize.png on every line.
+
+Now, everything is ready. Assuming you have a TITAN GPU as I do, it takes 9 days. Following Alex's
+procedure, the learn_rate is decreased three times: for the specific image-net.sqlite3 you see in
+./samples, I started with a learn_rate of 0.01, decreased it to 0.001 at the 30th epoch, to 0.0001 at
+the 60th epoch, and to 0.00001 at the 80th epoch.
+
+The generated image-net.sqlite3 file is about 600MiB in size because it contains the data needed to
+resume training. You can open this file with the sqlite3 command-line tool (it is a vanilla sqlite
+database file) and do:
+
+    drop table function_state;
+    drop table momentum_data;
+    vacuum;
+
+The file size will shrink to about 200MiB. You can achieve a further reduction in file size by rewriting
+it into half precision, with ccv_convnet_write and write_param.half_precision = 1. The resulting
+image-net.sqlite3 is exactly what I included in ./samples.
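+
+In code, that half-precision rewrite is only a few lines. A minimal sketch (the write parameter
+struct is per this release's ccv.h):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    int main(void)
+    {
+        // read the full 32-bit float model, write it back out in half precision
+        ccv_convnet_t* convnet = ccv_convnet_read(0, "image-net.sqlite3");
+        ccv_convnet_write_param_t write_param = {
+            .half_precision = 1,
+        };
+        ccv_convnet_write(convnet, "image-net-half.sqlite3", write_param);
+        ccv_convnet_free(convnet);
+        return 0;
+    }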
+
+Can I use the ImageNet pre-trained data model?
+----------------------------------------------
+
+ccv is released under the FreeBSD 3-clause license, and the pre-trained data models ./samples/image-net-2010.sqlite3
+and ./samples/image-net-2012.sqlite3 are released under the Creative Commons Attribution 4.0 International License.
+You can use them and modify them practically anywhere and anyhow, with proper attribution. As far as I can tell, this is
+the first pre-trained data model released under a commercial-friendly license (Caffe itself is released under the
+FreeBSD license but its pre-trained data model is "research only", and OverFeat is released under a custom
+research-only license).
+
+Differences between ccv's implementation, Caffe's, Alex's and Matt's
+--------------------------------------------------------------------
+
+Although the network topology of ccv's implementation follows Matt's closely, the reported results
+diverged significantly enough for me to document the differences in implementation details.
+
+Network Topology:
+
+ccv's local response normalization layer follows the convolutional layer, and the pooling layer comes after
+the local response normalization. This is briefly mentioned in Alex's paper, but in Caffe, the local
+response normalization layer follows the pooling layer.
+
+The input dimension to ccv's implemented network is 225x225, and in Caffe, it is 227x227. Alex's paper
+as well as Matt's mention an input size of 224x224. 225x225 implies a 1-pixel padding around
+the input image such that with a 7x7 filter and stride 2, a 111x111 output will be generated. However,
+the output of the first convolutional layer in Matt's paper is 110x110.
+
+Data Preparation:
+
+Caffe's implementation resizes the image to 256x256 without retaining the aspect ratio. Alex's implementation
+resizes the image such that the minimal dimension is 256 while retaining the aspect ratio (at least
+as the paper implies) and crops the image to 256x256. ccv's implementation resizes the image such that the
+minimal dimension is 257 while retaining the aspect ratio (downsampling with CCV_INTER_AREA
+interpolation and upsampling with CCV_INTER_CUBIC interpolation if needed). ccv's implementation obtains
+the mean image from center-cropped 257x257 images.
+
+Data Augmentation:
+
+Caffe's implementation randomly crops the image from 256x256 to 227x227. Alex's implementation randomly crops
+the image from 256x256 to 224x224 and then applies color augmentation with a Gaussian random coefficient sampled
+with sigma == 0.1. ccv's implementation randomly crops the image from the aspect-retained sizes to 257x257,
+subtracts the mean image and then randomly crops it to 225x225; color augmentation is applied with a Gaussian
+random coefficient sampled with sigma == 0.001. All three implementations do horizontal mirroring as a
+data augmentation technique.
+
+Averaged Classification:
+
+Caffe averages the softmax output of 10 patches from the test image by first resizing the image to 256x256 without
+retaining the aspect ratio; the first 5 patches of size 227x227 are cropped from the top left, top right, center,
+bottom left and bottom right of the resized test image, and the second 5 patches are the horizontal mirrors of the
+first 5 patches.
+
+Alex's implementation averages the softmax output of 10 patches from the test image by first resizing the image
+such that the minimal dimension is 256 while retaining the aspect ratio, and then center-cropping to 256x256.
+The 10 patches of size 224x224 are sampled from the 256x256 crop the same way as Caffe does.
+
+ccv's GPU implementation averages the softmax output of 30 patches from the test image by first resizing the image
+such that the minimal dimension is 257. Then it makes 3 crops from the top left, center, and bottom right
+so that each cropped image is 257x257. The mean image is subtracted from the cropped images, and then each is
+cropped from the top left, top right, center, bottom left and bottom right into 225x225. This generates 15 patches,
+and each one of them has its horizontally-mirrored counterpart.
+
+ccv's CPU implementation, for efficiency considerations, averages the softmax output of 10 patches from the test
+image by first resizing the image such that the minimal dimension is 257. The mean image is upsampled
+to the same size with CCV_INTER_CUBIC and then subtracted from the resized image. The top left, top right,
+center, bottom left and bottom right patches of 225x225 are extracted and horizontally mirrored to generate the 10
+patches.
+ diff --git a/site/_posts/0000-01-01-doc-dpm.markdown b/site/_posts/0000-01-01-doc-dpm.markdown new file mode 100644 index 000000000..4af91beec --- /dev/null +++ b/site/_posts/0000-01-01-doc-dpm.markdown @@ -0,0 +1,137 @@
+---
+layout: page
+doc: ccv
+slug: dpm
+status: publish
+title: "DPM: Deformable Parts Model"
+categories:
+- doc
+---
+
+[Library Reference: ccv_dpm.c](/lib/ccv-dpm/)
+
+What's DPM?
+-----------
+
+The original paper refers to:
+Object Detection with Discriminatively Trained Part-Based Models, Pedro F. Felzenszwalb, Ross B. Girshick, David McAllester and Deva Ramanan
+
+How it works?
+-------------
+
+That's a long story. At a very high level, DPM assumes an object is constructed from
+its parts. Thus, the detector will first find a match of the whole object, and then
+use its part models to fine-tune the result. For details, please read the
+paper. What I can show you is how to use it:
+
+    ./dpmdetect ../samples/pedestrian.m | ./dpmdraw.rb output.png
+
+Check out output.png; see what happens?
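+
+If you'd rather call DPM from C than through ./dpmdetect, here is a minimal sketch modeled on
+bin/dpmdetect.c (hedged: check ccv.h for the authoritative signatures in this release):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    int main(int argc, char** argv)
+    {
+        // argv[1] is the image to run detection on
+        ccv_dpm_mixture_model_t* model = ccv_dpm_read_mixture_model("../samples/pedestrian.m");
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(argv[1], &image, CCV_IO_ANY_FILE);
+        // 1 = number of mixture models passed in; defaults set interval, threshold etc.
+        ccv_array_t* seq = ccv_dpm_detect_objects(image, &model, 1, ccv_dpm_default_params);
+        int i;
+        for (i = 0; i < seq->rnum; i++)
+        {
+            ccv_root_comp_t* comp = (ccv_root_comp_t*)ccv_array_get(seq, i);
+            printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y,
+                   comp->rect.width, comp->rect.height, comp->classification.confidence);
+        }
+        ccv_array_free(seq);
+        ccv_matrix_free(image);
+        ccv_dpm_mixture_model_free(model);
+        return 0;
+    }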
+
+What about performance?
+-----------------------
+
+DPM is not known for its speed. Its ability to identify difficult objects is
+the selling point. However, this implementation tries to optimize for speed as
+well. For a 640x480 photo, this implementation finishes in about one second,
+without multi-thread support.
+
+Accuracy-wise:
+
+There are two off-the-shelf implementations. One is the DPM in Matlab from the inventor,
+the other is the HOG detector from OpenCV. For the task of detecting pedestrians in a
+given image, we use the INRIA 2008 dataset, which provides both training and testing
+data. With OpenCV's stock peopledetect sample program (scale factor changed to 1.09
+in order to match our DPM setting (interval = 8)), we get:
+
+    47.37% (133)
+
+The former number is the detection rate (how many objects have been successfully
+detected), the latter is the number of false alarms (detected regions that don't
+contain the expected object).
+
+The dpmvldtr.rb script compares the ground truth bounding box with the bounding box
+detected by OpenCV; if the overlap area is larger than 60% of the bigger bounding box
+area among the two, it is counted as a true positive. Otherwise, it is
+counted as a false positive (false alarm).
+
+The other implementation is from the DPM inventor; it is a Matlab implementation,
+and the author has a specially trained detector for the INRIA 2008 dataset (at -0.3
+threshold).
+
+    75.38% (55)
+
+The DPM implementation in ccv was trained for three days using the default parameters
+with the INRIA training data. Let's see how it performs.
+
+    ./dpmdetect filelist.txt ../samples/pedestrian.m > result.txt
+    ./dpmvldtr.rb /Test/annotations result.txt
+
+The result is (at 0.8 threshold):
+
+    76.74% (49)
+
+Speed-wise:
+
+Let's time it on the INRIA dataset (288 images).
+
+    time ./dpmdetect filelist.txt ../samples/pedestrian.m
+
+On my laptop, it reports:
+
+    real 8m19.444s
+    user 8m15.187s
+    sys 0m3.332s
+
+OpenCV's HOG detector should be much faster because its algorithm is much simpler
+than DPM, but how fast is it?
+
+    real 1m55.861s
+    user 1m54.171s
+    sys 0m0.136s
+
+Their detector is about 4.34 times faster.
+
+How to train my own detector?
+-----------------------------
+
+Yes, this implementation comes with a tool to train your own detector too. In this
+chapter, I will go through how I trained the pedestrian.m detector that ships
+with the ccv source code. The CLI for the training program is in ./bin:
+
+    ./dpmcreate --help
+
+will show you the options it has.
+
+The nice part of training a pedestrian detector is that there is a good training
+dataset available today on the INRIA website .
+I use a small script, ./dpmext.rb, to extract INRIA-format bounding box data into
+the ccv format, which takes the following form:
+
+    x y width height \n
+
+I extracted that into the pedestrian.samples file:
+
+    ./dpmext.rb /Train/annotations/ > pedestrian.samples
+
+It comes with a negative dataset too:
+
+    find /Train/neg/ -name "*.png" > no-pedestrian.samples
+
+Make a working directory and you can start now:
+
+    ./dpmcreate --positive-list pedestrian.samples --background-list no-pedestrian.samples --negative-count 12000 --model-component 1 --model-part 8 --working-dir --base-dir /Train/pos/
+
+It takes about 3 days on my laptop to get meaningful data; unfortunately, the
+current implementation doesn't support OpenMP, so you have to be patient.
+
+Good luck!
+
+Other models?
+-------------
+
+I've trained one more mixture model: samples/car.m
+
+It has been trained with the VOC2011 trainval dataset, and the result on the validation dataset is:
+
+    46.19% (16) diff --git a/site/_posts/0000-01-01-doc-http.markdown b/site/_posts/0000-01-01-doc-http.markdown new file mode 100644 index 000000000..e2062648c --- /dev/null +++ b/site/_posts/0000-01-01-doc-http.markdown @@ -0,0 +1,111 @@
+---
+layout: page
+doc: ccv
+slug: http
+status: publish
+title: "HTTP: A REST-ful API"
+categories:
+- doc
+---
+
+[Library Reference: serve/*.c](/lib/ccv-serve/)
+
+How it works?
+-------------
+
+Go to ccv/serve. This functionality requires support from [libdispatch](http://libdispatch.macosforge.org/) and [libev](http://software.schmorp.de/pkg/libev). Luckily, these libraries are easy to install; for example, on Ubuntu 12.04 LTS, you can simply:
+
+    sudo apt-get install libdispatch-dev libev-dev
+
+and it is done. If you are on Mac OSX, you can simply:
+
+    brew install libev
+
+and it is done.
+
+On Mac OSX, you have to manually remove -ldispatch in ccv/serve/makefile; other than that, you are one 'make' away:
+
+    cd serve/ && make && ./ccv
+
+Now, it is up and running!
+
+How can I use it?
+-----------------
+
+The following chapters assume that you have a basic understanding of curl.
+
+The HTTP API, as it is now, only supports 5 major ccv functionalities: [BBF](/doc/doc-bbf), [DPM](/doc/doc-dpm), [SWT](/doc/doc-swt), [TLD](/doc/doc-tld) and [SIFT](/doc/doc-sift). All these APIs are discoverable; you can simply:
+
+    curl localhost:3350
+
+and it will return the list of API endpoints that you can navigate. Try one:
+
+    curl localhost:3350/dpm/detect.objects
+
+It returns:
+
+{:lang="json"}
+    {
+        "request":{
+            "interval":"integer",
+            "min_neighbors":"integer",
+            "model":"string",
+            "source":"blob",
+            "threshold":"number"
+        },
+        "response":[
+            {
+                "x":"integer",
+                "y":"integer",
+                "width":"integer",
+                "height":"integer",
+                "confidence":"number",
+                "parts":[
+                    {
+                        "x":"integer",
+                        "y":"integer",
+                        "width":"integer",
+                        "height":"integer",
+                        "confidence":"number"
+                    }
+                ]
+            }
+        ]
+    }
+
+All responses from ccv are JSON encoded, like the example above. In particular, the above JSON encodes what a POST request should look like, and what kind of JSON data structure you can expect in return. From the description, we know that we should encode the file into the source field, and specify which model to use:
+
+    curl -F source=@"pedestrian.png" -F model="pedestrian" localhost:3350/dpm/detect.objects
+
+The above query should give you a series of detected rectangles that denote pedestrians in the given image.
+
+ccv supports multipart-encoded parameters as well as query strings; the above query is equivalent to:
+
+    curl -F source=@"pedestrian.png" "localhost:3350/dpm/detect.objects?model=pedestrian"
+
+Or:
+
+    curl --data-binary @"pedestrian.png" "localhost:3350/dpm/detect.objects?model=pedestrian"
+
+Any 'source' parameter in the ccv HTTP API can be passed directly as the HTTP body.
+
+A more advanced example would be TLD. [Read more](/lib/ccv-serve).
+
+Under the hood?
+---------------
+
+On Life-cycle of a Request
+
+Whenever you issue an HTTP request, ccv receives the request with libev in an asynchronous fashion, then dispatches a processing function to another thread with libdispatch; when the data is ready, ccv dispatches back to the main event loop and sends the result back. Thus, requests to ccv won't block each other.
Although it is a silly benchmark, dummy GET requests to ccv's HTTP API endpoints can easily peak at around 25K requests per second, so you shouldn't worry too much about its HTTP performance (worry more about its computer vision algorithms' performance).
+
+On Error Messages
+
+ccv's HTTP endpoints don't provide informative error messages; if you issue a request that it cannot handle, it will return 400 with 'false' in its body. That may not be a problem for most of the API endpoints, but it would be for some advanced ones.
+
+On Parameters
+
+All of the HTTP API's parameters can be easily interpreted through the C API documentation; ccv chooses reasonable defaults from the start, so all of them are optional.
+
+On Security
+
+The HTTP API endpoints are not intended to be exposed to the public Internet; you should hide them behind a firewall. \ No newline at end of file diff --git a/site/_posts/0000-01-01-doc-icf.markdown b/site/_posts/0000-01-01-doc-icf.markdown new file mode 100644 index 000000000..a645e91c0 --- /dev/null +++ b/site/_posts/0000-01-01-doc-icf.markdown @@ -0,0 +1,114 @@
+---
+layout: page
+doc: ccv
+slug: icf
+status: publish
+title: "ICF: Integral Channel Features"
+categories:
+- doc
+---
+
+[Library Reference: ccv_icf.c](/lib/ccv-icf/)
+
+What's ICF?
+-----------
+
+The original paper refers to:
+
+Integral Channel Features, P. Dollar, Z. Tu, P. Perona, and S. Belongie, BMVC 2009
+
+The improved version refers to:
+
+Pedestrian Detection at 100 Frames per Second, R. Benenson, M. Mathias, R. Timofte, and L. Van Gool, CVPR 2012
+
+Seeking the Strongest Rigid Detector, R. Benenson, M. Mathias, R. Timofte, and L. Van Gool, CVPR 2013
+
+How it works?
+-------------
+
+This is a long story; you should read the original paper and the two follow-ups to get
+an idea of why ICF is the strongest rigid detector. Here is how ccv runs it:
+
+    ./icfdetect ../samples/pedestrian.icf | ./icfdraw.rb output.png
+
+Check out output.png; all pedestrians should have a red box on them.
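+
+The equivalent in C, sketched after bin/icfdetect.c; hedged: the cascade read / detect
+signatures below are an assumption based on this release's ccv.h, so check there before use:
+
+{:lang="c"}
+    #include <ccv.h>
+
+    int main(int argc, char** argv)
+    {
+        ccv_enable_default_cache();
+        // argv[1] is the image to run detection on
+        ccv_icf_classifier_cascade_t* cascade = ccv_icf_read_classifier_cascade("../samples/pedestrian.icf");
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(argv[1], &image, CCV_IO_ANY_FILE | CCV_IO_RGB_COLOR);
+        ccv_array_t* seq = ccv_icf_detect_objects(image, &cascade, 1, ccv_icf_default_params);
+        int i;
+        for (i = 0; i < seq->rnum; i++)
+        {
+            ccv_comp_t* comp = (ccv_comp_t*)ccv_array_get(seq, i);
+            printf("%d %d %d %d %f\n", comp->rect.x, comp->rect.y,
+                   comp->rect.width, comp->rect.height, comp->classification.confidence);
+        }
+        ccv_array_free(seq);
+        ccv_matrix_free(image);
+        ccv_icf_classifier_cascade_free(cascade);
+        return 0;
+    }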
+
+What about performance?
+-----------------------
+
+Speed-wise:
+
+ICF has two modes. One is presented in the original paper: resize the input into different
+scales, and then run the same classifier again and again on these resized inputs. The
+second is presented in the improved version: run multiple classifiers that are
+trained on different scales on the same input.
+
+The second approach is the faster alternative; unfortunately, I was unable to obtain
+reasonable recall / precision with it.
+
+Running in the first mode, on a computer with a Core i7 3770K, with the INRIA 2008 test set,
+the figures are:
+
+    real 2m19.18s
+    user 2m16.30s
+    sys 0m2.79s
+
+It is still slower than HOG, but faster than the DPM implementation in libccv.
+
+Accuracy-wise:
+
+The pedestrian.icf model provided in ./samples is trained with the INRIA 2008 training
+dataset, plus an additional 7,542 negative samples collected from VOC2011. The model is
+trained at size 31x74, with 6px margins on each side.
+
+The provided model is then tested with the INRIA 2008 test dataset; if the bounding box
+overlap is greater than 0.5 of the bigger bounding box, it is a true positive.
+The validation script is available at ./bin/icfvldtr.rb.
+
+    76.23% (52)
+
+This is roughly the same recall as the DPM implementation provided in ccv, with roughly
+the same number of false alarms too.
+
+How to train my own detector?
+-----------------------------
+
+ccv provides utilities to train your own object models. Specifically, for ICF, these
+utilities are available at ./bin/icfcreate and ./bin/icfoptimize.
+
+    ./icfcreate --help
+
+will show you the parameters that ccv supports when training an object model.
+
+If you have libdispatch installed and properly enabled on your machine, ccv will utilize
+all your CPU cores to speed up the training process.
+
+The INRIA pedestrian dataset can be downloaded from:
+
+    http://pascal.inrialpes.fr/data/human/
+
+The annotation format is substantially different from what ccv requires; I use this
+simple script to extract annotations from the INRIA dataset:
+
+    https://gist.github.com/liuliu/6349801
+
+You also want a collection of background (non-pedestrian) files; I combined
+data from both INRIA and VOC2011 to generate that list:
+
+    find ../data/negs/*.jpg > no-pedestrian.txt
+
+Once all these are ready and you have a PC with enough computational power:
+
+    ./icfcreate --positive-list pedestrian.icf_samples --background-list no-pedestrian.txt --validate-list pedestrian.icf_test --negative-count 10000 --positive-count 10000 --feature-size 50000 --weak-classifier-count 2000 --size 30x90 --margin 10,10,10,10 --working-dir icf-data --acceptance 0.7 --base-dir ../data/INRIAPerson/Train/pos/
+
+The classifier cascade will bootstrap 3 times, pooling from 50,000 features,
+and the final boosted classifier will have 2,000 weak classifiers. On the PC I
+am running (with an SSD / hard-drive hybrid (through flashcache), 32GiB memory and a Core
+i7 3770K), it takes a day to finish training one classifier. At minimum, you should
+have about 16GB of available memory for the program to finish running.
+
+The final-cascade file in your working directory is the classifier model file that
+you can use. Using ./bin/icfoptimize, you should be able to set proper soft cascading
+thresholds for the classifier to speed up detection:
+
+    ./icfoptimize --positive-list pedestrian.icf_test --classifier-cascade icf-data/final-cascade --acceptance 0.7 --base-dir ../data/INRIAPerson/Test/pos/ diff --git a/site/_posts/0000-01-01-doc-sift.markdown b/site/_posts/0000-01-01-doc-sift.markdown new file mode 100644 index 000000000..1cfbb6c4d --- /dev/null +++ b/site/_posts/0000-01-01-doc-sift.markdown @@ -0,0 +1,49 @@
+---
+layout: page
+doc: ccv
+slug: sift
+status: publish
+title: "SIFT: Scale Invariant Feature Transform"
+categories:
+- doc
+---
+
+[Library Reference: ccv_sift.c](/lib/ccv-sift/)
+
+What's SIFT?
+------------
+
+The SIFT paper refers to:
+Distinctive Image Features from Scale-Invariant Keypoints, David G. Lowe
+
+The current implementation in ccv was largely influenced by VLFeat:
+http://www.vlfeat.org/
+
+How to run the sample program?
+------------------------------
+
+There is a sample program under bin/siftmatch at your disposal; to run it,
+simply type:
+
+    ./siftmatch ../samples/book.png ../samples/scene.png
+
+The output may not be the most interesting thing for you. Want to see some images?
+There is a siftdraw.rb script to do that; pipe the command:
+
+    ./siftmatch ../samples/book.png ../samples/scene.png | ./siftdraw.rb ../samples/book.png ../samples/scene.png output.png
+
+Check out output.png; there are interesting lines between the book and the scene.
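+
+Programmatically, extracting keypoints and descriptors is one call to ccv_sift. A minimal
+sketch using the same parameter values as bin/siftmatch.c (hedged: treat the struct fields
+as this release's defaults and check ccv.h for the exact definitions):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    int main(int argc, char** argv)
+    {
+        // argv[1] is the image to extract SIFT features from
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(argv[1], &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+        ccv_sift_param_t params = {
+            .noctaves = 3,
+            .nlevels = 6,
+            .up2x = 1,
+            .edge_threshold = 10,
+            .norm_threshold = 0,
+            .peak_threshold = 0,
+        };
+        ccv_array_t* keypoints = 0;
+        ccv_dense_matrix_t* desc = 0; // one 128-dimensional descriptor row per keypoint
+        ccv_sift(image, &keypoints, &desc, 0, params);
+        printf("%d keypoints\n", keypoints->rnum);
+        ccv_array_free(keypoints);
+        ccv_matrix_free(desc);
+        ccv_matrix_free(image);
+        return 0;
+    }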
+
+There is a way to show an even more amazing result, but with a little external help from
+a program called homest (http://www.ics.forth.gr/~lourakis/homest/); it may
+require the levmar package (http://www.ics.forth.gr/~lourakis/levmar/) as well.
+Compile homest until you get the homest_demo binary somewhere, and pipe the command
+like this:
+
+    ./siftmatch ../samples/book.png ../samples/scene.png | ./siftdraw.rb ../samples/book.png ../samples/scene.png output.png /homest_demo
+
+You see, somehow SIFT recognized the book in the scene. Amazing, eh?
+
+I haven't decided yet if I need to include functions like ccv_find_homography
+in a future release; homest is a good research package, but for industrial use I have
+some doubts. diff --git a/site/_posts/0000-01-01-doc-swt.markdown b/site/_posts/0000-01-01-doc-swt.markdown new file mode 100644 index 000000000..7096de1b6 --- /dev/null +++ b/site/_posts/0000-01-01-doc-swt.markdown @@ -0,0 +1,67 @@
+---
+layout: page
+doc: ccv
+slug: swt
+status: publish
+title: "SWT: Stroke Width Transform"
+categories:
+- doc
+---
+
+[Library Reference: ccv_swt.c](/lib/ccv-swt/)
+
+*(This documentation is still largely work in progress, use with caution)*
+
+What's SWT?
+-----------
+
+The original paper refers to: Stroke Width Transform, Boris Epshtein, Yonathan Wexler,
+and Eyal Ofek 2010.
+
+How it works?
+-------------
+
+It is a long story; as always, please read their paper. SWT tries to capture the
+features unique to text, using the geometric signature of text to filter out
+non-text areas. As a result, SWT gives you reliable text regions that are language
+neutral. Try it yourself:
+
+    ./swtdetect | ./swtdraw.rb output.png
+
+Check out output.png; luckily, the text area is labeled.
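+
+The C entry point is a single call. A minimal sketch following bin/swtdetect.c
+(ccv_swt_default_params and the returned array of rectangles are per this release's ccv.h):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    int main(int argc, char** argv)
+    {
+        // argv[1] is the image to run text detection on
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(argv[1], &image, CCV_IO_GRAY | CCV_IO_ANY_FILE);
+        // returns an array of ccv_rect_t, one per detected text region
+        ccv_array_t* words = ccv_swt_detect_words(image, ccv_swt_default_params);
+        int i;
+        for (i = 0; i < words->rnum; i++)
+        {
+            ccv_rect_t* rect = (ccv_rect_t*)ccv_array_get(words, i);
+            printf("%d %d %d %d\n", rect->x, rect->y, rect->width, rect->height);
+        }
+        ccv_array_free(words);
+        ccv_matrix_free(image);
+        return 0;
+    }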
+
+What about performance?
+-----------------------
+
+SWT is quite fast. SWT without scale-invariant support (multi-scale) runs
+on a 640x480 photo in well under 50 milliseconds on my laptop. Extending SWT
+to multi-scale increases the accuracy by about 10%, at the cost of about 2~4 times longer
+running time.
+
+Accuracy-wise:
+
+ccv's SWT implementation on the ICDAR 2003 dataset achieves performance similar to
+what Epshtein et al. reported in their paper; namely, with the old measurement
+method described in the ICDAR 2003 contest, ccv's implementation was able to achieve a
+precision rate of 66% and a recall rate of 59% (the numbers reported in the paper are a
+precision rate of 73% and a recall rate of 60%).
+
+However, these results are quite out-dated, and by using the [ICDAR 2011 dataset](http://robustreading.opendfki.de/wiki/SceneText),
+a more meaningful comparison is possible.
+
+With ccv's scale-invariant SWT implementation, and a parameter search on ICDAR
+2011's training dataset, I was able to achieve:
+
+    precision: 59%
+    recall: 61%
+    harmonic mean: 60%
+
+This would rank around 2nd to 3rd place in the chart. Please note that the other
+methods in the comparison are language specific; they were trained with additional
+character shape information using SVM or Adaboost, whereas SWT is language neutral
+and doesn't use any language-specific features.
+
+Speed-wise:
+
+How can I adopt SWT for my application?
+--------------------------------------- \ No newline at end of file diff --git a/site/_posts/0000-01-01-doc-tld.markdown b/site/_posts/0000-01-01-doc-tld.markdown new file mode 100644 index 000000000..21c0757a8 --- /dev/null +++ b/site/_posts/0000-01-01-doc-tld.markdown @@ -0,0 +1,110 @@
+---
+layout: page
+doc: ccv
+slug: tld
+status: publish
+title: "TLD: Track Learn Detect"
+categories:
+- doc
+---
+
+[Library Reference: ccv_tld.c](/lib/ccv-tld/)
+
+What's TLD?
+-----------
+
+This algorithm, also known as the "Predator" algorithm, was developed by Zdenek Kalal. For
+more information, please visit his homepage:
+
+How it works?
+-------------
+
+This is a long story; please read Zdenek's paper. Here is how it works on the command line,
+if you compiled ccv with FFMPEG support:
+
+    ./tld x y width height
+
+It will output the tracking coordinates for each frame.
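+
+Inside, the tracker follows a simple new / track / free life cycle. A hedged sketch of the C
+loop, modeled on bin/tld.c but with the FFMPEG decoding replaced by a hypothetical
+next_frame() stub (the ccv_tld_new / ccv_tld_track_object signatures are an assumption based
+on this release's ccv.h):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    // hypothetical helper: decode the next video frame into a ccv matrix, 0 at end of stream
+    extern ccv_dense_matrix_t* next_frame(void);
+
+    int main(void)
+    {
+        int x = 10, y = 10, width = 80, height = 120; // initial bounding box (placeholder values)
+        ccv_dense_matrix_t* prev = next_frame();
+        ccv_tld_t* tld = ccv_tld_new(prev, ccv_rect(x, y, width, height), ccv_tld_default_params);
+        ccv_dense_matrix_t* frame;
+        while ((frame = next_frame()))
+        {
+            ccv_tld_info_t info;
+            // track from the previous frame into the current one
+            ccv_comp_t box = ccv_tld_track_object(tld, prev, frame, &info);
+            printf("%d %d %d %d\n", box.rect.x, box.rect.y, box.rect.width, box.rect.height);
+            ccv_matrix_free(prev);
+            prev = frame;
+        }
+        ccv_matrix_free(prev);
+        ccv_tld_free(tld);
+        return 0;
+    }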
+
+What about performance?
+-----------------------
+
+TLD is implemented closely after Zdenek's paper, but it still varies significantly in quite a few
+aspects. I've done extensive tests to make sure the performance, in
+terms of accuracy and speed, matches the original implementation.
+
+Accuracy-wise:
+
+TLD uses a randomized algorithm, thus the results can vary from run to run.
+I ran ccv's TLD implementation on the test videos with "rotation == 0" and
+default parameters. With 3 runs, picking the median, I was able to generate
+some meaningful data to analyze.
+
+On motocross.mpg:
+
+    detections : 901
+    true detections : 1412
+    correct detections : 833
+    precision : 0.924528
+    recall : 0.589943
+    f-measure : 0.720277
+
+The result on the same video reported in: Zdenek Kalal, Jiri Matas and Krystian Mikolajczyk, Online Learning of Robust Object Detectors during Unstable Tracking:
+
+    precision : 0.96
+    recall : 0.54
+
+On pedestrian3.mpg:
+
+    After 69th frame failed to recover (out of 140 frames)
+
+The result on the same video reported in: Zdenek Kalal, Jiri Matas and Krystian Mikolajczyk, P-N Learning: Bootstrapping Binary Classifiers by Structural Constraints:
+
+    After 27th frame failed to recover (out of 140 frames)
+
+Note that in a few runs I got results that outperformed Zdenek's implementation,
+but I chose to ignore those.
+
+All these results were obtained with alantrrs' evaluate_vis.py script in and the dataset in
+that repository. Thanks alantrrs!
+
+Speed-wise:
+
+By enabling the "rotation" technique, you can achieve near real-time performance on QVGA
+video, with minor accuracy loss. With "rotation == 1" (the default parameter), TLD
+spends around 15ms on tracking, 50ms on detecting, and 50ms on learning for 320x240
+video on a single thread of an i7-2620M at 2.7GHz.
+
+Under the hood?
+---------------
+
+ccv's TLD implementation varies from Zdenek's original Matlab implementation in
+several significant ways:
+
+**1). Tracking:**
+
+Zdenek's implementation uses a smaller LK window for computation (5x5), whereas
+ccv's implementation uses a 15x15 window.
+
+**2). Ferns Detection (Random Forest):**
+
+Zdenek's implementation uses a random forest for object detection (in short, the
+probabilities for each feature are added up), whereas ccv's implementation uses ferns
+for object detection (using multiplication of probabilities, a.k.a. a semi-naive
+Bayes classifier). To compensate for this choice, ccv's implementation uses 40 ferns
+with 18 features each (the default parameters), and the default
+ferns threshold for ccv's implementation is 0.
+
+**3). Nearest-neighbor Classifier:**
+
+Zdenek's implementation uses aspect-ratio-normalized examples (15x15); these
+examples are normalized so that a simple multiplication can yield the correlation confidence.
+ccv's implementation uses aspect-aware examples (constrained to an area size of 400);
+examples are left as they are, and a normalized coefficient computation yields the
+confidence score.
+
+**4). Pseudo-random Number Generator:**
+
+Zdenek's implementation uses srand() for random number generation, and seeds it
+with 0. ccv's implementation uses a Mersenne-Twister random number generator with
+an environment-dependent seed. \ No newline at end of file diff --git a/site/_posts/2010-02-06-call-for-a-new-lightweight-c-based-computer-vision-library.markdown b/site/_posts/2010-02-06-call-for-a-new-lightweight-c-based-computer-vision-library.markdown new file mode 100644 index 000000000..8f20197e0 --- /dev/null +++ b/site/_posts/2010-02-06-call-for-a-new-lightweight-c-based-computer-vision-library.markdown @@ -0,0 +1,21 @@
+---
+date: '2010-02-06 14:10:48'
+layout: post
+slug: call-for-a-new-lightweight-c-based-computer-vision-library
+status: publish
+title: Call for a new, lightweight, c-based computer vision library
+categories:
+- post
+---
+
+Since 2005, most of my computer vision work has been done with OpenCV. It is an amazingly hand-optimized piece of software. A large number of modern applications are based on the OpenCV framework. It is a useful toolset. However, after all these years, I finally feel the need to make a more lightweight, pure-C, function-based library. There are some ideas:
+
+1. It should be fast. There is no need to build a toolset that is slow. Prior work such as lapack and gsl are better choices than reinventing the wheel. For the same reason, it is necessary to fork basic routines from OpenCV, such as the Canny detector, Kalman filter, etc.
+
+2. Better memory management; cache everything. OpenCV partially implemented a memory management routine, but failed to have a cache mechanism, partly because there are too many functions and it is hard to break in and add another layer.
+
+3. Fewer, but more modern algorithms. Implement a few niche algorithms and give intuitive examples. Keep compatibility with OpenCV (through interpreting functions).
+
+4. Give some love to distributed systems, and to modern compilers (LLVM & Clang).
+
+I am aware that many vision libraries never made their way to the masses (VLFeat, for example), but that's the plan. diff --git a/site/_posts/2011-05-03-application-driven-philosophy.markdown b/site/_posts/2011-05-03-application-driven-philosophy.markdown new file mode 100644 index 000000000..77c4ff036 --- /dev/null +++ b/site/_posts/2011-05-03-application-driven-philosophy.markdown @@ -0,0 +1,13 @@
+---
+date: '2011-05-03 10:05:22'
+layout: post
+slug: application-driven-philosophy
+status: publish
+title: Application driven philosophy
+categories:
+- post
+---
+
+In the kick-off statement of ccv, I listed one of its properties as "modern", which means that rather than providing a truck-load of obsolete algorithms, ccv intends to provide best-of-its-kind algorithms across a wide range of applications. Last September, I even went further and claimed that the first 4 applications for ccv would be: 1). object matching; 2). object detection; 3). text detection; 4). 3d reconstruction. These statements set the tone for ccv development, now known as application-driven.
+
+There is a lot of evidence in the ccv code base of this method in action. ccv_sample_down was implemented when I was implementing BBF object detection, which requires an image pyramid. However, ccv_sample_up was not implemented until the SIFT implementation needed to up-sample the image in order to get better results. To this day, a very common feature for image processing, known as rescaling, is not fully implemented: the ccv_resample function still lacks a scale-up option, because in all the applications I've implemented, there is no need for it.
diff --git a/site/_posts/2012-06-18-an-elephant-in-the-room.markdown b/site/_posts/2012-06-18-an-elephant-in-the-room.markdown new file mode 100644 index 000000000..016338d9a --- /dev/null +++ b/site/_posts/2012-06-18-an-elephant-in-the-room.markdown @@ -0,0 +1,17 @@
+---
+date: '2012-06-18 23:01:00'
+layout: post
+slug: an-elephant-in-the-room
+status: publish
+title: An elephant in the room
+categories:
+- post
+---
+
+There is an elephant in the room. Why go through all these hassles when there is [OpenCV](http://opencv.org/)? Well, OpenCV is solid and well-crafted software. But after these years, there are quite a few things it has failed to address, and these things are exactly what ccv prioritizes.
+
+OpenCV is known for its hand-crafted, low-level, classic computer vision algorithms. Its Canny filter, Kalman filter, or LK tracker are the best optimized for real-world tasks. Its high-level computer vision algorithms, such as SIFT, SURF or the cutting-edge fern-based feature point detectors, are great in quality.
+
+However, it, especially with the newest C++ interface, shows signs of becoming a full-featured framework. To accomplish that, it needs a big build system, and carefully modified scripts to fit into mobile and server environments. In the end, it becomes harder to modify and adapt.
+
+ccv chose a different path. It strives to be a drop-in statically-linked library. To minimize the code base, it gives up non-essential functionality aggressively. It is not a library for you to experiment with different algorithms. It is a library for you to use in your applications. \ No newline at end of file diff --git a/site/_posts/2012-06-29-introducing-ccv-milestone.markdown b/site/_posts/2012-06-29-introducing-ccv-milestone.markdown new file mode 100644 index 000000000..266ce1286 --- /dev/null +++ b/site/_posts/2012-06-29-introducing-ccv-milestone.markdown @@ -0,0 +1,31 @@
+---
+date: '2012-06-29 22:37:00'
+layout: post
+slug: introducing-ccv-milestone
+status: publish
+title: Introducing ccv, reached 0.1 milestone
+categories:
+- post
+---
+
+It has been two years since the first commit. Two years ago, I [promised](/post/application-driven-philosophy/) 1). an object detection algorithm; 2). a text detection algorithm; 3). a feature point algorithm; and 4). a 3d reconstruction algorithm. With 0.1, there is:[^1]
+
+ 1). [a very fast detection algorithm for rigid objects:](/doc/doc-bbf)
+
+ ![bbf](/photo/2012-06-29-face.png)
+
+ 2). [an accurate object detection algorithm for somewhat difficult objects:](/doc/doc-dpm)
+
+ ![dpm](/photo/2012-06-29-pedestrian.png)
+
+ 3). [a state-of-the-art text detection algorithm:](/doc/doc-swt)
+
+ ![swt](/photo/2012-06-29-text.png)
+
+ 4). [the long-standing feature point detection algorithm:](/doc/doc-sift)
+
+ ![sift](/photo/2012-06-29-sift.png)
+
+Have fun!
+
+[^1]: _all images are generated without post-processing_ \ No newline at end of file diff --git a/site/_posts/2012-10-02-ccv-0.2-is-almost-there.markdown b/site/_posts/2012-10-02-ccv-0.2-is-almost-there.markdown new file mode 100644 index 000000000..e3a903978 --- /dev/null +++ b/site/_posts/2012-10-02-ccv-0.2-is-almost-there.markdown @@ -0,0 +1,28 @@
+---
+date: '2012-10-02 22:17:00'
+layout: post
+slug: ccv-0.2-almost-there
+status: publish
+title: ccv 0.2 is almost there
+categories:
+- post
+---
+
+In the next 2 days, I will cut a new branch from unstable to replace the current stable
+branch, and announce the 0.2 release.
+
+0.2 is supposed to be a bug-fix release; thus, you shouldn't expect significant new
+features in it, but whatever, here is what's new:
+
+1). thread-safety, by making the application-wide cache thread-local;
+
+2). support for ccv_read from memory;
+
+3). a preliminary MSER implementation;
+
+4). more concise documentation and improvements on SWT;
+
+5). DPM supports real mixture model creation, with a new car.m model in ./samples/.
+
+It has been 3 months since the last release, and hopefully, after this release,
+the upcoming schedule will be more regular. \ No newline at end of file diff --git a/site/_posts/2012-10-04-ccv-cut-the-0.2-stable-and-whats-new.markdown b/site/_posts/2012-10-04-ccv-cut-the-0.2-stable-and-whats-new.markdown new file mode 100644 index 000000000..f6448c19a --- /dev/null +++ b/site/_posts/2012-10-04-ccv-cut-the-0.2-stable-and-whats-new.markdown @@ -0,0 +1,42 @@
+---
+date: '2012-10-04 23:56:00'
+layout: post
+slug: ccv-cut-the-0.2-stable-and-whats-new
+status: publish
+title: ccv cut the 0.2 stable, and what's new
+categories:
+- post
+---
+
+I cut the stable branch off unstable yesterday, and an r0.2-rc branch as well.
+This marks the 0.2 version of ccv as finally here.
+
+ccv 0.2 is expected to be a bug-fix release, and should address several "obvious"
+problems left after the first release. Thus, two important issues get addressed
+in this release:
+
+1). thread-safety. The first released version of ccv is not thread-safe when
+the application-wide cache is enabled, because access and mutation on this cache are
+not thread-safe. In this release, the application-wide cache becomes thread-local,
+and thus is thread-safe now.
+
+2). reading in-memory data. It is always possible to create a ccv_dense_matrix_t
+yourself and fill it with in-memory data, but the lack of a canonical way
+to read in-memory data into ccv seemed problematic to the community. In this
+release, ccv_read takes more options: one of them is to read in-memory
+self-describing data (such as an in-memory JPEG or PNG), another is to read plain
+RGB-formatted bytes directly. [read more](/lib/ccv-io/).
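+
+A minimal sketch of the new in-memory read path (assuming a complete JPEG or PNG already
+sits in a buffer; CCV_IO_ANY_STREAM asks ccv_read to sniff the format, and the trailing size
+argument is part of ccv_read's variadic form; see ccv_io.c for the exact details):
+
+{:lang="c"}
+    #include <ccv.h>
+
+    // buf holds a complete JPEG or PNG file in memory, size is its byte count
+    ccv_dense_matrix_t* decode_from_memory(const void* buf, int size)
+    {
+        ccv_dense_matrix_t* image = 0;
+        ccv_read(buf, &image, CCV_IO_ANY_STREAM | CCV_IO_GRAY, size);
+        return image;
+    }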
+
+In 0.2, DPM finally can train multi-component models (a.k.a. the "real" mixture
+model), and I've included a car model for you to try out:
+
+![car](/photo/2012-10-04-car.png)
+
+Largely still *work-in-progress*, SWT gained a swtcreate tool for you to
+tune SWT parameters to fit your use cases, and the text segregation gets better
+too:
+
+![text-break](/photo/2012-10-04-text-break.png)
+
+The last 3 months have been "bumpy" for me, and after the summer school in
+Berkeley, I should be able to release ccv more regularly. Hope you enjoy. diff --git a/site/_posts/2012-11-10-ccv-now-has-a-state-of-art-tracking-algorithm.markdown b/site/_posts/2012-11-10-ccv-now-has-a-state-of-art-tracking-algorithm.markdown new file mode 100644 index 000000000..96b8fa23b --- /dev/null +++ b/site/_posts/2012-11-10-ccv-now-has-a-state-of-art-tracking-algorithm.markdown @@ -0,0 +1,19 @@
+---
+date: '2012-11-10 14:42:00'
+layout: post
+slug: ccv-now-has-a-state-of-art-tracking-algorithm
+status: publish
+title: ccv now has a state-of-art tracking algorithm
+categories:
+- post
+---
+
+In the next few minutes, I will cut the 0.3-rc1 branch, and that will put the 0.3 version of ccv out the door (well, not exactly, because all the development on the unstable branch is public). Ever since the 0.1 version, I've consciously tried to focus on different areas for each cycle: mainly, an odd version should be a version with new features, and an even version should be a version with performance improvements, bug fixes and renovated API design. It is exciting for me to unleash the 0.3 version, with a major feature included: a tracking algorithm.
+
+From the beginning, ccv has focused on implementing modern computer vision algorithms in a well-engineered way, e.g. algorithms that are applicable in wide areas and are state-of-the-art in mainstream computer vision research. That's why ccv was the first to implement BBF in open source, and the first to implement DPM (both training and detection) in C. Now, ccv is the first to implement the famous long-term tracking algorithm, TLD (a.k.a. the ["Predator" algorithm](http://info.ee.surrey.ac.uk/Personal/Z.Kalal/tld.html)), in C.
+
+
+
+See more discussions and experiments on [TLD: Track Learn Detect](/doc/doc-tld).
+
+Thanks to [Zdenek Kalal](http://info.ee.surrey.ac.uk/Personal/Z.Kalal/) for sharing his research with the rest of the world. \ No newline at end of file diff --git a/site/_posts/2012-11-30-ccv-talks-http.markdown b/site/_posts/2012-11-30-ccv-talks-http.markdown new file mode 100644 index 000000000..65f2e48e3 --- /dev/null +++ b/site/_posts/2012-11-30-ccv-talks-http.markdown @@ -0,0 +1,19 @@
+---
+date: '2012-11-30 22:35:00'
+layout: post
+slug: ccv-talks-http
+status: publish
+title: In 0.4, ccv talks HTTP
+categories:
+- post
+---
+
+Even-numbered ccv releases often come with bug fixes and API renovation. In the next two days, I am going to cut the ccv 0.4 release, which brings you a major API renovation: an API over HTTP.
+
+From the beginning, ccv has striven to be an easy-to-use computer vision library, inspired by two use cases: 1). a server-side library that can be integrated into core infrastructure; 2). a client-side library for embedded devices that is portable and runs reasonably fast on most platforms.
+
+There are conflicts between the two use cases, but surprisingly, there is more in common. For example, both environments require a library that is easy to drop in (with few dependencies) and easy to compile (happily being statically linked). There are differences too: for example, on the server side, functionalities in ccv may mainly be invoked remotely from another language, such as Python, Ruby or JavaScript (with Node.js).
+
+It is important for ccv to be easily invokable; however, it is impractical to maintain several high-quality language bindings. The compromise in ccv is to expose its functionality through a universally supported protocol: HTTP. Although HTTP is chatty and often inefficient, most functionalities in ccv are CPU intensive anyway; thus, the particular choice of protocol is unlikely to be the bottleneck. Besides, it maps well onto ccv's function-driven interface.
+
+[Read more about how to use ccv over HTTP ›](/doc/doc-http) diff --git a/site/_posts/2013-09-02-ccv-0.5-with-a-new-pedestrian-detector.markdown b/site/_posts/2013-09-02-ccv-0.5-with-a-new-pedestrian-detector.markdown new file mode 100644 index 000000000..6021fbec3 --- /dev/null +++ b/site/_posts/2013-09-02-ccv-0.5-with-a-new-pedestrian-detector.markdown @@ -0,0 +1,19 @@
+---
+date: '2013-09-02 22:35:00'
+layout: post
+slug: ccv-0.5-with-a-new-pedestrian-detector
+status: publish
+title: ccv 0.5 with a new pedestrian detector
+categories:
+- post
+---
+
+It has been half a year since the last release of ccv. I haven't kept my word about releasing a new version of ccv every four weeks. The colorful spring and energetic summer are playful. Hopefully, during the depressing fall and bitter winter, I can keep up the pace.
+
+Odd-numbered releases of ccv include new features. In this version, ccv ships with a preliminary version of the [ICF (Integral Channel Features) implementation](/doc/doc-icf). ICF has been advertised as the strongest rigid object detector. In practice, a ccv-trained ICF-based pedestrian detector can achieve accuracy similar to DPM while being 3 times faster.
+
+Because this is the first release of ccv after the implementation of its [HTTP interface](/doc/doc-http), I should mention that the HTTP interface will be kept up to date with the newest features in the ccv core library. In this particular case, the HTTP interface supports ICF out of the box.
+
+Thanks to Rodrigo Benenson for patiently and promptly replying to several questions I had on the ICF implementation.
+
+![hack-square](/photo/2013-09-02-hack-square.png) diff --git a/site/_posts/2014-03-27-ccv-0.6-open-sources-near-state-of-the-art-image-classifier-under-creative-commons.markdown b/site/_posts/2014-03-27-ccv-0.6-open-sources-near-state-of-the-art-image-classifier-under-creative-commons.markdown new file mode 100644 index 000000000..031989dc1 --- /dev/null +++ b/site/_posts/2014-03-27-ccv-0.6-open-sources-near-state-of-the-art-image-classifier-under-creative-commons.markdown @@ -0,0 +1,37 @@
+---
+date: '2014-03-27 07:35:00'
+layout: post
+slug: ccv-0.6-open-sources-near-state-of-the-art-image-classifier-under-creative-commons
+status: publish
+title: ccv 0.6 open sources near state-of-the-art image classifier under Creative Commons
+categories:
+- post
+---
+
+In previous posts, I mentioned that even-numbered releases would be bugfix releases. However, 0.6 is a bit different.
+
+![now-go-back-and-play-forza](/photo/2014-03-27-dex.png)
+
+For the past one and a half years, deep learning, particularly deep convolutional neural network based image classification, has made waves in the vision community. For a library aiming at providing state-of-the-art implementations, it would be frustrating not to have a competent image classifier implemented over a year after the ground-breaking work was published. In the meantime, a few open source libraries provide complete ([Caffe](http://caffe.berkeleyvision.org/)) / incomplete ([OverFeat](http://cilvr.nyu.edu/doku.php?id=software:overfeat:start), [cuda-convnet](http://code.google.com/p/cuda-convnet/)) implementations of the said image classifier. However, all of them focus on research-related activities (see their licenses: [1](http://caffe.berkeleyvision.org/getting_pretrained_models.html), [2](https://github.com/sermanet/OverFeat/blob/master/LICENSE)). Thus, for the past 5 months, I've been working on an image classifier in ccv with a deep convolutional neural network.
+
+This version of ccv distributes an image classifier trained with the ILSVRC 2010 data set of 1000 classes, with a top-1 missing rate of 36.83% and a top-5 missing rate of 16.25%, thus close to the state-of-the-art image classifier (Clarifai in ILSVRC 2013 gets a top-5 missing rate of 11.19%: ; the 16.25% top-5 missing rate is reproduced with the ILSVRC 2010 test data set, which is known to be much less challenging). [See more about this image classifier](/doc/doc-convnet).
+
+The license for the data models provided in ccv's ./samples (the said image classifier, plus the pedestrian detectors, the car detector and the face detector) is changed from the [BSD 3-clause license](https://raw.github.com/liuliu/ccv/unstable/COPYING) to the [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/), in the hope that this clarified license will help the adoption of these trained data models.
+
+As always, the new image classifier is available through ccv's [RESTful interface](/doc/doc-http) at http://localhost:3350/convnet/classify. You can also play with ccv at
+
+Since this version is an anomaly in terms of release cycle, the next two versions of ccv will be devoted to bugfixes and performance improvements. There is also a plan to enter ILSVRC 2014 and publish results on FDDB and LFW, for the sake of keeping ccv's implementation fresh and competitive.
+
+Other changes / bugfixes in ccv 0.6:
+
+1). Moved from a hand-written configure script to autoconf (which still provides link / flag information);
+
+2). is online to monitor builds for the unstable branch and is free of static analyzer reports: ;
+
+3). Fixed a bug in ./serve/ccv where the returned HTTP header claimed to be 1.1 but the connection was never kept open;
+
+4). The RESTful interface for SWT added Tesseract (OCR) support;
+
+5). Fixed an ICF implementation problem with non-standard floating point representations;
+
+6). Fixed a multi-thread bug with fftw3 usage; diff --git a/site/_posts/2014-04-25-closing-the-gap-between-open-source-and-proprietary.markdown b/site/_posts/2014-04-25-closing-the-gap-between-open-source-and-proprietary.markdown new file mode 100644 index 000000000..ce1cbf7f8 --- /dev/null +++ b/site/_posts/2014-04-25-closing-the-gap-between-open-source-and-proprietary.markdown @@ -0,0 +1,19 @@
+---
+date: '2014-04-25 23:15:00'
+layout: post
+slug: closing-the-gap-between-open-source-and-proprietary
+status: publish
+title: closing the gap between open source and proprietary
+categories:
+- post
+---
+
+In the 0.6 release, ccv's deep-learning-based image classifier achieved a 16.26% top-5 missing rate on ImageNet 2010. However, the state of the art uses the ImageNet 2012 data set as the standard, so any comparison was apples to oranges.
+
+Over the past 3 weeks, I was able to obtain the ImageNet 2012 dataset, and therefore do an apples-to-apples comparison with the state of the art.
+
+The data model newly trained on ImageNet 2012 was able to obtain a 16.22% top-5 missing rate on the ImageNet 2012 dataset, which is about 3% better than [Caffe](http://caffe.berkeleyvision.org/)'s implementation, and about 0.55% shy of the 1-convnet implementation from [OverFeat](http://cilvr.nyu.edu/doku.php?id=software:overfeat:start). This implementation is still 5% behind the state-of-the-art [Clarifai](http://www.clarifai.com) though.
+
+This is a good step towards closing the gap between open source implementations and proprietary ones.
+ +![dont-be-too-cute-dex](/photo/2014-04-25-dex.png) diff --git a/site/doc/index.html b/site/doc/index.html new file mode 100644 index 000000000..57e2ac64a --- /dev/null +++ b/site/doc/index.html @@ -0,0 +1,12 @@ +--- +layout: default +is_index: true +--- + +

Documentation

    +{% for post in site.posts %} + {% if post.doc %} +
  • {{ post.title }}
  • + {% endif %} +{% endfor %} +

‹  back 

diff --git a/site/index.html b/site/index.html new file mode 100644 index 000000000..3e88a5e3f --- /dev/null +++ b/site/index.html @@ -0,0 +1,53 @@ +--- +layout: default +is_index: true +--- +{% for news in site.posts limit:1 %} +

{{ news.title }} ›

+{% endfor %} +

Intro

+

Around 2010, when Lian and I were working on our gesture recognition demo, out
+of frustration with abstracting redundant image preprocessing operations into a
+set of clean and concise functions, I started to consider moving away from the
+stack. Why? Well, after two years, ccv is the very answer.

+

Cached Image Preprocessing

+

Many computer vision tasks nowadays consist of quite a few preprocessing
+layers: image pyramid generation, color space conversion etc. These potentially
+redundant operations cannot be easily eliminated within a mature API. ccv
+provides a built-in cache mechanism that, while maintaining a clean function
+interface, effectively does transparent caching for you.
+ - How?

+

Easy to Embed

+

While it depends on quite a few libraries for the best performance and
+complete features, the majority of ccv's functionality will still work without these
+libraries. You can even drop the ccv source code into your project, and it will
+work!

+

Modern Computer Vision Algorithms

+

One core concept of ccv development is "application driven". As a result, ccv
+ends up implementing a handful of state-of-the-art algorithms. It includes
+a very fast detection algorithm for rigid objects
+(faces etc.), a strong rigid object detection algorithm
+(pedestrians etc.), an accurate object detection algorithm
+for somewhat difficult objects (cars, cats etc.), a
+deep-learning-based near state-of-the-art image classifier, a
+state-of-the-art text detection algorithm, a long-term
+object tracking algorithm, and the long-standing
+feature point detection algorithm.

+

For the computer vision community, there is no shortage of good algorithms; good
+implementations are what it lacks. For years, we have been stuck between the
+high-performance, battle-tested but old algorithm implementations, and the new,
+shiny but Matlab-only algorithms. ccv is my take on this problem; hope you enjoy
+it.

+

License

+

ccv source code is distributed under BSD 3-clause License.

+

ccv's data models and documentations are distributed under Creative Commons Attribution 4.0 International License.

+

Getting Started

+

Documentation

+

Library Reference

+

Dated Posts

diff --git a/site/javascripts/scale.fix.js b/site/javascripts/scale.fix.js new file mode 100644 index 000000000..08716c006 --- /dev/null +++ b/site/javascripts/scale.fix.js @@ -0,0 +1,20 @@ +fixScale = function(doc) { + + var addEvent = 'addEventListener', + type = 'gesturestart', + qsa = 'querySelectorAll', + scales = [1, 1], + meta = qsa in doc ? doc[qsa]('meta[name=viewport]') : []; + + function fix() { + meta.content = 'width=device-width,minimum-scale=' + scales[0] + ',maximum-scale=' + scales[1]; + doc.removeEventListener(type, fix, true); + } + + if ((meta = meta[meta.length - 1]) && addEvent in doc) { + fix(); + scales = [.25, 1.6]; + doc[addEvent](type, fix, true); + } + +}; \ No newline at end of file diff --git a/site/lib/index.html b/site/lib/index.html new file mode 100644 index 000000000..e5582194e --- /dev/null +++ b/site/lib/index.html @@ -0,0 +1,12 @@ +--- +layout: default +is_index: true +--- + +

Library Reference

‹  back 

diff --git a/site/photo/2012-06-29-face.png b/site/photo/2012-06-29-face.png new file mode 100644 index 000000000..af0a5e6f1 Binary files /dev/null and b/site/photo/2012-06-29-face.png differ diff --git a/site/photo/2012-06-29-pedestrian.png b/site/photo/2012-06-29-pedestrian.png new file mode 100644 index 000000000..d00217cf9 Binary files /dev/null and b/site/photo/2012-06-29-pedestrian.png differ diff --git a/site/photo/2012-06-29-sift.png b/site/photo/2012-06-29-sift.png new file mode 100644 index 000000000..9fbb26e70 Binary files /dev/null and b/site/photo/2012-06-29-sift.png differ diff --git a/site/photo/2012-06-29-text.png b/site/photo/2012-06-29-text.png new file mode 100644 index 000000000..ebcb4a039 Binary files /dev/null and b/site/photo/2012-06-29-text.png differ diff --git a/site/photo/2012-10-04-car.png b/site/photo/2012-10-04-car.png new file mode 100644 index 000000000..6058bffeb Binary files /dev/null and b/site/photo/2012-10-04-car.png differ diff --git a/site/photo/2012-10-04-text-break.png b/site/photo/2012-10-04-text-break.png new file mode 100644 index 000000000..06cb26d26 Binary files /dev/null and b/site/photo/2012-10-04-text-break.png differ diff --git a/site/photo/2013-09-02-hack-square.png b/site/photo/2013-09-02-hack-square.png new file mode 100644 index 000000000..99b280d42 Binary files /dev/null and b/site/photo/2013-09-02-hack-square.png differ diff --git a/site/photo/2014-03-27-dex.png b/site/photo/2014-03-27-dex.png new file mode 100644 index 000000000..63f3095bc Binary files /dev/null and b/site/photo/2014-03-27-dex.png differ diff --git a/site/photo/2014-04-25-dex.png b/site/photo/2014-04-25-dex.png new file mode 100644 index 000000000..7f0b21ea3 Binary files /dev/null and b/site/photo/2014-04-25-dex.png differ diff --git a/site/stylesheets/coderay.css b/site/stylesheets/coderay.css new file mode 100644 index 000000000..8ce902998 --- /dev/null +++ b/site/stylesheets/coderay.css @@ -0,0 +1,128 @@ +.CodeRay { + font-family: Consolas,Menlo,"Liberation Mono",Courier,monospace; + color: #000; +} + +.CodeRay pre { +} + +div.CodeRay { } +span.CodeRay { white-space: pre; border: 0px; padding: 2px } + +table.CodeRay { border-collapse: collapse; width: 100%; padding: 2px } +table.CodeRay td { + padding: 1em 0.5em; + vertical-align: top; +} + +.CodeRay .line-numbers, .CodeRay .no { + background-color: #ECECEC; + color: #AAA; + text-align: center; +} + +.CodeRay .line-numbers a { + color: #AAA; +} + +.CodeRay .line-numbers tt { font-weight: bold } +.CodeRay .line-numbers .highlighted { color: red } +.CodeRay .line { display: block; float: left; width: 100%; } +.CodeRay span.line-numbers { padding: 0px 3px 0px 3px; width: 26px; display: inline-block; margin-right: 7px; } +.CodeRay .code { width: 100%; padding-right: 10px } + +ol.CodeRay { font-size: 1.0em } +ol.CodeRay li { white-space: pre } + +.CodeRay .code pre { overflow: auto; padding-left: 8px; padding-right: 8px; } +.CodeRay .debug { color:white ! important; background:blue ! 
important; } + +.CodeRay .annotation { color:#007 } +.CodeRay .attribute-name { color:#f08 } +.CodeRay .attribute-value { color:#700 } +.CodeRay .binary { color:#509; font-weight:bold } +.CodeRay .comment { color:#998; font-style: italic;} +.CodeRay .char { color:#04D } +.CodeRay .char .content { color:#04D } +.CodeRay .char .delimiter { color:#039 } +.CodeRay .class { color:#458; font-weight:bold } +.CodeRay .complex { color:#A08; font-weight:bold } +.CodeRay .constant { color:teal; } +.CodeRay .color { color:#0A0 } +.CodeRay .class-variable { color:#369 } +.CodeRay .decorator { color:#B0B; } +.CodeRay .definition { color:#099; font-weight:bold } +.CodeRay .directive { color:#088; font-weight:bold } +.CodeRay .delimiter { color:black } +.CodeRay .doc { color:#970 } +.CodeRay .doctype { color:#34b } +.CodeRay .doc-string { color:#D42; font-weight:bold } +.CodeRay .escape { color:#666; font-weight:bold } +.CodeRay .entity { color:#800; font-weight:bold } +.CodeRay .error { color:#F00; background-color:#FAA } +.CodeRay .exception { color:#C00; font-weight:bold } +.CodeRay .filename { color:#099; } +.CodeRay .function { color:#900; font-weight:bold } +.CodeRay .global-variable { color:teal; font-weight:bold } +.CodeRay .hex { color:#058; font-weight:bold } +.CodeRay .integer { color:#099; } +.CodeRay .include { color:#B44; font-weight:bold } +.CodeRay .inline { color: black } +.CodeRay .inline .inline { background: #ccc } +.CodeRay .inline .inline .inline { background: #bbb } +.CodeRay .inline .inline-delimiter { color: #D14; } +.CodeRay .inline-delimiter { color: #D14; } +.CodeRay .important { color:#f00; } +.CodeRay .interpreted { color:#B2B; font-weight:bold } +.CodeRay .instance-variable { color:teal } +.CodeRay .label { color:#970; font-weight:bold } +.CodeRay .local-variable { color:#963 } +.CodeRay .octal { color:#40E; font-weight:bold } +.CodeRay .operator { } +.CodeRay .predefined-constant { font-weight:bold } +.CodeRay .predefined { color:#369; font-weight:bold } +.CodeRay .preprocessor { color:#579; } +.CodeRay .pseudo-class { color:#00C; font-weight:bold } +.CodeRay .predefined-type { color:#074; font-weight:bold } +.CodeRay .reserved, .keyword { color:#000; font-weight:bold } + +.CodeRay .key { color: #808; } +.CodeRay .key .delimiter { color: #606; } +.CodeRay .key .char { color: #80f; } +.CodeRay .value { color: #088; } + +.CodeRay .regexp { background-color:#fff0ff } +.CodeRay .regexp .content { color:#808 } +.CodeRay .regexp .delimiter { color:#404 } +.CodeRay .regexp .modifier { color:#C2C } +.CodeRay .regexp .function { color:#404; font-weight: bold } + +.CodeRay .string { color: #D20; } +.CodeRay .string .string { } +.CodeRay .string .string .string { background-color:#ffd0d0 } +.CodeRay .string .content { color: #D14; } +.CodeRay .string .char { color: #D14; } +.CodeRay .string .delimiter { color: #D14; } + +.CodeRay .shell { color:#D14 } +.CodeRay .shell .content { } +.CodeRay .shell .delimiter { color:#D14 } + +.CodeRay .symbol { color:#990073 } +.CodeRay .symbol .content { color:#A60 } +.CodeRay .symbol .delimiter { color:#630 } + +.CodeRay .tag { color:#070 } +.CodeRay .tag-special { color:#D70; font-weight:bold } +.CodeRay .type { color:#339; font-weight:bold } +.CodeRay .variable { color:#036 } + +.CodeRay .insert { background: #afa; } +.CodeRay .delete { background: #faa; } +.CodeRay .change { color: #aaf; background: #007; } +.CodeRay .head { color: #f8f; background: #505 } + +.CodeRay .insert .insert { color: #080; font-weight:bold } +.CodeRay .delete .delete { 
color: #800; font-weight:bold } +.CodeRay .change .change { color: #66f; } +.CodeRay .head .head { color: #f4f; } diff --git a/site/stylesheets/styles.css b/site/stylesheets/styles.css new file mode 100644 index 000000000..90415ef99 --- /dev/null +++ b/site/stylesheets/styles.css @@ -0,0 +1,286 @@ +@import url(https://fonts.googleapis.com/css?family=Lato:300italic,700italic,300,700); + +body { + padding:50px; + font:0.92em Lato, "Helvetica Neue", Helvetica, Arial, sans-serif; + background:#ffffff; + color:#484848; + font-weight:300; +} + +h1, h2, h3, h4, h5, h6 { + color:#222; + margin:0 0 20px; +} + +p, ul, ol, table, pre, dl { + line-height: 1.35; + margin:0 0 20px; +} + +h1, h2, h3 { + line-height:1.3; +} + +h1 { + font-size:2.1em; +} + +h2 { + font-size:1.7em; + color:#303030; +} + +h3, h4, h5, h6 { + font-size:1.2em; + color:#303030; +} + +a { + color:#268bd2; + font-weight:400; + text-decoration:none; +} + +a small { + font-size:0.9em; + color:#586e75; + margin-top:-0.05em; + display:block; +} + +h1 a { + color:#222; + font-weight:bold; +} + +h2 a.news { + color:#268bd2; + font-weight:bold; + font-size:1.25em; +} + +ul.main-list { + font-size: 1.15em; + list-style-type: none; + line-height: 1.8; + padding-left: 6px; +} + +ul.sub-list { + font-size: 1em; + list-style-type: none; + line-height: 1.5; + padding-left: 6px; +} + +ul.sub-list li { + margin-bottom: 5px; +} + +.wrapper { + width:860px; + margin:0 auto; +} + +section h1 { + font-size: 1.7em; +} + +section h2 { + font-size: 1.2em; +} + +blockquote { + border-left:1px solid #e5e5e5; + margin:0; + padding:0 0 0 20px; + font-style:italic; +} + +pre { + padding:8px 15px; + background: #f8f8f8; + border-radius:5px; + border:1px solid #e5e5e5; + overflow-x: auto; +} + +table { + width:100%; + border-collapse:collapse; +} + +th, td { + text-align:left; + padding:5px 10px; + border-bottom:1px solid #e5e5e5; +} + +dt { + color:#444; + font-weight:700; +} + +th { + color:#444; +} + +img { + max-width:100%; +} + +header { + width:270px; + float:left; + position:fixed; +} + +header ul { + list-style:none; + height:40px; + + padding:0; + + background: #eee; + background: -moz-linear-gradient(top, #f8f8f8 0%, #dddddd 100%); + background: -webkit-gradient(linear, left top, left bottom, color-stop(0%,#f8f8f8), color-stop(100%,#dddddd)); + background: -webkit-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: -o-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: -ms-linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + background: linear-gradient(top, #f8f8f8 0%,#dddddd 100%); + + border-radius:5px; + border:1px solid #d2d2d2; + box-shadow:inset #fff 0 1px 0, inset rgba(0,0,0,0.03) 0 -1px 0; + width:270px; +} + +header li { + width:89px; + float:left; + border-right:1px solid #d2d2d2; + height:40px; +} + +header ul a { + line-height:1; + font-size:0.85em; + color:#999; + display:block; + text-align:center; + padding-top:6px; + height:40px; +} + +strong { + color:#222; + font-weight:700; +} + +header ul li + li { + width:88px; + border-left:1px solid #fff; +} + +header ul li + li + li { + border-right:none; + width:89px; +} + +header ul a strong { + font-size:1.1em; + display:block; + color:#222; +} + +section { + width:500px; + float:right; + padding-bottom:50px; +} + +small { + font-size:0.85em; +} + +hr { + border:0; + background:#aaa; + height:1px; + margin:0 0 20px; +} + +footer { + width:270px; + float:left; + position:fixed; + bottom:50px; +} + +@media print, screen and (max-width: 960px) { + + div.wrapper { + width:auto; + 
margin:0; + } + + header, section, footer { + float:none; + position:static; + width:auto; + } + + header { + padding-right:320px; + } + + section { + border:1px solid #e5e5e5; + border-width:1px 0; + padding:20px 0; + margin:0 0 20px; + } + + header a small { + display:inline; + } + + header ul { + position:absolute; + right:50px; + top:52px; + } +} + +@media print, screen and (max-width: 720px) { + body { + word-wrap:break-word; + } + + header { + padding:0; + } + + header ul, header p.view { + position:static; + } +} + +@media print, screen and (max-width: 480px) { + body { + padding:15px; + } + + header ul { + display:none; + } +} + +@media print { + body { + padding:0.4in; + font-size:1.0em; + color:#444; + } +} diff --git a/site/tutorial/index.markdown b/site/tutorial/index.markdown new file mode 100644 index 000000000..5623651ec --- /dev/null +++ b/site/tutorial/index.markdown @@ -0,0 +1,56 @@ +--- +layout: page +title: Getting Started +--- + +Install +------- +ccv is very lightweight. There is no concept of 'install'. If you want to use ccv, statically linking to it is sufficient. To the extreme, it is recommended to drop the ccv source code into your project and use it directly. + +For old-schoolers, running 'make' in the /lib directory will generate the libccv.a archive. From there, you can statically link to it with -lccv. For example, suppose you have ccv compiled in ~/ccv/ and you want to compile ~/studies/sift.c against it: + + clang -L"~/ccv/lib" -I"~/ccv/lib" sift.c -lccv `cat ~/ccv/lib/.LN` -lm + +That's it. The only magic sauce is ~/ccv/lib/.LN, which records all the dependencies you have to link against, as discovered when ccv was compiled for the first time. If your ccv was compiled with no dependencies, it is empty (and ccv works with zero dependencies). + +Read a Photo +------------ +Let's start with something small. + +{:lang="c"} + #include <ccv.h> + int main(int argc, char** argv) + { + ccv_dense_matrix_t* image = 0; + ccv_read(argv[1], &image, CCV_IO_GRAY | CCV_IO_ANY_FILE); + ccv_write(image, argv[2], 0, CCV_IO_PNG_FILE, 0); + ccv_matrix_free(image); + return 0; + } + +If your ccv build depends on libjpeg and libpng, the code above is sufficient to load any JPEG or PNG file into memory and save a grayscale version to disk. + +Detect a Face +------------- +Knowing how to read a photo is already sufficient to write an application that does, for example, face detection. + +{:lang="c"} + #include <ccv.h> + int main(int argc, char** argv) + { + ccv_dense_matrix_t* image = 0; + ccv_read(argv[1], &image, CCV_IO_GRAY | CCV_IO_ANY_FILE); + ccv_bbf_classifier_cascade_t* cascade = ccv_bbf_read_classifier_cascade(argv[2]); + ccv_array_t* faces = ccv_bbf_detect_objects(image, &cascade, 1, ccv_bbf_default_params); + int i; + for (i = 0; i < faces->rnum; i++) + { + ccv_comp_t* face = (ccv_comp_t*)ccv_array_get(faces, i); + printf("%d %d %d %d\n", face->rect.x, face->rect.y, face->rect.width, face->rect.height); + } + ccv_array_free(faces); + ccv_bbf_classifier_cascade_free(cascade); + ccv_matrix_free(image); + return 0; + } + +That's it. Compile it the same way as the sift.c example above, then run it with a photo as the first argument and a trained BBF cascade directory as the second; ccv ships a pretrained face cascade in ./samples/face. diff --git a/test/ccv_case.h b/test/ccv_case.h index 3ca4d4ead..eea63261f 100644 --- a/test/ccv_case.h +++ b/test/ccv_case.h @@ -1,6 +1,8 @@ #ifndef _GUARD_ccv_case_h_ #define _GUARD_ccv_case_h_ +#include <math.h> + #define REQUIRE_MATRIX_EQ(a, b, err, ...) { \ if (ccv_matrix_eq(a, b) != 0) \ { \ @@ -19,4 +21,20 @@ if (ccv_matrix_eq(a, __case_b__) != 0) \ } \ ccv_matrix_free(__case_b__); } +#define REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(type, a, b, len, angle, magnitude, err, ...)
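/* interprets the two arrays as vectors: fails when the angle between them exceeds the given number of degrees, or when their L2 magnitudes differ by more than the given relative tolerance; for the gradient checks below this is less brittle than an element-wise comparison */ \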
{ \ +int __case_i__; \ +double __dot_prod__ = 0, __mag_a__ = 0, __mag_b__ = 0; \ +for (__case_i__ = 0; __case_i__ < (len); __case_i__++) \ +{ \ + __dot_prod__ += (double)(((type*)(a))[__case_i__] * ((type*)(b))[__case_i__]); \ + __mag_a__ += (double)(((type*)(a))[__case_i__] * ((type*)(a))[__case_i__]); \ + __mag_b__ += (double)(((type*)(b))[__case_i__] * ((type*)(b))[__case_i__]); \ +} \ +__mag_a__ = sqrt(__mag_a__), __mag_b__ = sqrt(__mag_b__); \ +if (acos(__dot_prod__ / (__mag_a__ * __mag_b__)) * 180 / 3.141592653 > angle || fabs(__mag_a__ - __mag_b__) / ccv_max(ccv_max(__mag_a__, __mag_b__), 1) > magnitude) \ +{ \ + printf("\n\t\033[0;31mREQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE\033[0;0m: %s:%d: angle: %lg | %lg, magnitude: %lg != %lg | +-%lg, " err, __FILE__, __LINE__, (double)(acos(__dot_prod__ / (__mag_a__ * __mag_b__)) * 180 / 3.141592653), (double)(angle), __mag_a__, __mag_b__, (double)(magnitude), ##__VA_ARGS__); \ + ABORT_CASE; \ +} } + #endif diff --git a/test/functional/convnet.tests.c b/test/functional/convnet.tests.c index 53229327a..130251e80 100644 --- a/test/functional/convnet.tests.c +++ b/test/functional/convnet.tests.c @@ -1,6 +1,7 @@ #include "ccv.h" #include "case.h" #include "ccv_case.h" +#include "3rdparty/dsfmt/dSFMT.h" TEST_CASE("convolutional network of 11x11 on 225x225 with uniform weights") { @@ -13,22 +14,24 @@ TEST_CASE("convolutional network of 11x11 on 225x225 with uniform weights") .rows = 225, .cols = 225, .channels = 3, + .partition = 1, }, }, .output = { .convolutional = { - .count = 1, + .count = 4, .strides = 4, .border = 1, .rows = 11, .cols = 11, .channels = 3, + .partition = 1, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(225, 225), &params, 1); int i, x, y; - for (i = 0; i < 11 * 11 * 3; i++) + for (i = 0; i < 11 * 11 * 3 * 4; i++) convnet->layers[0].w[i] = 1; ccv_dense_matrix_t* a = ccv_dense_matrix_new(225, 225, CCV_32F | CCV_C3, 0, 0); for (i = 0; i < 225 * 225 * 3; i++) @@ -37,10 +40,11 @@ TEST_CASE("convolutional network of 11x11 on 225x225 with uniform weights") ccv_convnet_encode(convnet, &a, &b, 1); ccv_matrix_free(a); REQUIRE(b->rows == 55 && b->cols == 55, "11x11 convolves on 225x255 with strides 4 should produce 55x55 matrix"); - ccv_dense_matrix_t* c = ccv_dense_matrix_new(55, 55, CCV_32F | CCV_C1, 0, 0); + ccv_dense_matrix_t* c = ccv_dense_matrix_new(55, 55, CCV_32F | 4, 0, 0); for (y = 0; y < 55; y++) for (x = 0; x < 55; x++) - c->data.f32[y * 55 + x] = ((x == 0 && y == 0) || (x == 0 && y == 54) || (x == 54 && y == 0) || (x == 54 && y == 54)) ? 300 : ((x == 0 || y == 0 || x == 54 || y == 54) ? 330 : 363); + for (i = 0; i < 4; i++) + c->data.f32[(y * 55 + x) * 4 + i] = ((x == 0 && y == 0) || (x == 0 && y == 54) || (x == 54 && y == 0) || (x == 54 && y == 54)) ? 300 : ((x == 0 || y == 0 || x == 54 || y == 54) ?
330 : 363); REQUIRE_MATRIX_EQ(b, c, "55x55 matrix should be exactly a matrix fill 363, with 300 on the corner and 330 on the border"); ccv_matrix_free(b); ccv_matrix_free(c); @@ -58,22 +62,24 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with uniform weights") .rows = 27, .cols = 27, .channels = 1, + .partition = 1, }, }, .output = { .convolutional = { - .count = 1, + .count = 4, .strides = 1, .border = 2, .rows = 5, .cols = 5, .channels = 1, + .partition = 1, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(27, 27), &params, 1); int i, x, y; - for (i = 0; i < 5 * 5; i++) + for (i = 0; i < 5 * 5 * 4; i++) convnet->layers->w[i] = 1; ccv_dense_matrix_t* a = ccv_dense_matrix_new(27, 27, CCV_32F | CCV_C1, 0, 0); for (i = 0; i < 27 * 27; i++) @@ -82,21 +88,24 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with uniform weights") ccv_convnet_encode(convnet, &a, &b, 1); REQUIRE(b->rows == 27 && b->cols == 27, "5x5 convolves on 27x27 with border 2 should produce 27x27 matrix"); ccv_matrix_free(a); - ccv_dense_matrix_t* c = ccv_dense_matrix_new(27, 27, CCV_32F | CCV_C1, 0, 0); + ccv_dense_matrix_t* c = ccv_dense_matrix_new(27, 27, CCV_32F | 4, 0, 0); for (y = 0; y < 27; y++) for (x = 0; x < 27; x++) - if ((x == 0 && y == 0) || (x == 0 && y == 26) || (x == 26 && y == 0) || (x == 26 && y == 26)) - c->data.f32[y * 27 + x] = 9; - else if ((x == 0 && y == 1) || (x == 0 && y == 25) || (x == 1 && y == 0) || (x == 1 && y == 26) || (x == 25 && y == 0) || (x == 25 && y == 26) || (x == 26 && y == 1) || (x == 26 && y == 25)) - c->data.f32[y * 27 + x] = 12; - else if (x == 0 || y == 0 || x == 26 || y == 26) - c->data.f32[y * 27 + x] = 15; - else if ((x == 1 && y == 1) || (x == 1 && y == 25) || (x == 25 && y == 1) || (x == 25 && y == 25)) - c->data.f32[y * 27 + x] = 16; - else if (x == 1 || y == 1 || x == 25 || y == 25) - c->data.f32[y * 27 + x] = 20; - else - c->data.f32[y * 27 + x] = 25; + for (i = 0; i < 4; i++) + { + if ((x == 0 && y == 0) || (x == 0 && y == 26) || (x == 26 && y == 0) || (x == 26 && y == 26)) + c->data.f32[(y * 27 + x) * 4 + i] = 9; + else if ((x == 0 && y == 1) || (x == 0 && y == 25) || (x == 1 && y == 0) || (x == 1 && y == 26) || (x == 25 && y == 0) || (x == 25 && y == 26) || (x == 26 && y == 1) || (x == 26 && y == 25)) + c->data.f32[(y * 27 + x) * 4 + i] = 12; + else if (x == 0 || y == 0 || x == 26 || y == 26) + c->data.f32[(y * 27 + x) * 4 + i] = 15; + else if ((x == 1 && y == 1) || (x == 1 && y == 25) || (x == 25 && y == 1) || (x == 25 && y == 25)) + c->data.f32[(y * 27 + x) * 4 + i] = 16; + else if (x == 1 || y == 1 || x == 25 || y == 25) + c->data.f32[(y * 27 + x) * 4 + i] = 20; + else + c->data.f32[(y * 27 + x) * 4 + i] = 25; + } REQUIRE_MATRIX_EQ(b, c, "27x27 matrix should be exactly a matrix fill 25, with 9, 16 on the corner and 12, 15, 20 on the border"); ccv_matrix_free(b); ccv_matrix_free(c); @@ -114,23 +123,26 @@ TEST_CASE("convolutional network of 11x11 on 225x225 with non-uniform weights") .rows = 225, .cols = 225, .channels = 1, + .partition = 1, }, }, .output = { .convolutional = { - .count = 1, + .count = 4, .strides = 4, .border = 1, .rows = 11, .cols = 11, .channels = 1, + .partition = 1, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(225, 225), &params, 1); int i, x, y; - for (i = 0; i < 11 * 11; i++) - convnet->layers[0].w[i] = i + 1; + for (x = 0; x < 4; x++) + for (i = 0; i < 11 * 11; i++) + convnet->layers[0].w[x * 11 * 11 +
i] = i + 1; ccv_dense_matrix_t* a = ccv_dense_matrix_new(225, 225, CCV_32F | CCV_C1, 0, 0); for (i = 0; i < 225 * 225; i++) a->data.f32[i] = i + 1; @@ -138,41 +150,47 @@ TEST_CASE("convolutional network of 11x11 on 225x225 with non-uniform weights") ccv_convnet_encode(convnet, &a, &b, 1); ccv_matrix_free(a); REQUIRE(b->rows == 55 && b->cols == 55, "11x11 convolves on 225x255 with strides 4 should produce 55x55 matrix"); - ccv_dense_matrix_t* c = ccv_dense_matrix_new(55, 55, CCV_32F | CCV_C1, 0, 0); + ccv_dense_matrix_t* c = ccv_dense_matrix_new(55, 55, CCV_32F | 4, 0, 0); float sum = 0; // first column for (y = 0; y < 10; y++) for (x = 0; x < 10; x++) sum += ((y + 1) * 11 + x + 2) * (y * 225 + x + 1); - c->data.f32[0] = sum; + for (i = 0; i < 4; i++) + c->data.f32[i] = sum; sum = 0; for (y = 0; y < 10; y++) for (x = 0; x < 11; x++) sum += ((y + 1) * 11 + x + 1) * (y * 225 + (x + 3) + 1); for (x = 1; x < 54; x++) - c->data.f32[x] = sum + (x - 1) * 4 * (11 * 11 + 12) * 11 * 10 / 2; + for (i = 0; i < 4; i++) + c->data.f32[x * 4 + i] = sum + (x - 1) * 4 * (11 * 11 + 12) * 11 * 10 / 2; sum = 0; for (y = 0; y < 10; y++) for (x = 0; x < 10; x++) sum += ((y + 1) * 11 + x + 1) * (y * 225 + (x + 215) + 1); - c->data.f32[54] = sum; + for (i = 0; i < 4; i++) + c->data.f32[54 * 4 + i] = sum; // last column sum = 0; for (y = 0; y < 10; y++) for (x = 0; x < 10; x++) sum += (y * 11 + x + 2) * ((y + 215) * 225 + x + 1); - c->data.f32[55 * 54] = sum; + for (i = 0; i < 4; i++) + c->data.f32[55 * 54 * 4 + i] = sum; sum = 0; for (y = 0; y < 10; y++) for (x = 0; x < 11; x++) sum += (y * 11 + x + 1) * ((y + 215) * 225 + (x + 3) + 1); for (x = 1; x < 54; x++) - c->data.f32[55 * 54 + x] = sum + (x - 1) * 4 * (10 * 11 + 1) * 11 * 10 / 2; + for (i = 0; i < 4; i++) + c->data.f32[(55 * 54 + x) * 4 + i] = sum + (x - 1) * 4 * (10 * 11 + 1) * 11 * 10 / 2; sum = 0; for (y = 0; y < 10; y++) for (x = 0; x < 10; x++) sum += (y * 11 + x + 1) * ((y + 215) * 225 + (x + 215) + 1); - c->data.f32[55 * 54 + 54] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(55 * 54 + 54) * 4 + i] = sum; float border[] = { 0, 0 }; @@ -188,16 +206,19 @@ TEST_CASE("convolutional network of 11x11 on 225x225 with non-uniform weights") sum += (y * 11 + x + 1) * ((y + 3) * 225 + (x + 3) + 1); for (y = 1; y < 54; y++) { - c->data.f32[y * 55] = border[0]; + for (i = 0; i < 4; i++) + c->data.f32[y * 55 * 4 + i] = border[0]; for (x = 1; x < 54; x++) - c->data.f32[y * 55 + x] = sum + (x - 1) * 4 * (11 * 11 + 1) * 11 * 11 / 2; - c->data.f32[y * 55 + 54] = border[1]; + for (i = 0; i < 4; i++) + c->data.f32[(y * 55 + x) * 4 + i] = sum + (x - 1) * 4 * (11 * 11 + 1) * 11 * 11 / 2; + for (i = 0; i < 4; i++) + c->data.f32[(y * 55 + 54) * 4 + i] = border[1]; sum += 225 * 4 * (11 * 11 + 1) * 11 * 11 / 2; border[0] += 225 * 4 * ((11 * 11 + 1) * 11 * 11 / 2 - (10 * 11 + 1 + 1) * 11 / 2); border[1] += 225 * 4 * ((11 * 11 + 1) * 11 * 11 / 2 - (11 * 11 + 11) * 11 / 2); } // regularize the output so it is within the tolerance - for (i = 0; i < 55 * 55; i++) + for (i = 0; i < 55 * 55 * 4; i++) c->data.f32[i] = c->data.f32[i] * 1e-7, b->data.f32[i] = b->data.f32[i] * 1e-7; REQUIRE_MATRIX_EQ(b, c, "55x55 matrix should be exactly the same"); ccv_matrix_free(b); @@ -216,23 +237,26 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with non-uniform weights") .rows = 27, .cols = 27, .channels = 1, + .partition = 1, }, }, .output = { .convolutional = { - .count = 1, + .count = 4, .strides = 1, .border = 2, .rows = 5, .cols = 5, .channels = 1, + .partition = 1, }, }, }; - 
ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(27, 27), &params, 1); int i, x, y; - for (i = 0; i < 5 * 5; i++) - convnet->layers->w[i] = i + 1; + for (x = 0; x < 4; x++) + for (i = 0; i < 5 * 5; i++) + convnet->layers->w[x * 5 * 5 + i] = i + 1; ccv_dense_matrix_t* a = ccv_dense_matrix_new(27, 27, CCV_32F | CCV_C1, 0, 0); for (i = 0; i < 27 * 27; i++) a->data.f32[i] = i + 1; @@ -240,115 +264,135 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with non-uniform weights") ccv_convnet_encode(convnet, &a, &b, 1); REQUIRE(b->rows == 27 && b->cols == 27, "5x5 convolves on 27x27 with border 2 should produce 27x27 matrix"); ccv_matrix_free(a); - ccv_dense_matrix_t* c = ccv_dense_matrix_new(27, 27, CCV_32F | CCV_C1, 0, 0); + ccv_dense_matrix_t* c = ccv_dense_matrix_new(27, 27, CCV_32F | 4, 0, 0); // the first column float sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 3; x++) sum += ((y + 2) * 5 + x + 3) * (y * 27 + x + 1); - c->data.f32[0] = sum; + for (i = 0; i < 4; i++) + c->data.f32[i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 4; x++) sum += ((y + 2) * 5 + x + 2) * (y * 27 + x + 1); - c->data.f32[1] = sum; + for (i = 0; i < 4; i++) + c->data.f32[4 + i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 5; x++) sum += ((y + 2) * 5 + x + 1) * (y * 27 + x + 1); for (x = 2; x < 25; x++) - c->data.f32[x] = sum + (x - 2) * 36 * 15 / 2; + for (i = 0; i < 4; i++) + c->data.f32[x * 4 + i] = sum + (x - 2) * 36 * 15 / 2; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 4; x++) sum += ((y + 2) * 5 + x + 1) * (y * 27 + x + 24); - c->data.f32[25] = sum; + for (i = 0; i < 4; i++) + c->data.f32[25 * 4 + i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 3; x++) sum += ((y + 2) * 5 + x + 1) * (y * 27 + x + 25); - c->data.f32[26] = sum; + for (i = 0; i < 4; i++) + c->data.f32[26 * 4 + i] = sum; // the second column sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 3; x++) sum += ((y + 1) * 5 + x + 3) * (y * 27 + x + 1); - c->data.f32[27] = sum; + for (i = 0; i < 4; i++) + c->data.f32[27 * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 4; x++) sum += ((y + 1) * 5 + x + 2) * (y * 27 + x + 1); - c->data.f32[28] = sum; + for (i = 0; i < 4; i++) + c->data.f32[28 * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 5; x++) sum += ((y + 1) * 5 + x + 1) * (y * 27 + x + 1); for (x = 2; x < 25; x++) - c->data.f32[27 + x] = sum + (x - 2) * 31 * 20 / 2; + for (i = 0; i < 4; i++) + c->data.f32[(27 + x) * 4 + i] = sum + (x - 2) * 31 * 20 / 2; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 4; x++) sum += ((y + 1) * 5 + x + 1) * (y * 27 + x + 24); - c->data.f32[52] = sum; + for (i = 0; i < 4; i++) + c->data.f32[52 * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 3; x++) sum += ((y + 1) * 5 + x + 1) * (y * 27 + x + 25); - c->data.f32[53] = sum; + for (i = 0; i < 4; i++) + c->data.f32[53 * 4 + i] = sum; sum = 0; // the last 2nd column for (y = 0; y < 4; y++) for (x = 0; x < 3; x++) sum += (y * 5 + x + 3) * ((y + 23) * 27 + x + 1); - c->data.f32[27 * 25] = sum; + for (i = 0; i < 4; i++) + c->data.f32[27 * 25 * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 4; x++) sum += (y * 5 + x + 2) * ((y + 23) * 27 + x + 1); - c->data.f32[27 * 25 + 1] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 25 + 1) * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 5; x++) sum += (y * 5 + x + 1) * ((y + 23) * 27 + x + 1); for (x = 2; x < 25; x++) - c->data.f32[27 * 25 + x] = sum + (x -
2) * 21 * 20 / 2; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 25 + x) * 4 + i] = sum + (x - 2) * 21 * 20 / 2; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 4; x++) sum += (y * 5 + x + 1) * ((y + 23) * 27 + x + 24); - c->data.f32[27 * 25 + 25] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 25 + 25) * 4 + i] = sum; sum = 0; for (y = 0; y < 4; y++) for (x = 0; x < 3; x++) sum += (y * 5 + x + 1) * ((y + 23) * 27 + x + 25); - c->data.f32[27 * 25 + 26] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 25 + 26) * 4 + i] = sum; // the last column sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 3; x++) sum += (y * 5 + x + 3) * ((y + 24) * 27 + x + 1); - c->data.f32[27 * 26] = sum; + for (i = 0; i < 4; i++) + c->data.f32[27 * 26 * 4 + i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 4; x++) sum += (y * 5 + x + 2) * ((y + 24) * 27 + x + 1); - c->data.f32[27 * 26 + 1] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 26 + 1) * 4 + i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 5; x++) sum += (y * 5 + x + 1) * ((y + 24) * 27 + x + 1); for (x = 2; x < 25; x++) - c->data.f32[27 * 26 + x] = sum + (x - 2) * 16 * 15 / 2; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 26 + x) * 4 + i] = sum + (x - 2) * 16 * 15 / 2; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 4; x++) sum += (y * 5 + x + 1) * ((y + 24) * 27 + x + 24); - c->data.f32[27 * 26 + 25] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 26 + 25) * 4 + i] = sum; sum = 0; for (y = 0; y < 3; y++) for (x = 0; x < 3; x++) sum += (y * 5 + x + 1) * ((y + 24) * 27 + x + 25); - c->data.f32[27 * 26 + 26] = sum; + for (i = 0; i < 4; i++) + c->data.f32[(27 * 26 + 26) * 4 + i] = sum; float border[] = { 0, 0, 0, 0 }; @@ -370,12 +414,15 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with non-uniform weights") sum += (y * 5 + x + 1) * (y * 27 + x + 1); for (y = 2; y < 25; y++) { - c->data.f32[y * 27] = border[0] + (y - 2) * 27 * (3 + 4 + 5 + 8 + 9 + 10 + 13 + 14 + 15 + 18 + 19 + 20 + 23 + 24 + 25); - c->data.f32[y * 27 + 1] = border[1] + (y - 2) * 27 * (2 + 3 + 4 + 5 + 7 + 8 + 9 + 10 + 12 + 13 + 14 + 15 + 17 + 18 + 19 + 20 + 22 + 23 + 24 + 25); - for (x = 2; x < 25; x++) - c->data.f32[y * 27 + x] = sum + ((y - 2) * 27 + x - 2) * 26 * 25 / 2; - c->data.f32[y * 27 + 25] = border[2] + (y - 2) * 27 * (1 + 2 + 3 + 4 + 6 + 7 + 8 + 9 + 11 + 12 + 13 + 14 + 16 + 17 + 18 + 19 + 21 + 22 + 23 + 24); - c->data.f32[y * 27 + 26] = border[3] + (y - 2) * 27 * (1 + 2 + 3 + 6 + 7 + 8 + 11 + 12 + 13 + 16 + 17 + 18 + 21 + 22 + 23); + for (i = 0; i < 4; i++) + { + c->data.f32[y * 27 * 4 + i] = border[0] + (y - 2) * 27 * (3 + 4 + 5 + 8 + 9 + 10 + 13 + 14 + 15 + 18 + 19 + 20 + 23 + 24 + 25); + c->data.f32[(y * 27 + 1) * 4 + i] = border[1] + (y - 2) * 27 * (2 + 3 + 4 + 5 + 7 + 8 + 9 + 10 + 12 + 13 + 14 + 15 + 17 + 18 + 19 + 20 + 22 + 23 + 24 + 25); + for (x = 2; x < 25; x++) + c->data.f32[(y * 27 + x) * 4 + i] = sum + ((y - 2) * 27 + x - 2) * 26 * 25 / 2; + c->data.f32[(y * 27 + 25) * 4 + i] = border[2] + (y - 2) * 27 * (1 + 2 + 3 + 4 + 6 + 7 + 8 + 9 + 11 + 12 + 13 + 14 + 16 + 17 + 18 + 19 + 21 + 22 + 23 + 24); + c->data.f32[(y * 27 + 26) * 4 + i] = border[3] + (y - 2) * 27 * (1 + 2 + 3 + 6 + 7 + 8 + 11 + 12 + 13 + 16 + 17 + 18 + 21 + 22 + 23); + } } REQUIRE_MATRIX_EQ(b, c, "27x27 matrix should be exactly the same"); ccv_matrix_free(b); @@ -383,6 +430,99 @@ TEST_CASE("convolutional network of 5x5 on 27x27 with non-uniform weights") ccv_convnet_free(convnet); } +TEST_CASE("convolutional network of 5x5x4 on 27x27x8 partitioned by 2") +{ + 
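/* a convolutional layer with partition = 2 should behave as two independent half-channel convolutions: this test feeds the same weights through a partition = 1 convnet twice, once per channel half, and requires the interleaved outputs to match the partitioned layer exactly */ +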
ccv_convnet_layer_param_t params = { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 4, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .count = 8, + .strides = 1, + .border = 2, + .rows = 5, + .cols = 5, + .channels = 4, + .partition = 2, + }, + }, + }; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(27, 27), &params, 1); + int i, k; + for (i = 0; i < convnet->layers->wnum; i++) + convnet->layers->w[i] = i; + for (i = 0; i < convnet->layers->net.convolutional.count; i++) + convnet->layers->bias[i] = i + 1; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(27, 27, CCV_32F | 4, 0, 0); + for (i = 0; i < 27 * 27 * 4; i++) + a->data.f32[i] = 20 - i; + ccv_dense_matrix_t* b = 0; + ccv_convnet_encode(convnet, &a, &b, 1); + ccv_convnet_layer_param_t partitioned_params = { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.01, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 2, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .count = 4, + .strides = 1, + .border = 2, + .rows = 5, + .cols = 5, + .channels = 2, + .partition = 1, + }, + }, + }; + ccv_convnet_t* partitioned_convnet = ccv_convnet_new(0, ccv_size(27, 27), &partitioned_params, 1); + memcpy(partitioned_convnet->layers->w, convnet->layers->w, sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(partitioned_convnet->layers->bias, convnet->layers->bias, sizeof(float) * (convnet->layers->net.convolutional.count / 2)); + ccv_dense_matrix_t* aa = ccv_dense_matrix_new(27, 27, CCV_32F | 2, 0, 0); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 2; k++) + aa->data.f32[i * 2 + k] = a->data.f32[i * 4 + k]; + ccv_dense_matrix_t* bb = ccv_dense_matrix_new(27, 27, CCV_32F | 8, 0, 0); + ccv_dense_matrix_t* cc = 0; + ccv_convnet_encode(partitioned_convnet, &aa, &cc, 1); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 4; k++) + bb->data.f32[i * 8 + k] = cc->data.f32[i * 4 + k]; + memcpy(partitioned_convnet->layers->w, convnet->layers->w + (convnet->layers->wnum / 2), sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(partitioned_convnet->layers->bias, convnet->layers->bias + (convnet->layers->net.convolutional.count / 2), sizeof(float) * (convnet->layers->net.convolutional.count / 2)); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 2; k++) + aa->data.f32[i * 2 + k] = a->data.f32[i * 4 + 2 + k]; + ccv_convnet_encode(partitioned_convnet, &aa, &cc, 1); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 4; k++) + bb->data.f32[i * 8 + 4 + k] = cc->data.f32[i * 4 + k]; + REQUIRE_MATRIX_EQ(b, bb, "27x27x8 matrix computed from convnet with partition and partitioned convnet should be exactly the same"); + ccv_matrix_free(a); + ccv_matrix_free(b); + ccv_matrix_free(aa); + ccv_matrix_free(bb); + ccv_matrix_free(cc); + ccv_convnet_free(convnet); + ccv_convnet_free(partitioned_convnet); +} + TEST_CASE("full connect network from 13x13x128 to 2048") { ccv_convnet_layer_param_t params = { @@ -394,6 +534,7 @@ TEST_CASE("full connect network from 13x13x128 to 2048") .rows = 13, .cols = 13, .channels = 128, + .partition = 1, }, .node = { .count = 13 * 13 * 128, @@ -401,11 +542,12 @@ }, .output = { .full_connect = { + .relu = 0, .count = 2048, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(13, 13), &params, 1); int i; for (i = 0; i < 13 * 13 * 128 * 2048; i++) convnet->layers->w[i] = 1; @@
-414,6 +556,7 @@ TEST_CASE("full connect network from 13x13x128 to 2048") a->data.f32[i] = 1; ccv_dense_matrix_t* b = 0; ccv_convnet_encode(convnet, &a, &b, 1); + ccv_matrix_free(a); REQUIRE(b->rows == 2048 && b->cols == 1, "full connect network output should be 2048 neurons"); ccv_dense_matrix_t* c = ccv_dense_matrix_new(2048, 1, CCV_32F | CCV_C1, 0, 0); for (i = 0; i < 2048; i++) @@ -435,6 +578,7 @@ TEST_CASE("maximum pool network of 55x55 with window of 3x3 and stride of 2") .rows = 55, .cols = 55, .channels = 1, + .partition = 1, }, }, .output = { @@ -445,7 +589,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(55, 55), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(55, 55, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 55 * 55; i++) @@ -474,6 +618,7 @@ TEST_CASE("maximum pool network of 57x57 with window of 3x3 and stride of 3") .rows = 57, .cols = 57, .channels = 1, + .partition = 1, }, }, .output = { @@ -484,7 +629,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(57, 57), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(57, 57, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 57 * 57; i++) @@ -513,6 +658,7 @@ TEST_CASE("maximum pool network of 54x54 with window of 2x2 and stride of 2") .rows = 54, .cols = 54, .channels = 1, + .partition = 1, }, }, .output = { @@ -523,7 +669,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(54, 54), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(54, 54, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 54 * 54; i++) @@ -552,6 +698,7 @@ TEST_CASE("average pool network of 55x55 with window of 3x3 and stride of 2") .rows = 55, .cols = 55, .channels = 1, + .partition = 1, }, }, .output = { @@ -562,7 +709,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(55, 55), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(55, 55, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 55 * 55; i++) @@ -591,6 +738,7 @@ TEST_CASE("average pool network of 57x57 with window of 3x3 and stride of 3") .rows = 57, .cols = 57, .channels = 1, + .partition = 1, }, }, .output = { @@ -601,7 +749,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(57, 57), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(57, 57, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 57 * 57; i++) @@ -630,6 +778,7 @@ TEST_CASE("average pool network of 54x54 with window of 2x2 and stride of 2") .rows = 54, .cols = 54, .channels = 1, + .partition = 1, }, }, .output = { @@ -640,7 +789,7 @@ }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(54, 54), &params, 1); ccv_dense_matrix_t* a = ccv_dense_matrix_new(54, 54, CCV_32F | CCV_C1, 0, 0); int i, x, y; for (i = 0; i < 54 * 54;
i++) @@ -658,6 +807,81 @@ TEST_CASE("average pool network of 54x54 with window of 2x2 and stride of 2") ccv_convnet_free(convnet); } +TEST_CASE("local response normalization with partitioned by 2") +{ + ccv_convnet_layer_param_t params = { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 10, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 3, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(27, 27), &params, 1); + int i, k; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(27, 27, CCV_32F | 10, 0, 0); + for (i = 0; i < 27 * 27 * 10; i++) + a->data.f32[i] = i; + ccv_dense_matrix_t* b = 0; + ccv_convnet_encode(convnet, &a, &b, 1); + ccv_convnet_layer_param_t partitioned_params = { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 5, + .partition = 1, + }, + }, + .output = { + .rnorm = { + .size = 3, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }; + ccv_convnet_t* partitioned_convnet = ccv_convnet_new(0, ccv_size(27, 27), &partitioned_params, 1); + ccv_dense_matrix_t* aa = ccv_dense_matrix_new(27, 27, CCV_32F | 5, 0, 0); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 5; k++) + aa->data.f32[i * 5 + k] = a->data.f32[i * 10 + k]; + ccv_dense_matrix_t* bb = ccv_dense_matrix_new(27, 27, CCV_32F | 10, 0, 0); + ccv_dense_matrix_t* cc = 0; + ccv_convnet_encode(partitioned_convnet, &aa, &cc, 1); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 5; k++) + bb->data.f32[i * 10 + k] = cc->data.f32[i * 5 + k]; + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 5; k++) + aa->data.f32[i * 5 + k] = a->data.f32[i * 10 + 5 + k]; + ccv_convnet_encode(partitioned_convnet, &aa, &cc, 1); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 5; k++) + bb->data.f32[i * 10 + 5 + k] = cc->data.f32[i * 5 + k]; + REQUIRE_MATRIX_EQ(b, bb, "27x27x10 matrix computed from convnet with partition and partitioned convnet should be exactly the same"); + ccv_matrix_free(a); + ccv_matrix_free(b); + ccv_matrix_free(aa); + ccv_matrix_free(bb); + ccv_matrix_free(cc); + ccv_convnet_free(convnet); + ccv_convnet_free(partitioned_convnet); +} + // we probably won't cover all static functions in this test, disable annoying warnings #pragma GCC diagnostic ignored "-Wunused-function" // so that we can test static functions, note that CASE_TESTS is defined in case.h, which will disable all extern functions @@ -674,6 +898,7 @@ TEST_CASE("full connect network backward propagate") .rows = 3, .cols = 3, .channels = 64, + .partition = 1, }, .node = { .count = 3 * 3 * 64, @@ -681,11 +906,12 @@ }, .output = { .full_connect = { + .relu = 0, .count = 10, }, }, }; - ccv_convnet_t *convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t *convnet = ccv_convnet_new(0, ccv_size(3, 3), &params, 1); int i, j; for (i = 0; i < 3 * 3 * 64 * 10; i++) convnet->layers[0].w[i] = 2; @@ -697,13 +923,13 @@ TEST_CASE("full connect network backward propagate") ccv_dense_matrix_t* y = 0; ccv_convnet_encode(convnet, &x, &y, 1); REQUIRE(y->rows == 10 && y->cols == 1 && CCV_GET_CHANNEL(y->type) == 1, "y should be a 10-dimensional vector"); - ccv_matrix_free(y); ccv_dense_matrix_t* loss = ccv_dense_matrix_new(10, 1, CCV_32F | CCV_C1, 0, 0); loss->data.f32[0] = 18; for (i = 1; i < 10; i++) loss->data.f32[i] = -1; ccv_dense_matrix_t* b = 0; - _ccv_convnet_full_connect_backward_propagate(convnet->layers, loss, 0, x, &b,
update_params->layers); + _ccv_convnet_full_connect_backward_propagate(convnet->layers, loss, y, x, &b, update_params->layers); + ccv_matrix_free(y); ccv_matrix_free(x); ccv_matrix_free(loss); ccv_dense_matrix_t* db = ccv_dense_matrix_new(3, 3, CCV_32F | 64, 0, 0); @@ -741,6 +967,7 @@ TEST_CASE("convolutional network backward propagate") .rows = 31, .cols = 31, .channels = 3, + .partition = 1, }, }, .output = { @@ -751,10 +978,11 @@ .border = 2, .strides = 1, .count = 32, + .partition = 1, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), &params, 1); int i, j, k; for (i = 0; i < 5 * 5 * 3 * 32; i++) convnet->layers[0].w[i] = 2; @@ -770,7 +998,7 @@ TEST_CASE("convolutional network backward propagate") for (i = 0; i < 31 * 31 * 32; i++) loss->data.f32[i] = 1; ccv_dense_matrix_t* d = 0; - _ccv_convnet_convolutional_backward_propagate(convnet->layers, loss, y, 0, x, &d, update_params->layers); + _ccv_convnet_convolutional_backward_propagate(convnet->layers, loss, y, x, &d, update_params->layers); ccv_matrix_free(loss); ccv_matrix_free(y); ccv_matrix_free(x); @@ -801,6 +1029,246 @@ TEST_CASE("convolutional network backward propagate") ccv_convnet_free(convnet); } +TEST_CASE("convolutional network backward propagate with partitioned by 2") +{ + ccv_convnet_layer_param_t params = { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.0001, + .input = { + .matrix = { + .rows = 31, + .cols = 31, + .channels = 4, + .partition = 2, + }, + }, + .output = { + .convolutional = { + .rows = 5, + .cols = 5, + .channels = 4, + .border = 2, + .strides = 1, + .count = 8, + .partition = 2, + }, + }, + }; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), &params, 1); + int i, k; + for (i = 0; i < convnet->layers->wnum; i++) + convnet->layers->w[i] = i * 1e-2; + for (i = 0; i < convnet->layers->net.convolutional.count; i++) + convnet->layers->bias[i] = i; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(31, 31, CCV_32F | 4, 0, 0); + for (i = 0; i < 31 * 31 * 4; i++) + a->data.f32[i] = 2000 - i; + ccv_dense_matrix_t* b = 0; + ccv_convnet_encode(convnet, &a, &b, 1); + ccv_dense_matrix_t* loss = ccv_dense_matrix_new(b->rows, b->cols, CCV_32F | CCV_GET_CHANNEL(b->type), 0, 0); + for (i = 0; i < 31 * 31 * 8; i++) + loss->data.f32[i] = 1; + ccv_dense_matrix_t* d = 0; + ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); + _ccv_convnet_update_zero(update_params); + _ccv_convnet_convolutional_backward_propagate(convnet->layers, loss, b, a, &d, update_params->layers); + ccv_matrix_free(loss); + ccv_convnet_layer_param_t partitioned_params = { + .type = CCV_CONVNET_CONVOLUTIONAL, + .bias = 0, + .sigma = 0.0001, + .input = { + .matrix = { + .rows = 31, + .cols = 31, + .channels = 2, + .partition = 1, + }, + }, + .output = { + .convolutional = { + .rows = 5, + .cols = 5, + .channels = 2, + .border = 2, + .strides = 1, + .count = 4, + .partition = 1, + }, + }, + }; + ccv_convnet_t* partitioned_convnet = ccv_convnet_new(0, ccv_size(31, 31), &partitioned_params, 1); + ccv_dense_matrix_t* aa = ccv_dense_matrix_new(31, 31, CCV_32F | 2, 0, 0); + // first partition + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 2; k++) + aa->data.f32[i * 2 + k] = a->data.f32[i * 4 + k]; + memcpy(partitioned_convnet->layers->w, convnet->layers->w, sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(partitioned_convnet->layers->bias, convnet->layers->bias, sizeof(float) *
(convnet->layers->net.convolutional.count / 2)); + ccv_dense_matrix_t* bb = 0; + ccv_convnet_encode(partitioned_convnet, &aa, &bb, 1); + ccv_dense_matrix_t* bbb = ccv_dense_matrix_new(31, 31, CCV_32F | 8, 0, 0); + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 4; k++) + bbb->data.f32[i * 8 + k] = bb->data.f32[i * 4 + k]; + loss = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_GET_CHANNEL(bb->type), 0, 0); + for (i = 0; i < 31 * 31 * 4; i++) + loss->data.f32[i] = 1; + ccv_dense_matrix_t* dd = 0; + ccv_convnet_t* partitioned_update_params = _ccv_convnet_update_new(convnet); + _ccv_convnet_update_zero(partitioned_update_params); + _ccv_convnet_convolutional_backward_propagate(partitioned_convnet->layers, loss, bb, aa, &dd, partitioned_update_params->layers); + ccv_dense_matrix_t* ddd = ccv_dense_matrix_new(31, 31, CCV_32F | 4, 0, 0); + float* ww = (float*)ccmalloc(sizeof(float) * (convnet->layers->wnum + convnet->layers->net.convolutional.count)); + float* bbias = ww + convnet->layers->wnum; + memcpy(ww, partitioned_update_params->layers->w, sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(bbias, partitioned_update_params->layers->bias, sizeof(float) * (convnet->layers->net.convolutional.count / 2)); + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 2; k++) + ddd->data.f32[i * 4 + k] = dd->data.f32[i * 2 + k]; + // second partition + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 2; k++) + aa->data.f32[i * 2 + k] = a->data.f32[i * 4 + 2 + k]; + memcpy(partitioned_convnet->layers->w, convnet->layers->w + (convnet->layers->wnum / 2), sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(partitioned_convnet->layers->bias, convnet->layers->bias + (convnet->layers->net.convolutional.count / 2), sizeof(float) * (convnet->layers->net.convolutional.count / 2)); + ccv_convnet_compact(partitioned_convnet); // because it is reused, we need to clear intermediate data + ccv_convnet_encode(partitioned_convnet, &aa, &bb, 1); + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 4; k++) + bbb->data.f32[i * 8 + 4 + k] = bb->data.f32[i * 4 + k]; + REQUIRE_MATRIX_EQ(b, bbb, "forward pass doesn't match the expected value"); + _ccv_convnet_update_zero(partitioned_update_params); + _ccv_convnet_convolutional_backward_propagate(partitioned_convnet->layers, loss, bb, aa, &dd, partitioned_update_params->layers); + memcpy(ww + (convnet->layers->wnum / 2), partitioned_update_params->layers->w, sizeof(float) * (convnet->layers->wnum / 2)); + memcpy(bbias + (convnet->layers->net.convolutional.count / 2), partitioned_update_params->layers->bias, sizeof(float) * (convnet->layers->net.convolutional.count / 2)); + for (i = 0; i < 31 * 31; i++) + for (k = 0; k < 2; k++) + ddd->data.f32[i * 4 + 2 + k] = dd->data.f32[i * 2 + k]; + REQUIRE_MATRIX_EQ(d, ddd, "propagated error doesn't match the expected value"); + REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, ww, update_params->layers[0].w, convnet->layers->wnum, 1e-4, "weight gradient doesn't match the expected value"); + REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, bbias, update_params->layers[0].bias, convnet->layers->net.convolutional.count, 1e-4, "bias gradient doesn't match the expected value"); + ccfree(ww); + ccv_matrix_free(loss); + ccv_matrix_free(ddd); + ccv_matrix_free(dd); + ccv_matrix_free(bbb); + ccv_matrix_free(bb); + ccv_matrix_free(aa); + ccv_matrix_free(d); + ccv_matrix_free(b); + ccv_matrix_free(a); + ccv_convnet_free(convnet); + ccv_convnet_free(update_params); + ccv_convnet_free(partitioned_convnet); + ccv_convnet_free(partitioned_update_params); +} + +TEST_CASE("local 
response normalization backward propagate with partitioned by 2") +{ + ccv_convnet_layer_param_t params = { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 6, + .partition = 2, + }, + }, + .output = { + .rnorm = { + .size = 3, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(27, 27), &params, 1); + int i, k; + ccv_dense_matrix_t* a = ccv_dense_matrix_new(27, 27, CCV_32F | 6, 0, 0); + for (i = 0; i < 27 * 27 * 6; i++) + a->data.f32[i] = i; + ccv_dense_matrix_t* b = 0; + ccv_convnet_encode(convnet, &a, &b, 1); + ccv_dense_matrix_t* d = 0; + ccv_dense_matrix_t* loss = ccv_dense_matrix_new(27, 27, CCV_32F | 6, 0, 0); + for (i = 0; i < 27 * 27 * 6; i++) + loss->data.f32[i] = 1; + _ccv_convnet_rnorm_backward_propagate(convnet->layers, loss, b, a, convnet->denoms[0], &d); + ccv_convnet_layer_param_t partitioned_params = { + .type = CCV_CONVNET_LOCAL_RESPONSE_NORM, + .input = { + .matrix = { + .rows = 27, + .cols = 27, + .channels = 3, + .partition = 1, + }, + }, + .output = { + .rnorm = { + .size = 3, + .kappa = 2, + .alpha = 1e-4, + .beta = 0.75, + }, + }, + }; + ccv_convnet_t* partitioned_convnet = ccv_convnet_new(0, ccv_size(27, 27), &partitioned_params, 1); + ccv_dense_matrix_t* aa = ccv_dense_matrix_new(27, 27, CCV_32F | 3, 0, 0); + // first partition + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 3; k++) + aa->data.f32[i * 3 + k] = a->data.f32[i * 6 + k]; + ccv_dense_matrix_t* bb = 0; + ccv_convnet_encode(partitioned_convnet, &aa, &bb, 1); + ccv_matrix_free(loss); + loss = ccv_dense_matrix_new(27, 27, CCV_32F | 3, 0, 0); + for (i = 0; i < 27 * 27 * 3; i++) + loss->data.f32[i] = 1; + ccv_dense_matrix_t* dd = 0; + _ccv_convnet_rnorm_backward_propagate(partitioned_convnet->layers, loss, bb, aa, partitioned_convnet->denoms[0], &dd); + ccv_dense_matrix_t* ddd = ccv_dense_matrix_new(27, 27, CCV_32F | 6, 0, 0); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 3; k++) + ddd->data.f32[i * 6 + k] = dd->data.f32[i * 3 + k]; + // second partition + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 3; k++) + aa->data.f32[i * 3 + k] = a->data.f32[i * 6 + 3 + k]; + ccv_convnet_encode(partitioned_convnet, &aa, &bb, 1); + _ccv_convnet_rnorm_backward_propagate(partitioned_convnet->layers, loss, bb, aa, partitioned_convnet->denoms[0], &dd); + for (i = 0; i < 27 * 27; i++) + for (k = 0; k < 3; k++) + ddd->data.f32[i * 6 + 3 + k] = dd->data.f32[i * 3 + k]; + REQUIRE_MATRIX_EQ(d, ddd, "27x27x6 error local response normalization backward propagated from convnet with partition and partitioned convnet should be exactly the same"); + ccv_matrix_free(a); + ccv_matrix_free(b); + ccv_matrix_free(d); + ccv_matrix_free(aa); + ccv_matrix_free(bb); + ccv_matrix_free(dd); + ccv_matrix_free(ddd); + ccv_matrix_free(loss); + ccv_convnet_free(convnet); + ccv_convnet_free(partitioned_convnet); +} + +// five-point-stencil constants for numerical differentiation: f'(x) ~= (1 * f(x - 2h) - 8 * f(x - h) + 8 * f(x + h) - 1 * f(x + 2h)) / (12 * h) +static float fs[4] = { 1, -8, 8, -1 }; +static float fsh[4] = { -2, -1, 1, 2 }; + +static float dsfmt_genrand_gaussian(dsfmt_t* dsfmt, float sigma) +{ + double rand1 = dsfmt_genrand_open_close(dsfmt); + rand1 = -2 * log(rand1); + double rand2 = dsfmt_genrand_open_close(dsfmt) * CCV_PI * 2; + return (float)(sqrt(sigma * rand1) * cos(rand2)); +} + TEST_CASE("numerical gradient versus analytical gradient for full connect network") { ccv_convnet_layer_param_t params = { @@ -812,6 +1280,7 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ .rows = 3,
.cols = 3, .channels = 8, + .partition = 1, }, .node = { .count = 3 * 3 * 8, @@ -819,15 +1288,20 @@ }, .output = { .full_connect = { + .relu = 0, .count = 10, }, }, }; - ccv_convnet_t *convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t *convnet = ccv_convnet_new(0, ccv_size(3, 3), &params, 1); + dsfmt_t dsfmt; + dsfmt_init_gen_rand(&dsfmt, 0); + int i, j, k; + for (i = 0; i < convnet->layers->wnum; i++) + convnet->layers->w[i] = dsfmt_genrand_gaussian(&dsfmt, 0.01); ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); _ccv_convnet_update_zero(update_params); ccv_dense_matrix_t* x = ccv_dense_matrix_new(3, 3, CCV_32F | 8, 0, 0); - int i, j; for (i = 0; i < 3 * 3 * 8; i++) x->data.f32[i] = i; ccv_dense_matrix_t* y = 0; @@ -842,36 +1316,46 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ for (i = 0; i < 10; i++) for (j = 0; j < 3 * 3 * 8; j++) { - float w = convnet->layers->w[j + i * 3 * 3 * 8]; - convnet->layers->w[j + i * 3 * 3 * 8] += eps; + dw[j + i * 3 * 3 * 8] = 0; + for (k = 0; k < 4; k++) + { + float w = convnet->layers->w[j + i * 3 * 3 * 8]; + convnet->layers->w[j + i * 3 * 3 * 8] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dw[j + i * 3 * 3 * 8] += -logf(z->data.f32[2]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->w[j + i * 3 * 3 * 8] = w; + } + dw[j + i * 3 * 3 * 8] *= 1.0 / (12 * eps); + } + float* dbias = (float*)ccmalloc(sizeof(float) * 10); + for (i = 0; i < 10; i++) + { + dbias[i] = 0; + for (k = 0; k < 4; k++) + { + float bias = convnet->layers->bias[i]; + convnet->layers->bias[i] += fsh[k] * eps; ccv_dense_matrix_t* z = 0; ccv_convnet_encode(convnet, &x, &z, 1); _ccv_convnet_compute_softmax(z, &z, 0); - dw[j + i * 3 * 3 * 8] = ((-logf(z->data.f32[2])) - (-logf(y->data.f32[2]))) / eps; + dbias[i] += -logf(z->data.f32[2]) * fs[k]; ccv_matrix_free(z); - convnet->layers->w[j + i * 3 * 3 * 8] = w; + convnet->layers->bias[i] = bias; } - float* dbias = (float*)ccmalloc(sizeof(float) * 10); - for (i = 0; i < 10; i++) - { - float bias = convnet->layers->bias[i]; - convnet->layers->bias[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dbias[i] = ((-logf(z->data.f32[2])) - (-logf(y->data.f32[2]))) / eps; - ccv_matrix_free(z); - convnet->layers->bias[i] = bias; + dbias[i] *= 1.0 / (12 * eps); } ccv_dense_matrix_t* b = 0; - _ccv_convnet_full_connect_backward_propagate(convnet->layers, dloss, 0, x, &b, update_params->layers); + _ccv_convnet_full_connect_backward_propagate(convnet->layers, dloss, y, x, &b, update_params->layers); ccv_matrix_free(y); ccv_matrix_free(x); ccv_matrix_free(dloss); ccv_matrix_free(b); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw, update_params->layers[0].w, 3 * 3 * 8 * 10, 8 * 1e-2, "weight gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dw, update_params->layers[0].w, 3 * 3 * 8 * 10, 30, 2e-1, "weight gradient from analytical method doesn't match the one from numerical method"); ccfree(dw); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias, update_params->layers[0].bias, 10, 1e-2, "bias gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dbias, update_params->layers[0].bias, 10, 30, 2e-1, "bias gradient from
analytical method doesn't match the one from numerical method"); ccfree(dbias); ccv_convnet_free(update_params); ccv_convnet_free(convnet); @@ -888,6 +1372,7 @@ TEST_CASE("numerical gradient versus analytical gradient for convolutional netwo .rows = 31, .cols = 31, .channels = 3, + .partition = 1, }, }, .output = { @@ -897,12 +1382,17 @@ .rows = 5, .cols = 5, .channels = 3, .border = 2, .strides = 1, - .count = 2, + .count = 4, + .partition = 1, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); - int i; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), &params, 1); + dsfmt_t dsfmt; + dsfmt_init_gen_rand(&dsfmt, 1); + int i, k; + for (i = 0; i < convnet->layers->wnum; i++) + convnet->layers->w[i] = dsfmt_genrand_gaussian(&dsfmt, 0.0001); ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); _ccv_convnet_update_zero(update_params); ccv_dense_matrix_t* x = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C3, 0, 0); @@ -910,151 +1400,59 @@ TEST_CASE("numerical gradient versus analytical gradient for convolutional netwo x->data.f32[i] = i; ccv_dense_matrix_t* y = 0; ccv_convnet_encode(convnet, &x, &y, 1); - REQUIRE(y->rows == 31 && y->cols == 31 && CCV_GET_CHANNEL(y->type) == 2, "convnet should return a 31x31x2 matrix"); + REQUIRE(y->rows == 31 && y->cols == 31 && CCV_GET_CHANNEL(y->type) == 4, "convnet should return a 31x31x4 matrix"); ccv_dense_matrix_t* softmax = 0; _ccv_convnet_compute_softmax(y, &softmax, 0); ccv_dense_matrix_t* dloss = ccv_dense_matrix_new(y->rows, y->cols, CCV_32F | CCV_GET_CHANNEL(y->type), 0, 0); - for (i = 0; i < 31 * 31 * 2; i++) + for (i = 0; i < 31 * 31 * 4; i++) dloss->data.f32[i] = softmax->data.f32[i] - (i == 24); static const float eps = 0.000005; - float* dw = (float*)ccmalloc(sizeof(float) * 5 * 5 * 3 * 2); - for (i = 0; i < 5 * 5 * 3 * 2; i++) - { - float w = convnet->layers->w[i]; - convnet->layers->w[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dw[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / eps; - ccv_matrix_free(z); - convnet->layers->w[i] = w; - } - float* dbias = (float*)ccmalloc(sizeof(float) * 2); - for (i = 0; i < 2; i++) + float* dw = (float*)ccmalloc(sizeof(float) * 5 * 5 * 3 * 4); + for (i = 0; i < 5 * 5 * 3 * 4; i++) { - float bias = convnet->layers->bias[i]; - convnet->layers->bias[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dbias[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / eps; - ccv_matrix_free(z); - convnet->layers->bias[i] = bias; - } - ccv_dense_matrix_t* d = 0; - _ccv_convnet_convolutional_backward_propagate(convnet->layers, dloss, y, 0, x, &d, update_params->layers); - ccv_matrix_free(softmax); - ccv_matrix_free(dloss); - ccv_matrix_free(y); - ccv_matrix_free(x); - ccv_matrix_free(d); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw, update_params->layers[0].w, 5 * 5 * 3 * 2, 4.0, "weight gradient from analytical method doesn't match the one from numerical method"); - ccfree(dw); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias, update_params->layers[0].bias, 2, 5 * 1e-1, "bias gradient from analytical method doesn't match the one from numerical method"); - ccfree(dbias); - ccv_convnet_free(update_params); - ccv_convnet_free(convnet); -} - -TEST_CASE("numerical gradient versus analytical gradient for convolutional network over convolutional network") -{
- ccv_convnet_layer_param_t params[] = { + dw[i] = 0; + for (k = 0; k < 4; k++) { - .type = CCV_CONVNET_CONVOLUTIONAL, - .bias = 0, - .sigma = 0.001, - .input = { - .matrix = { - .rows = 31, - .cols = 31, - .channels = 2, - }, - }, - .output = { - .convolutional = { - .rows = 5, - .cols = 5, - .channels = 2, - .border = 2, - .strides = 1, - .count = 2, - }, - }, - }, - { - .type = CCV_CONVNET_CONVOLUTIONAL, - .bias = 0, - .sigma = 0.001, - .input = { - .matrix = { - .rows = 31, - .cols = 31, - .channels = 2, - }, - }, - .output = { - .convolutional = { - .rows = 5, - .cols = 5, - .channels = 2, - .border = 2, - .strides = 1, - .count = 2, - }, - }, - }, - }; - ccv_convnet_t* convnet = ccv_convnet_new(0, params, 2); - int i; - ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); - _ccv_convnet_update_zero(update_params); - ccv_dense_matrix_t* x = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C2, 0, 0); - for (i = 0; i < 31 * 31 * 2; i++) - x->data.f32[i] = i; - ccv_dense_matrix_t* y = 0; - ccv_convnet_encode(convnet, &x, &y, 1); - REQUIRE(y->rows == 31 && y->cols == 31 && CCV_GET_CHANNEL(y->type) == 2, "convnet should return a 31x31x2 matrix"); - ccv_dense_matrix_t* softmax = 0; - _ccv_convnet_compute_softmax(y, &softmax, 0); - ccv_dense_matrix_t* dloss = ccv_dense_matrix_new(y->rows, y->cols, CCV_32F | CCV_GET_CHANNEL(y->type), 0, 0); - for (i = 0; i < 31 * 31 * 2; i++) - dloss->data.f32[i] = softmax->data.f32[i] - (i == 24); - ccv_dense_matrix_t* d = 0; - _ccv_convnet_convolutional_backward_propagate(convnet->layers + 1, dloss, y, 0, convnet->acts[0], update_params->acts, update_params->layers + 1); - _ccv_convnet_convolutional_backward_propagate(convnet->layers, update_params->acts[0], convnet->acts[0], 0, x, &d, update_params->layers); - static const float eps = 0.00001; - float* dw = (float*)ccmalloc(sizeof(float) * 5 * 5 * 2 * 2); - for (i = 0; i < 5 * 5 * 2 * 2; i++) - { - float w = convnet->layers->w[i]; - convnet->layers->w[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dw[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / eps; - ccv_matrix_free(z); - convnet->layers->w[i] = w; + float w = convnet->layers->w[i]; + convnet->layers->w[i] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dw[i] += -logf(z->data.f32[24]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->w[i] = w; + } + dw[i] *= 1.0 / (12 * eps); } - float* dbias = (float*)ccmalloc(sizeof(float) * 2); - for (i = 0; i < 2; i++) + float* dbias = (float*)ccmalloc(sizeof(float) * 4); + for (i = 0; i < 4; i++) { - float bias = convnet->layers->bias[i]; - convnet->layers->bias[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dbias[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / eps; - ccv_matrix_free(z); - convnet->layers->bias[i] = bias; + dbias[i] = 0; + for (k = 0; k < 4; k++) + { + float bias = convnet->layers->bias[i]; + convnet->layers->bias[i] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dbias[i] += -logf(z->data.f32[24]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->bias[i] = bias; + } + dbias[i] *= 1.0 / (12 * eps); } + ccv_dense_matrix_t* d = 0; + 
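/* dw and dbias above hold the five-point-stencil numerical gradients; the analytical backward pass below must agree with them in direction and magnitude */ +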
_ccv_convnet_convolutional_backward_propagate(convnet->layers, dloss, y, x, &d, update_params->layers); ccv_matrix_free(softmax); ccv_matrix_free(dloss); ccv_matrix_free(y); ccv_matrix_free(x); ccv_matrix_free(d); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw, update_params->layers[0].w, 5 * 5 * 2 * 2, 5 * 1e-1, "weight gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dw, update_params->layers[0].w, 5 * 5 * 3 * 4, 30, 2e-1, "weight gradient from analytical method doesn't match the one from numerical method"); ccfree(dw); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias, update_params->layers[0].bias, 2, 5 * 1e-2, "bias gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dbias, update_params->layers[0].bias, 4, 30, 2e-1, "bias gradient from analytical method doesn't match the one from numerical method"); ccfree(dbias); ccv_convnet_free(update_params); ccv_convnet_free(convnet); @@ -1072,6 +1470,7 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ .rows = 5, .cols = 5, .channels = 2, + .partition = 1, }, }, .output = { @@ -1081,7 +1480,8 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ .channels = 2, .border = 1, .strides = 1, - .count = 2, + .count = 4, + .partition = 1, }, }, }, @@ -1093,26 +1493,34 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ .matrix = { .rows = 5, .cols = 5, - .channels = 2, + .channels = 4, + .partition = 1, }, .node = { - .count = 5 * 5 * 2, + .count = 5 * 5 * 4, }, }, .output = { .full_connect = { + .relu = 0, .count = 10, }, }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, params, 2); - int i; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(5, 5), params, 2); + dsfmt_t dsfmt; + dsfmt_init_gen_rand(&dsfmt, 2); + int i, k; + for (i = 0; i < convnet->layers[0].wnum; i++) + convnet->layers[0].w[i] = dsfmt_genrand_gaussian(&dsfmt, 0.001); + for (i = 0; i < convnet->layers[1].wnum; i++) + convnet->layers[1].w[i] = dsfmt_genrand_gaussian(&dsfmt, 0.01); ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); _ccv_convnet_update_zero(update_params); ccv_dense_matrix_t* x = ccv_dense_matrix_new(5, 5, CCV_32F | CCV_C2, 0, 0); for (i = 0; i < 5 * 5 * 2; i++) - x->data.f32[i] = 1; + x->data.f32[i] = 0.2; ccv_dense_matrix_t* y = 0; ccv_convnet_encode(convnet, &x, &y, 1); REQUIRE(y->rows == 10 && y->cols == 1 && CCV_GET_CHANNEL(y->type) == 1, "y should be a 10-dimensional vector"); @@ -1123,35 +1531,47 @@ TEST_CASE("numerical gradient versus analytical gradient for full connect networ _ccv_convnet_propagate_loss(convnet, x, dloss, update_params); ccv_matrix_free(dloss); static const float eps = 0.0001; - float* dw = (float*)ccmalloc(sizeof(float) * 3 * 3 * 2 * 2); - for (i = 0; i < 3 * 3 * 2 * 2; i++) + float* dw = (float*)ccmalloc(sizeof(float) * 3 * 3 * 2 * 4); + for (i = 0; i < 3 * 3 * 2 * 4; i++) { - float w = convnet->layers->w[i]; - convnet->layers->w[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dw[i] = ((-logf(z->data.f32[2])) - (-logf(y->data.f32[2]))) / eps; - ccv_matrix_free(z); - convnet->layers->w[i] = w; + dw[i] = 0; + for (k = 0; k < 4; k++) + { + float w = convnet->layers->w[i]; + convnet->layers->w[i] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + 
ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dw[i] += -logf(z->data.f32[2]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->w[i] = w; + } + dw[i] *= 1.0 / (12 * eps); } - float* dbias = (float*)ccmalloc(sizeof(float) * 2); - for (i = 0; i < 2; i++) + float* dbias = (float*)ccmalloc(sizeof(float) * 4); + for (i = 0; i < 4; i++) { - float bias = convnet->layers->bias[i]; - convnet->layers->bias[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dbias[i] = ((-logf(z->data.f32[2])) - (-logf(y->data.f32[2]))) / eps; - ccv_matrix_free(z); - convnet->layers->bias[i] = bias; + dbias[i] = 0; + for (k = 0; k < 4; k++) + { + float bias = convnet->layers->bias[i]; + convnet->layers->bias[i] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dbias[i] += -logf(z->data.f32[2]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->bias[i] = bias; + } + dbias[i] *= 1.0 / (12 * eps); } ccv_matrix_free(y); ccv_matrix_free(x); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw, update_params->layers[0].w, 3 * 3 * 2 * 2, 1e-2, "weight gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dw, update_params->layers[0].w, 3 * 3 * 2 * 4, 30, 2e-1, "weight gradient from analytical method doesn't match the one from numerical method"); ccfree(dw); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias, update_params->layers[0].bias, 2, 1e-2, "bias gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dbias, update_params->layers[0].bias, 4, 30, 2e-1, "bias gradient from analytical method doesn't match the one from numerical method"); ccfree(dbias); ccv_convnet_free(update_params); ccv_convnet_free(convnet); @@ -1169,6 +1589,7 @@ TEST_CASE("numerical gradient versus analytical gradient for local response norm .rows = 31, .cols = 31, .channels = 2, + .partition = 1, }, }, .output = { @@ -1178,7 +1599,8 @@ TEST_CASE("numerical gradient versus analytical gradient for local response norm .channels = 2, .border = 2, .strides = 1, - .count = 2, + .count = 4, + .partition = 1, }, }, }, @@ -1188,7 +1610,8 @@ TEST_CASE("numerical gradient versus analytical gradient for local response norm .matrix = { .rows = 31, .cols = 31, - .channels = 2, + .channels = 4, + .partition = 1, }, }, .output = { @@ -1201,8 +1624,12 @@ TEST_CASE("numerical gradient versus analytical gradient for local response norm }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, params, 2); - int i; + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), params, 2); + dsfmt_t dsfmt; + dsfmt_init_gen_rand(&dsfmt, 3); + int i, k; + for (i = 0; i < convnet->layers->wnum; i++) + convnet->layers->w[i] = dsfmt_genrand_gaussian(&dsfmt, 0.001); ccv_convnet_t* update_params = _ccv_convnet_update_new(convnet); _ccv_convnet_update_zero(update_params); ccv_dense_matrix_t* x = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C2, 0, 0); @@ -1210,49 +1637,61 @@ TEST_CASE("numerical gradient versus analytical gradient for local response norm x->data.f32[i] = i; ccv_dense_matrix_t* y = 0; ccv_convnet_encode(convnet, &x, &y, 1); - REQUIRE(y->rows == 31 && y->cols == 31 && CCV_GET_CHANNEL(y->type) == 2, "convnet should return a 31x31x2 matrix"); + REQUIRE(y->rows == 31 && y->cols == 31 && CCV_GET_CHANNEL(y->type) 
== 4, "convnet should return a 31x31x4 matrix"); ccv_dense_matrix_t* softmax = 0; _ccv_convnet_compute_softmax(y, &softmax, 0); ccv_dense_matrix_t* dloss = ccv_dense_matrix_new(y->rows, y->cols, CCV_32F | CCV_GET_CHANNEL(y->type), 0, 0); - for (i = 0; i < 31 * 31 * 2; i++) + for (i = 0; i < 31 * 31 * 4; i++) dloss->data.f32[i] = softmax->data.f32[i] - (i == 24); ccv_dense_matrix_t* d = 0; _ccv_convnet_rnorm_backward_propagate(convnet->layers + 1, dloss, y, convnet->acts[0], convnet->denoms[1], update_params->acts); - _ccv_convnet_convolutional_backward_propagate(convnet->layers, update_params->acts[0], convnet->acts[0], 0, x, &d, update_params->layers); + _ccv_convnet_convolutional_backward_propagate(convnet->layers, update_params->acts[0], convnet->acts[0], x, &d, update_params->layers); static const float eps = 0.000001; - float* dw = (float*)ccmalloc(sizeof(float) * 5 * 5 * 2 * 2); - for (i = 0; i < 5 * 5 * 2 * 2; i++) + float* dw = (float*)ccmalloc(sizeof(float) * 5 * 5 * 2 * 4); + for (i = 0; i < 5 * 5 * 2 * 4; i++) { - float w = convnet->layers->w[i]; - convnet->layers->w[i] += eps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dw[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / eps; - ccv_matrix_free(z); - convnet->layers->w[i] = w; + dw[i] = 0; + for (k = 0; k < 4; k++) + { + float w = convnet->layers->w[i]; + convnet->layers->w[i] += fsh[k] * eps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dw[i] += -logf(z->data.f32[24]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->w[i] = w; + } + dw[i] *= 1.0 / (12 * eps); } - float* dbias = (float*)ccmalloc(sizeof(float) * 2); - static const float beps = 0.00001; - for (i = 0; i < 2; i++) + float* dbias = (float*)ccmalloc(sizeof(float) * 4); + static const float beps = 0.0001; + for (i = 0; i < 4; i++) { - float bias = convnet->layers->bias[i]; - convnet->layers->bias[i] += beps; - ccv_dense_matrix_t* z = 0; - ccv_convnet_encode(convnet, &x, &z, 1); - _ccv_convnet_compute_softmax(z, &z, 0); - dbias[i] = ((-logf(z->data.f32[24])) - (-logf(softmax->data.f32[24]))) / beps; - ccv_matrix_free(z); - convnet->layers->bias[i] = bias; + dbias[i] = 0; + for (k = 0; k < 4; k++) + { + float bias = convnet->layers->bias[i]; + convnet->layers->bias[i] += fsh[k] * beps; + ccv_dense_matrix_t* z = 0; + ccv_convnet_compact(convnet); + ccv_convnet_encode(convnet, &x, &z, 1); + _ccv_convnet_compute_softmax(z, &z, 0); + dbias[i] += -logf(z->data.f32[24]) * fs[k]; + ccv_matrix_free(z); + convnet->layers->bias[i] = bias; + } + dbias[i] *= 1.0 / (12 * beps); } ccv_matrix_free(softmax); ccv_matrix_free(dloss); ccv_matrix_free(y); ccv_matrix_free(x); ccv_matrix_free(d); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dw, update_params->layers[0].w, 5 * 5 * 2 * 2, 2, "weight gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dw, update_params->layers[0].w, 5 * 5 * 2 * 4, 30, 2e-1, "weight gradient from analytical method doesn't match the one from numerical method"); ccfree(dw); - REQUIRE_ARRAY_EQ_WITH_TOLERANCE(float, dbias, update_params->layers[0].bias, 2, 5 * 1e-2, "bias gradient from analytical method doesn't match the one from numerical method"); + REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE(float, dbias, update_params->layers[0].bias, 4, 30, 2e-1, "bias gradient from analytical method doesn't match 
the one from numerical method"); ccfree(dbias); ccv_convnet_free(update_params); ccv_convnet_free(convnet); @@ -1267,6 +1706,7 @@ TEST_CASE("max pool network backward propagate") .rows = 31, .cols = 31, .channels = 2, + .partition = 1, }, }, .output = { @@ -1277,7 +1717,7 @@ TEST_CASE("max pool network backward propagate") }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), &params, 1); ccv_dense_matrix_t* x = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C2, 0, 0); int i, j, k; for (i = 0; i < 31 * 31 * 2; i++) @@ -1313,6 +1753,7 @@ TEST_CASE("average pool network backward propagate") .rows = 31, .cols = 31, .channels = 2, + .partition = 1, }, }, .output = { @@ -1323,7 +1764,7 @@ TEST_CASE("average pool network backward propagate") }, }, }; - ccv_convnet_t* convnet = ccv_convnet_new(0, &params, 1); + ccv_convnet_t* convnet = ccv_convnet_new(0, ccv_size(31, 31), &params, 1); ccv_dense_matrix_t* x = ccv_dense_matrix_new(31, 31, CCV_32F | CCV_C2, 0, 0); int i, j, k; for (i = 0; i < 31 * 31 * 2; i++) diff --git a/test/functional/makefile b/test/functional/makefile index cfa132952..aa7dbb9e1 100644 --- a/test/functional/makefile +++ b/test/functional/makefile @@ -3,13 +3,13 @@ include ../../lib/config.mk #CC +=# -fprofile-arcs -ftest-coverage LDFLAGS := -L"../../lib" -lccv $(LDFLAGS) CFLAGS := -O3 -Wall -I"../../lib" -I"../" $(CFLAGS) -TARGETS = algebra.tests util.tests numeric.tests basic.tests memory.tests io.tests transform.tests convnet.tests - -test: all - @for test in $(TARGETS) ; do ./"$$test" ; done +TARGETS = algebra.tests util.tests numeric.tests basic.tests memory.tests io.tests transform.tests convnet.tests 3rdparty.tests all: $(TARGETS) +test: all + @for test in $(TARGETS) ; do ./"$$test" || exit ; done + clean: ${MAKE} clean -C ../../lib ; rm -f *.o $(TARGETS) diff --git a/test/functional/memory.tests.c b/test/functional/memory.tests.c index 701caf58a..15a8beb45 100644 --- a/test/functional/memory.tests.c +++ b/test/functional/memory.tests.c @@ -75,7 +75,7 @@ TEST_CASE("garbage collector 95\% hit rate") { int i; // deliberately let only cache size fits 90% of data - ccv_enable_cache((sizeof(ccv_dense_matrix_t) + 4) * N * 90 / 100); + ccv_enable_cache((sizeof(ccv_dense_matrix_t) + 4) * N * 9 / 10); for (i = 0; i < N; i++) { ccv_dense_matrix_t* dmt = ccv_dense_matrix_new(1, 1, CCV_32S | CCV_C1, 0, 0); @@ -128,7 +128,7 @@ TEST_CASE("garbage collector 47\% hit rate") TEST_CASE("multi-type garbage collector 92\% hit rate") { int i; - ccv_enable_cache(((sizeof(ccv_dense_matrix_t) + 4) + (sizeof(ccv_array_t) + 4 * 4)) * N * 90 / 100); + ccv_enable_cache(((sizeof(ccv_dense_matrix_t) + 4) + (sizeof(ccv_array_t) + 4 * 4)) * N * 9 / 10); for (i = 0; i < N; i++) { ccv_dense_matrix_t* dmt = ccv_dense_matrix_new(1, 1, CCV_32S | CCV_C1, 0, 0); diff --git a/test/functional/numeric.tests.c b/test/functional/numeric.tests.c index 3ca03ebbb..ef18c2503 100644 --- a/test/functional/numeric.tests.c +++ b/test/functional/numeric.tests.c @@ -1,12 +1,48 @@ #include "ccv.h" #include "case.h" #include "ccv_case.h" +#include "3rdparty/dsfmt/dSFMT.h" /* numeric tests are more like functional tests rather than unit tests: * the following tests contain: - * 1. minimization of the famous rosenbrock function; - * 2. compute ssd with ccv_filter, and compare the result with naive method - * 3. compare the result from ccv_distance_transform (linear time) with reference implementation from voc-release4 (O(nlog(n))) */ + * 1.
compute eigenvectors / eigenvalues on a random symmetric matrix and verify these are eigenvectors / eigenvalues; + * 2. minimization of the famous rosenbrock function; + * 3. compute ssd with ccv_filter, and compare the result with naive method + * 4. compare the result from ccv_distance_transform (linear time) with reference implementation from voc-release4 (O(nlog(n))) */ + +TEST_CASE("compute eigenvectors and eigenvalues of a symmetric matrix") +{ + dsfmt_t dsfmt; + dsfmt_init_gen_rand(&dsfmt, 0xdead); + dsfmt_genrand_close_open(&dsfmt); + ccv_dense_matrix_t* a = ccv_dense_matrix_new(4, 4, CCV_64F | CCV_C1, 0, 0); + int i, j, k; + for (i = 0; i < 4; i++) + for (j = i; j < 4; j++) + a->data.f64[i * 4 + j] = dsfmt_genrand_close_open(&dsfmt) * 10; + for (i = 0; i < 4; i++) + for (j = 0; j < i; j++) + a->data.f64[i * 4 + j] = a->data.f64[j * 4 + i]; + ccv_dense_matrix_t* evec = 0; + ccv_dense_matrix_t* eval = 0; + ccv_eigen(a, &evec, &eval, 0, 1e-6); + for (k = 0; k < 4; k++) + { + double veca[4] = { + 0, 0, 0, 0, + }; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + veca[i] += a->data.f64[i * 4 + j] * evec->data.f64[k * 4 + j]; + double vece[4]; + for (i = 0; i < 4; i++) + vece[i] = eval->data.f64[k] * evec->data.f64[k * 4 + i]; + REQUIRE_ARRAY_EQ_WITH_TOLERANCE(double, veca, vece, 4, 1e-6, "verify %d(th) eigenvectors and eigenvalues with Ax = rx", k + 1); + } + ccv_matrix_free(a); + ccv_matrix_free(evec); + ccv_matrix_free(eval); +} int rosenbrock(const ccv_dense_matrix_t* x, double* f, ccv_dense_matrix_t* df, void* data) { diff --git a/test/makefile b/test/makefile index 38e4119e6..98bf891a9 100644 --- a/test/makefile +++ b/test/makefile @@ -1,5 +1,8 @@ -test: +all: ${MAKE} -C functional ; ${MAKE} -C regression +test: + ${MAKE} -C functional test && ${MAKE} -C regression test + clean: ${MAKE} clean -C ../lib ; ${MAKE} clean -C functional ; ${MAKE} clean -C regression diff --git a/test/regression/makefile b/test/regression/makefile index 1d9a92e9a..e83686cfc 100644 --- a/test/regression/makefile +++ b/test/regression/makefile @@ -4,11 +4,11 @@ LDFLAGS := -L"../../lib" -lccv $(LDFLAGS) CFLAGS := -O3 -Wall -I"../../lib" -I"../" $(CFLAGS) TARGETS = defects.l0.1.tests -test: all - @for test in $(TARGETS) ; do ./"$$test" ; done - all: $(TARGETS) +test: all + @for test in $(TARGETS) ; do ./"$$test" || exit ; done + clean: ${MAKE} clean -C ../../lib ; rm -f *.o $(TARGETS)
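A note on the gradient-check hunks above: the rewritten convnet tests replace the one-sided difference (f(w + eps) - f(w)) / eps with four forward passes weighted by fs[k] at offsets fsh[k] * eps and scaled by 1.0 / (12 * eps). The fsh/fs tables are defined outside the hunks shown in this diff; assuming they hold the standard five-point-stencil offsets {-2, -1, 1, 2} and weights {1, -8, 8, -1}, each loop computes f'(w) ~= (f(w - 2e) - 8 f(w - e) + 8 f(w + e) - f(w + 2e)) / (12 e), whose truncation error is O(e^4) rather than O(e). Below is a minimal, self-contained C sketch of that loop shape on a scalar function; the table values are assumptions, not quotes from this diff.

#include <stdio.h>

/* assumed five-point-stencil tables; the real fsh/fs live outside these hunks */
static const float fsh[] = { -2, -1, 1, 2 }; /* offsets, in multiples of eps */
static const float fs[] = { 1, -8, 8, -1 };  /* stencil weights */

/* stand-in objective; the tests use f(w) = -logf(softmax(...)[target]) */
static float f(float w)
{
	return w * w;
}

int main(void)
{
	static const float eps = 0.001;
	float w = 0.7;
	float dw = 0;
	int k;
	/* four perturbed evaluations, then the 1 / (12 * eps) scale, exactly
	 * mirroring the dw[i] / dbias[i] loops in the updated tests */
	for (k = 0; k < 4; k++)
		dw += f(w + fsh[k] * eps) * fs[k];
	dw *= 1.0 / (12 * eps);
	printf("numerical %f vs analytical %f\n", dw, 2 * w); /* f'(w) = 2w */
	return 0;
}

The ccv_convnet_compact call inside each perturbation loop presumably releases the convnet's cached intermediate state, so every perturbed forward pass is recomputed from scratch rather than served from stale activations.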
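The assertions change accordingly, from REQUIRE_ARRAY_EQ_WITH_TOLERANCE, which compares element by element, to REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE, which compares the two gradient arrays as whole vectors: the angle between them (at most 30 degrees here) and their relative magnitude (within 2e-1). The macro itself is defined in the test harness and is not shown in this diff; the following C sketch only illustrates that comparison under those assumptions, for non-zero vectors.

#include <math.h>
#include <stdio.h>

/* sketch of an angle-and-magnitude vector comparison; hypothetical helper,
 * not the actual REQUIRE_ARRAY_EQ_WITHIN_ANGLE_AND_MAGNITUDE definition */
static int within_angle_and_magnitude(const float* a, const float* b, int n, float angle_deg, float magnitude_tol)
{
	double dot = 0, na = 0, nb = 0;
	double cosine, angle, magnitude;
	int i;
	for (i = 0; i < n; i++)
	{
		dot += (double)a[i] * b[i];
		na += (double)a[i] * a[i];
		nb += (double)b[i] * b[i];
	}
	cosine = dot / (sqrt(na) * sqrt(nb)); /* assumes neither vector is zero */
	if (cosine > 1) cosine = 1;
	if (cosine < -1) cosine = -1;
	angle = acos(cosine) * 180 / 3.14159265358979323846;
	magnitude = fabs(sqrt(na) / sqrt(nb) - 1);
	return angle <= angle_deg && magnitude <= magnitude_tol;
}

int main(void)
{
	float a[] = { 1, 2, 3 };
	float b[] = { 1.1f, 1.9f, 3.2f };
	printf("%s\n", within_angle_and_magnitude(a, b, 3, 30, 2e-1) ? "close enough" : "diverged");
	return 0;
}

Comparing direction and overall scale is a natural fit for gradient checks: the five-point estimate and the analytical gradient agree to within stencil noise on each component, but a single per-element tolerance had to be set very loosely (4.0, 2, etc. in the deleted assertions) to absorb the largest component.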
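Finally, the new eigen test in numeric.tests.c verifies the defining identity for each returned pair: with A the random symmetric 4x4 matrix, eigenvector v_k (the k-th row of evec) and eigenvalue lambda_k must satisfy, componentwise within 1e-6,

\[
(A v_k)_i \;=\; \sum_{j=1}^{4} a_{ij}\,(v_k)_j \;=\; \lambda_k\,(v_k)_i, \qquad i = 1, \dots, 4,
\]

which is exactly the veca versus vece comparison in the test body.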