From 53d1fdb64495d166e815f5d83bb8622982930405 Mon Sep 17 00:00:00 2001
From: Anastassios Nanos <ananos@cslab.ece.ntua.gr>
Date: Mon, 13 Oct 2014 14:15:40 +0300
Subject: [PATCH 1/3] Add timer support.

---
 src/Makefile.am | 18 +++++++------
 src/common.h    |  1 +
 src/server.c    |  6 +++++
 src/timers.h    | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 8 deletions(-)
 create mode 100644 src/timers.h

diff --git a/src/Makefile.am b/src/Makefile.am
index 6c1c7a4..5028bbf 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,18 +1,19 @@
-bin_PROGRAMS = server libcudawrapper #test-cuda test-client
+bin_PROGRAMS = server libcudawrapper.so #test-cuda test-client
 
 
-AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include
+CYCLES_PER_SEC = `cat /proc/cpuinfo |grep cpu\ MHz | head -1 | cut -d\: -f2 | awk '{ print $$1 * 1000 }'`
+AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -DCYCLES_PER_SEC=$(CYCLES_PER_SEC)
 
 
 BUILT_SOURCES = @srcdir@/common.pb-c.c @srcdir@/common.pb-c.h
 
-server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h
+server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timers.h
 server_SOURCES += common.pb-c.c common.pb-c.h
 
-libcudawrapper_CFLAGS = -fPIC -shared
-libcudawrapper_CFLAGS +=  -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include
-libcudawrapper_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c
-libcudawrapper_SOURCES += common.pb-c.c common.pb-c.h
+libcudawrapper_so_CFLAGS = -fPIC -shared
+libcudawrapper_so_CFLAGS +=  -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include
+libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c
+libcudawrapper_so_SOURCES += common.pb-c.c common.pb-c.h
 
 common.pb-c.c: @srcdir@/common.proto
 	 $(PROTOC_C) --proto_path=@srcdir@ --c_out=. @srcdir@/common.proto
@@ -26,9 +27,10 @@ common.pb-c.h: @srcdir@/common.pb-c.c
 CLEANFILES = @builddir@/common.pb-c.c @builddir@/common.pb-c.h
 
 server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda
-libcudawrapper_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl
+libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl
 
 EXTRA_DIST = common.proto
 
 AM_CFLAGS += -I@builddir@
 
+
diff --git a/src/common.h b/src/common.h
index b666fe4..2ccf311 100644
--- a/src/common.h
+++ b/src/common.h
@@ -2,6 +2,7 @@
 #define COMMON_H
 
 #include <stdint.h>
+#include "timers.h"
 /*
 typedef struct cuda_cmd_s {
 	uint16_t type;
diff --git a/src/server.c b/src/server.c
index f2b09e5..70e4026 100644
--- a/src/server.c
+++ b/src/server.c
@@ -8,6 +8,7 @@
 #include <unistd.h>
 #include <netdb.h>
 
+#define TIMERS_ENABLED
 #include "common.h"
 #include "common.pb-c.h"
 #include "protocol.h"
@@ -71,6 +72,7 @@ int main(int argc, char *argv[]) {
 	void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL,
 		 *free_list=NULL, *busy_list=NULL, *client_list=NULL, *client_handle=NULL;
 	uint32_t msg_length;
+	timers_t tm;
 
 	if (argc > 2) {
 		printf("Usage: server <local_port>\n");
@@ -110,12 +112,14 @@ int main(int argc, char *argv[]) {
 		else
 			printf("from unidentified client");
 
+		TIMER_RESET(&tm);
 		for(;;) {
 			msg_length = receive_message(&msg, client_sock_fd);
 			if (msg_length > 0)
 				msg_type = decode_message(&dec_msg, &payload, msg, msg_length);
 
 			printf("Processing message\n");
+			TIMER_START(&tm);
 			switch (msg_type) {
 				case CUDA_CMD:
 					arg_cnt = process_cuda_cmd(&result, payload, free_list, busy_list, &client_list, &client_handle);
@@ -154,6 +158,7 @@ int main(int argc, char *argv[]) {
 				}
 			}
 			printf(">>\nMessage processed, cleaning up...\n<<\n");
+			TIMER_STOP(&tm);
 			if (msg != NULL) {
 				free(msg);
 				msg = NULL;
@@ -165,6 +170,7 @@ int main(int argc, char *argv[]) {
 				break;
 			}
 		}
+	printf("message needed %lf to be processed\n", TICKS_TO_USEC(TIMER_TOTAL(&tm)));
 	}
 	close(client_sock_fd);
 
diff --git a/src/timers.h b/src/timers.h
new file mode 100644
index 0000000..7d13f42
--- /dev/null
+++ b/src/timers.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) Anastassios Nanos 2012 (See AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details. You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#define _GNU_SOURCE
+
+#ifndef __gpusock_timers_h__
+#define __gpusock_timers_h__
+
+typedef struct timers {
+	unsigned long long total;
+	unsigned long long val;
+	unsigned long cnt;
+} timers_t;
+
+
+#ifdef TIMERS_ENABLED
+
+#include <asm/msr.h>
+//#include <linux/timex.h>
+
+unsigned long long get_cycles()
+{
+        struct timeval tv;
+	unsigned long long paparia;
+        gettimeofday(&tv,NULL);
+        paparia = tv.tv_sec * 1000000 + tv.tv_usec;
+	printf("paparia:%ld\n", paparia);
+	return paparia;
+}
+
+
+#define TIMER_START(tp)	do {( tp)->val = get_cycles(); } while (0)
+#define TIMER_STOP(tp)	do { (tp)->total += get_cycles() - (tp)->val; ++(tp)->cnt; } while (0)
+#define TIMER_RESET(tp)	do { (tp)->total = (tp)->val = 0; (tp)->cnt = 0; } while (0)
+#define TIMER_TOTAL(tp)	((tp)->total)
+#define TIMER_COUNT(tp)	((tp)->cnt)
+#define TIMER_AVG(tp)	((tp)->cnt ? ((tp)->total / (tp)->cnt) : -1)
+
+#define TICKS_TO_USEC(t)	(1000 * t/CYCLES_PER_SEC)
+
+#else
+
+#define TIMER_START(a)
+#define TIMER_STOP(a)
+#define TIMER_TOTAL(a) 0ULL
+#define TIMER_COUNT(a) 0UL
+#define TIMER_RESET(a)
+#define TICKS_TO_USEC(a) 0ULL
+
+#endif	/* TIMERS_ENABLED */
+
+#define var_name(x) #x
+#define gpusock_timer_reset(x) TIMER_RESET(x);
+#endif	/* __gpusock_timers_h__ */

From 06a8e0b598fc6e3ee9332af1c1c84996a1576c40 Mon Sep 17 00:00:00 2001
From: papazof <papazof@gmail.com>
Date: Mon, 10 Nov 2014 19:50:26 +0200
Subject: [PATCH 2/3] Use newer clock_gettime. Time different operations. Merge
 server-client_con branch.

---
 configure.ac         |  15 ++-
 src/Makefile.am      |  14 +--
 src/client.c         |  14 +++
 src/common.h         |   1 -
 src/libcudawrapper.c |   1 -
 src/process.c        | 287 +++++++++++++++++++++++++------------------
 src/process.h        |  35 ++++--
 src/protocol.c       |  76 +++++++++---
 src/protocol.h       |   5 +
 src/server.c         | 270 ++++++++++++++++++++++++++++++----------
 src/timer.h          |  57 +++++++++
 src/timers.h         |  67 ----------
 12 files changed, 552 insertions(+), 290 deletions(-)
 create mode 100644 src/timer.h
 delete mode 100644 src/timers.h

diff --git a/configure.ac b/configure.ac
index a3d28d8..949c967 100644
--- a/configure.ac
+++ b/configure.ac
@@ -16,9 +16,19 @@ AC_ARG_WITH([protobuf-c-libdir],
 AC_ARG_ENABLE([debug],
  [AS_HELP_STRING([--enable-debug], [Enable debug print messages. This is a debugging feature which should not be usually enabled])],
  [DEBUG_CFLAGS="-DGPUSOCK_DEBUG"])
-
 AC_SUBST(DEBUG_CFLAGS)
 
+AC_ARG_ENABLE([timers],
+ [AS_HELP_STRING([--enable-timers], [Enable performance timers])],
+ [timers=yes])
+
+AS_IF([test "x$timers" == xyes],
+	  [AC_CHECK_LIB([rt], [clock_gettime],
+					[AC_SUBST([TIMERS_LIBS], ["-lrt"])
+					 AC_SUBST([TIMERS_CFLAGS], ["-DTIMERS_ENABLED"])
+					])
+	  ])
+
 #AX_LIB_PROTOBUF_C([0.14])
 
 AC_CHECK_LIB([protobuf-c], [main], [], [AC_MSG_ERROR([cannot find protobuf library])])
@@ -35,6 +45,9 @@ AC_ARG_WITH([protoc-c],
     [PROTOC_C='protoc-c'])
 AC_SUBST([PROTOC_C])
 
+AC_CHECK_LIB(pthread, pthread_create, [PTHREAD_LIBS+=-lpthread])
+AC_SUBST([PTHREAD_LIBS])
+
 AC_PROG_CXX
 
 AM_SILENT_RULES([no])
diff --git a/src/Makefile.am b/src/Makefile.am
index 49c2244..831a8e7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,19 +1,17 @@
 bin_PROGRAMS = server libcudawrapper.so #test-cuda test-client
 
 
-CYCLES_PER_SEC = `cat /proc/cpuinfo |grep cpu\ MHz | head -1 | cut -d\: -f2 | awk '{ print $$1 * 1000 }'`
-
-AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -DCYCLES_PER_SEC=$(CYCLES_PER_SEC) $(DEBUG_CFLAGS)
+AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include $(TIMERS_CFLAGS) $(DEBUG_CFLAGS)
 
 
 BUILT_SOURCES = @srcdir@/common.pb-c.c @srcdir@/common.pb-c.h
 
-server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timers.h
+server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timer.h
 server_SOURCES += common.pb-c.c common.pb-c.h
 
-libcudawrapper_so_CFLAGS = -fPIC -shared $(DEBUG_CFLAGS)
+libcudawrapper_so_CFLAGS = -fPIC -shared $(TIMERS_CFLAGS) $(DEBUG_CFLAGS)
 libcudawrapper_so_CFLAGS +=  -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include
-libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c timers.h
+libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c
 libcudawrapper_so_SOURCES += common.pb-c.c common.pb-c.h
 
 common.pb-c.c: @srcdir@/common.proto
@@ -27,8 +25,8 @@ common.pb-c.h: @srcdir@/common.pb-c.c
 
 CLEANFILES = @builddir@/common.pb-c.c @builddir@/common.pb-c.h
 
-server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda
-libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl
+server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda $(PTHREAD_LIBS) $(TIMERS_LIBS)
+libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl $(TIMERS_LIBS)
 
 EXTRA_DIST = common.proto
 
diff --git a/src/client.c b/src/client.c
index 293e539..02f70df 100644
--- a/src/client.c
+++ b/src/client.c
@@ -14,6 +14,7 @@
 #include "common.pb-c.h"
 #include "protocol.h"
 #include "process.h"
+#include "timer.h"
 
 int init_client(const char *s_ip, const char *s_port, struct addrinfo *s_addr) {
 	int socket_fd, ret;
@@ -100,6 +101,9 @@ int64_t get_cuda_cmd_result(void **result, int sock_fd) {
 	void *buffer=NULL, *payload=NULL, *dec_msg=NULL;
 	int res_code;
 
+	gs_timer tm;
+	TIMER_RESET(&tm);
+	TIMER_START(&tm);
 	gdprintf("Waiting for response:\n");
 	msg_length = receive_message(&buffer, sock_fd);
 	if (msg_length > 0) {
@@ -131,6 +135,9 @@ int64_t get_cuda_cmd_result(void **result, int sock_fd) {
 	if (buffer != NULL)
 		free(buffer);
 
+	TIMER_STOP(&tm);
+	gdprintf("\nClient Receive: %lf\n", TIMER_TO_SEC(TIMER_TOTAL(&tm)));
+
 	return res_code;
 }
 
@@ -173,6 +180,10 @@ int send_cuda_cmd(int sock_fd, var **args, size_t arg_count, int type) {
 	void *buffer = NULL, *payload = NULL;
 	size_t buf_size;
 
+	gs_timer tm;
+	TIMER_RESET(&tm);
+	TIMER_START(&tm);
+
 	gdprintf("Sendind CUDA cmd...\n");
 	pack_cuda_cmd(&payload, args, arg_count, type);
 
@@ -184,6 +195,9 @@ int send_cuda_cmd(int sock_fd, var **args, size_t arg_count, int type) {
 
 	free(buffer);
 
+	TIMER_STOP(&tm);
+	gdprintf("\nClient Send: %lf\n", TIMER_TO_SEC(TIMER_TOTAL(&tm)));
+
 	return 0;
 }
 
diff --git a/src/common.h b/src/common.h
index 2ccf311..b666fe4 100644
--- a/src/common.h
+++ b/src/common.h
@@ -2,7 +2,6 @@
 #define COMMON_H
 
 #include <stdint.h>
-#include "timers.h"
 /*
 typedef struct cuda_cmd_s {
 	uint16_t type;
diff --git a/src/libcudawrapper.c b/src/libcudawrapper.c
index a100354..55613d3 100644
--- a/src/libcudawrapper.c
+++ b/src/libcudawrapper.c
@@ -13,7 +13,6 @@
 #include "common.pb-c.h"
 #include "client.h"
 
-
 //TODO: assert stored results' size fits...
 
 static params c_params;
diff --git a/src/process.c b/src/process.c
index 4d4785e..ae6eef4 100644
--- a/src/process.c
+++ b/src/process.c
@@ -6,12 +6,14 @@
 #include <sys/stat.h>
 #include <errno.h>
 #include <inttypes.h>
+#include <pthread.h>
 
 #include "process.h"
 #include "common.h"
 #include "common.pb-c.h"
 #include "cuda.h"
 #include "list.h"
+#include "timer.h"
 
 // cuGetErrorName() doesn't exist for CUDA < 6.0 ...
 #if defined(CUDA_VERSION) && CUDA_VERSION < 6000
@@ -26,22 +28,28 @@ CUresult cuGetErrorName(CUresult error, const char** pStr) {
 	*pStr = CUDA_RESULT_STRING[error];
 	if (*pStr == NULL)
 		return CUDA_ERROR_INVALID_VALUE;
-		
+
 	return CUDA_SUCCESS;
 }
 #endif
 
+static pthread_mutex_t client_mutex = PTHREAD_MUTEX_INITIALIZER;
+__thread gs_timer ps_i, ps_dg, ps_dgc, ps_dgca, ps_dgn, ps_dgna, ps_cc,
+				  ps_cca, ps_cd, ps_cda, ps_ml, ps_mla, ps_mgf, ps_mgfa,
+				  ps_ma, ps_maa, ps_mf, ps_mfa, ps_mhd, ps_mhda, ps_mdh,
+				  ps_mdha, ps_lk, ps_lka, ps_ext;
+
 #define cuda_err_print(res, ef) \
 	cuda_error_print(res, ef, __FILE__, __LINE__)
 
 inline CUresult cuda_error_print(CUresult result, int exit_flag, const char *file, const int line) {
 	const char *cuda_err_str = NULL;
-	
+
 	if (result != CUDA_SUCCESS) {
 		cuGetErrorName(result, &cuda_err_str);
 		fprintf(stderr, "-\nCUDA Driver API error: %04d - %s [%s, %i]\n-\n",
 				result, cuda_err_str, file, line);
-	
+
 		if (exit_flag != 0)
 			exit(EXIT_FAILURE);
 	}
@@ -55,13 +63,13 @@ size_t read_cuda_module_file(void **buffer, const char *filename) {
 	struct stat st;
 	void *buf = NULL;
 
-	gdprintf("Reading from file <%s> ... ", filename); 
+	gdprintf("Reading from file <%s> ... ", filename);
 	fd = fopen(filename, "rb");
 	if (fd == NULL) {
 		perror("fopen failed");
 		exit(EXIT_FAILURE);
 	}
-	
+
 
 	if (fstat(fileno(fd), &st) != 0) {
 		fprintf(stderr, "Reading file size failed: %s\n", strerror(errno));
@@ -85,7 +93,7 @@ size_t read_cuda_module_file(void **buffer, const char *filename) {
 		fprintf(stderr, "Reading file failed: read %zu vs %jd expected\n", b_read, st.st_size);
 		exit(EXIT_FAILURE);
 	}
-	
+
 	*buffer = buf;
 
 	fclose(fd);
@@ -96,7 +104,7 @@ size_t read_cuda_module_file(void **buffer, const char *filename) {
 
 void print_file_as_hex(uint8_t *file, size_t file_size) {
 	int i;
-	
+
 	gdprintf("File size: %zu\n", file_size);
 	for (i = 0; i < file_size; i++) {
 		if (i % 14 == 0)
@@ -112,7 +120,7 @@ void init_device_list(cuda_device_node **list) {
 	empty_list = malloc_safe(sizeof(cuda_device_node));
 
 	INIT_LIST_HEAD(&empty_list->node);
-	
+
 	*list = empty_list;
 }
 
@@ -122,7 +130,7 @@ void init_client_list(client_node **list) {
 	empty_list = malloc_safe(sizeof(client_node));
 
 	INIT_LIST_HEAD(&empty_list->node);
-	
+
 	*list = empty_list;
 }
 
@@ -154,9 +162,9 @@ int add_device_to_list(cuda_device_node *dev_list, int dev_id) {
 	cuda_device_node *cuda_dev_node;
 	char cuda_dev_name[CUDA_DEV_NAME_MAX];
 	CUdevice *cuda_device;
+	CUcontext *cuda_context;
 
 	cuda_device = malloc_safe(sizeof(CUdevice));
-
 	cuda_dev_node = malloc_safe(sizeof(*cuda_dev_node));
 
 	if (cuda_err_print(cuDeviceGet(cuda_device, dev_id), 0) != CUDA_SUCCESS)
@@ -165,11 +173,20 @@ int add_device_to_list(cuda_device_node *dev_list, int dev_id) {
 	if (cuda_err_print(cuDeviceGetName(cuda_dev_name, CUDA_DEV_NAME_MAX, *cuda_device), 0) != CUDA_SUCCESS)
 		return -1;
 
+	cuda_context = malloc_safe(sizeof(*cuda_context));
+	// Initializing per device context.
+	// FIXME(?): We are ignoring client flags...
+	if (cuda_err_print(cuCtxCreate(cuda_context, 0, *cuda_device), 0) != CUDA_SUCCESS)
+		return -1;
+
 	cuda_dev_node->cuda_device = cuda_device;
-	memcpy(cuda_dev_node->cuda_device_name, cuda_dev_name, CUDA_DEV_NAME_MAX);
+	strcpy(cuda_dev_node->cuda_device_name, cuda_dev_name);
+	cuda_dev_node->cuda_context = cuda_context;
 	cuda_dev_node->is_busy = 0;
+	cuda_dev_node->client_count = 0;
+
 
-	fprintf(stdout, "Adding device [%d]@%p -> %s\n", dev_id, cuda_dev_node->cuda_device, cuda_dev_node->cuda_device_name);
+	fprintf(stdout, "Adding device [%d]@%p -> %s, with context @%p\n", dev_id, cuda_dev_node->cuda_device, cuda_dev_node->cuda_device_name, cuda_dev_node->cuda_context);
 
 	list_add_tail(&cuda_dev_node->node, &dev_list->node);
 
@@ -190,7 +207,7 @@ int discover_cuda_devices(void **free_list, void **busy_list) {
 		exit(EXIT_FAILURE);
 	}
 	gdprintf("Available CUDA devices: %d\n", cuda_dev_count);
-	
+
 	// Init free and busy CUDA device lists
 	init_device_list(&free_cuda_devs);
 	init_device_list(&busy_cuda_devs);
@@ -225,7 +242,7 @@ void print_cuda_devices(void *free_list, void *busy_list) {
 	}
 }
 
-int add_client_to_list(void **client_handle, client_node *client_list) {
+int add_client_to_list(client_node **client_handle, client_node *client_list) {
 	client_node *new_node, *tmp;
 
 	new_node = malloc_safe(sizeof(*new_node));
@@ -236,21 +253,23 @@ int add_client_to_list(void **client_handle, client_node *client_list) {
 	} else {
 		new_node->id = 0;
 	}
+
 	new_node->dev_count = 0;
 	new_node->status = 1;
 	new_node->cuda_dev_node = NULL;
-	new_node->cuda_context = NULL;
 
 	gdprintf("Adding client <%d> to list\n", new_node->id);
 	list_add_tail(&new_node->node, &client_list->node);
 
 	*client_handle = new_node;
+
 	return 0;
 }
 
-int get_client_handle(void **client_handle, void **client_list, int client_id) {
+int get_client_handle(client_node **client_handle, client_node **client_list, int client_id) {
 	client_node *client_list_p = *client_list, *pos;
 
+	pthread_mutex_lock(&client_mutex);
 	if (client_list_p == NULL) {
 		init_client_list(&client_list_p);
 		*client_list = client_list_p;
@@ -258,39 +277,44 @@ int get_client_handle(void **client_handle, void **client_list, int client_id) {
 		// check if client exists in list
 		list_for_each_entry(pos, &client_list_p->node, node) {
 			if (pos->id == client_id) {
-				printf("Client <%d> is already in the list\n", client_id);
+				gdprintf("Client <%d> is already in the list\n", client_id);
 				*client_handle = pos;
 				return 0;
 			}
 		}
 	}
-	
+
 	add_client_to_list(client_handle, client_list_p);
-	
+	pthread_mutex_unlock(&client_mutex);
+
 	return 0;
 }
 
-int del_client_of_list(void *client_handle) {
+int del_client_of_list(client_node *client_handle) {
 	client_node *client = client_handle;
-	
+
+	pthread_mutex_lock(&client_mutex);
 	gdprintf("Deleting client <%d> from list\n", client->id);
 	list_del(&client->node);
+	pthread_mutex_unlock(&client_mutex);
 	free(client_handle);
 
 	return 0;
 }
 
-void print_clients(void *client_list) {
+void print_clients(client_node *client_list) {
 	client_node *client_list_p=client_list, *pos;
 	int i = 0;
 
+	pthread_mutex_lock(&client_mutex);
 	gdprintf("\nClients:\n");
 	list_for_each_entry(pos, &client_list_p->node, node) {
 		gdprintf("| [%d] <%d>\n", i++, pos->id);
 	}
+	pthread_mutex_unlock(&client_mutex);
 }
 
-unsigned int get_client_status(void *client_handle) {
+unsigned int get_client_status(client_node *client_handle) {
 	client_node *client = client_handle;
 
 	return (client_handle == NULL) ? 0 : client->status;
@@ -299,11 +323,11 @@ unsigned int get_client_status(void *client_handle) {
 uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation) {
 	uint32_t param_id = 0;
 	param_node *new_node, *tmp;
-	
+
 	if (*list == NULL)
 		init_param_list(list);
 	else {
-		// TODO: (?) generate unique random id
+		// TODO(?): generate unique random id
 		tmp = list_last_entry(&((*list)->node), param_node, node);
 		param_id = tmp->id + 1;
 	}
@@ -320,7 +344,7 @@ uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation)
 
 int find_param_by_id(param_node **param, param_node *list, uint32_t param_id) {
 	param_node *pos;
-	
+
 	list_for_each_entry(pos, &list->node, node) {
 		if (pos->id == param_id) {
 			*param = pos;
@@ -333,7 +357,7 @@ int find_param_by_id(param_node **param, param_node *list, uint32_t param_id) {
 
 int find_param_by_ptr(param_node **param, param_node *list, uint64_t param_ptr) {
 	param_node *pos;
-	
+
 	list_for_each_entry(pos, &list->node, node) {
 		if (pos->ptr == param_ptr) {
 			*param = pos;
@@ -351,11 +375,10 @@ int del_param_of_list(param_node *param) {
 	return 0;
 }
 
-int update_device_of_client(uintptr_t *dev_ptr, cuda_device_node *free_list, int dev_ordinal, client_node *client) {
+int update_devices_of_client(uintptr_t *dev_node_ptr, cuda_device_node *free_list, int dev_ordinal, client_node *client) {
 	cuda_device_node *tmp;
 	int i = 0, true_ordinal;
 
-	// TODO: support more than one devices per client.
 	gdprintf("Updating devices of client <%d>...\n", client->id);
 
 	tmp = list_first_entry_or_null(&free_list->node, cuda_device_node, node);
@@ -363,72 +386,67 @@ int update_device_of_client(uintptr_t *dev_ptr, cuda_device_node *free_list, int
 		fprintf(stderr, "No CUDA devices available for assignment\n");
 		return -1;
 	}
-	true_ordinal = dev_ordinal - client->dev_count;
-	while (i++ < true_ordinal) {
+	//true_ordinal = dev_ordinal - client->dev_count;
+	while (i++ < dev_ordinal) {
 		tmp = list_next_entry(tmp, node);
 		if (&tmp->node == &free_list->node) {
-			fprintf(stderr, "No CUDA devices available for assignment with the desired ordinal\n");
+			fprintf(stderr, "No CUDA devices available with the desired ordinal\n");
 			return -1;
 		}
 	}
-	
-	*dev_ptr = (uintptr_t) tmp->cuda_device;
+
+	*dev_node_ptr = (uintptr_t) tmp;
 	// TODO: What if client deviceGets the same ordinal twice?
 	add_param_to_list(&client->cuda_dev_node, (uintptr_t) tmp, NULL);
 
 	return 0;
 }
 
-int assign_device_to_client(uintptr_t dev_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) {
-	cuda_device_node *dev_node;
+int attach_device_to_client(uintptr_t dev_node_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) {
+	cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr;
 	param_node *pos;
-	CUdevice *cuda_device = (CUdevice *) dev_ptr;
 
-	gdprintf("Assigning device @%p to client <%d> ...\n", cuda_device, client->id);
+	pthread_mutex_lock(&client_mutex);
+	gdprintf("Attaching device @%p to client <%d> ...\n", dev_node->cuda_device, client->id);
 
 	list_for_each_entry(pos, &client->cuda_dev_node->node, node) {
-		dev_node = (cuda_device_node *) pos->ptr;
-		if (dev_node->cuda_device == cuda_device) {
-			if (dev_node->is_busy == 1) {
-				fprintf(stderr, "Requested CUDA device is busy\n");
-				return -2;
-			}
-			gdprintf("Moving device <%s>@%p to busy list\n",
-					dev_node->cuda_device_name, dev_node->cuda_device);
+		if ((cuda_device_node *) pos->ptr == dev_node) {
 			dev_node->is_busy = 1;
-			list_move_tail(&dev_node->node, &busy_list->node);
 			++client->dev_count;
+			++dev_node->client_count;
+			pthread_mutex_unlock(&client_mutex);
 
 			return 0;
-		}	
+		}
 	}
-	
-	fprintf(stderr, "Requested CUDA device not in client's list!\n");	
+	pthread_mutex_unlock(&client_mutex);
+
+	fprintf(stderr, "Requested CUDA device not in client's list!\n");
 	return -1;
 }
 
-int free_device_from_client(uintptr_t dev_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) {
-	cuda_device_node *dev_node;
+int detach_device_from_client(uintptr_t dev_node_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) {
+	cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr;
 	param_node *pos, *tmp;
-	CUdevice *cuda_device = (CUdevice *) dev_ptr;
-	
-	gdprintf("Freeing device @%p from client <%d>...\n", cuda_device, client->id);
+
+	pthread_mutex_lock(&client_mutex);
+	gdprintf("Detaching device @%p from client <%d>...\n", dev_node->cuda_device, client->id);
 
 	list_for_each_entry_safe(pos, tmp, &client->cuda_dev_node->node, node) {
-		dev_node = (cuda_device_node *) pos->ptr;
-		if (dev_node->cuda_device == cuda_device) {
-			gdprintf("Moving device <%s>@%p to free list\n",
-					dev_node->cuda_device_name, dev_node->cuda_device);
-			dev_node->is_busy = 0;
-			list_move_tail(&dev_node->node, &free_list->node);
+		if ((cuda_device_node *) pos->ptr == dev_node) {
 			del_param_of_list(pos);
 			--client->dev_count;
+			--dev_node->client_count;
+			if (dev_node->client_count == 0)
+				dev_node->is_busy = 0;
+			pthread_mutex_unlock(&client_mutex);
 
 			return 0;
-		}	
+		}
 	}
+	pthread_mutex_unlock(&client_mutex);
 
-	fprintf(stderr, "Requested CUDA device not in client's list!\n");	
+	fprintf(stderr, "Requested CUDA device not in client's list!\n");
 	return -1;
 }
 
@@ -437,8 +455,10 @@ int get_device_count_for_client(uint64_t *host_count) {
 	int count;
 
 	gdprintf("Getting CUDA device count...\n");
-	
+
+	TIMER_START(&ps_dgca);
 	res = cuda_err_print(cuDeviceGetCount(&count), 0);
+	TIMER_STOP(&ps_dgca);
 	if (res == CUDA_SUCCESS)
 		*host_count = count;
 
@@ -454,55 +474,37 @@ int get_device_name_for_client(void **host_name_ptr, size_t *host_name_size, int
 	*host_name_size = name_size;
 	*host_name_ptr = malloc_safe(name_size);
 
+	TIMER_START(&ps_dgna);
 	res = cuda_err_print(cuDeviceGetName(*host_name_ptr, name_size, *cuda_device), 0);
-	
-	printf("\n\n>>>>> %s\n\n", *host_name_ptr);
+	TIMER_STOP(&ps_dgna);
 
 	return res;
 }
 
 
-int create_context_of_client(uintptr_t *ctx_ptr, unsigned int flags, uintptr_t dev_ptr, client_node *client) {
-	CUcontext *cuda_context;
-	CUdevice *cuda_device = (CUdevice *) dev_ptr;
-	CUresult res = 0;
-
-	cuda_context = malloc_safe(sizeof(CUcontext));
-
-	// TODO: support more than one contexts per client.
-	gdprintf("Creating CUDA context of client <%d> ... ", client->id);
-
-	res = cuda_err_print(cuCtxCreate(cuda_context, flags, *cuda_device), 0);
+int get_context_of_client(uintptr_t *ctx_ptr, unsigned int flags, uintptr_t dev_node_ptr, client_node *client) {
+	cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr;
+	CUresult res;
 
-	if (res == CUDA_SUCCESS) {
-		*ctx_ptr = (uintptr_t) cuda_context;
-		add_param_to_list(&client->cuda_context, (uintptr_t) cuda_context, cuda_device);
-		gdprintf("created @%p ... Done\n", cuda_context);
-	} else {
-		gdprintf("failed ... Done\n");
-	}
+	gdprintf("Getting CUDA context @%p of client <%d> ...\n", dev_node->cuda_context, client->id);
+	TIMER_START(&ps_cca);
+	res = cuda_err_print(cuCtxSetCurrent(*dev_node->cuda_context), 0);
+	TIMER_STOP(&ps_cca);
+	*ctx_ptr = dev_node_ptr;
 
 	return res;
 }
 
-int destroy_context_of_client(uintptr_t *dev_ptr, uintptr_t ctx_ptr, client_node *client) {
-	CUresult res = 0;
-	CUcontext *cuda_context = (CUcontext *) ctx_ptr;
-	param_node *param = NULL;
+int put_context_of_client(uintptr_t *dev_node_ptr, uintptr_t ctx_ptr, client_node *client) {
+	CUresult res;
+	cuda_device_node *dev_node = (cuda_device_node *) ctx_ptr;
 
-	// TODO: free modules/functions allocated handles (?)	
-	gdprintf("Destroying CUDA context @%p of client <%d> ...\n", cuda_context, client->id);
-		
-	res = cuda_err_print(cuCtxDestroy(*cuda_context), 0);
-	
-	if (res == CUDA_SUCCESS) {
-		if (find_param_by_ptr(&param, client->cuda_context, ctx_ptr) != 0) {
-			fprintf(stderr, "Requested param not in given list!\n");
-		} else {
-			*dev_ptr = (uintptr_t) param->rel;
-			del_param_of_list(param);
-		}
-	}
+	// TODO: free modules/functions allocated handles (?)
+	gdprintf("Putting CUDA context @%p of client <%d> ...\n", dev_node->cuda_context, client->id);
+	TIMER_START(&ps_cda);
+	res = cuda_err_print(cuCtxSetCurrent(NULL), 0);
+	TIMER_STOP(&ps_cda);
+	*dev_node_ptr = ctx_ptr;
 
 	return res;
 }
@@ -511,12 +513,13 @@ int load_module_of_client(uintptr_t *mod_ptr, ProtobufCBinaryData *image, client
 	CUresult res;
 	CUmodule *cuda_module;
 
-	// TODO: support more than one modules per client.	
 	gdprintf("Loading CUDA module of client <%d> ... ", client->id);
 
 	cuda_module = malloc_safe(sizeof(*cuda_module));
 
+	TIMER_START(&ps_mla);
 	res = cuda_err_print(cuModuleLoadData(cuda_module, image->data), 0);
+	TIMER_STOP(&ps_mla);
 
 	if (res == CUDA_SUCCESS)
 		*mod_ptr = (uintptr_t) cuda_module;
@@ -529,12 +532,13 @@ int get_module_function_of_client(uintptr_t *fun_ptr, uintptr_t mod_ptr, char *f
 	CUfunction *cuda_func;
 	CUmodule *cuda_module = (CUmodule *) mod_ptr;
 
-	// TODO: support more than one functions per client.	
 	gdprintf("Loading CUDA module function of client <%d> ... ", client->id);
 
 	cuda_func = malloc_safe(sizeof(*cuda_func));
 
+	TIMER_START(&ps_mgfa);
 	res = cuda_err_print(cuModuleGetFunction(cuda_func, *cuda_module, func_name), 0);
+	TIMER_STOP(&ps_mgfa);
 
 	if (res == CUDA_SUCCESS)
 		*fun_ptr = (uintptr_t) cuda_func;
@@ -547,8 +551,10 @@ int memory_allocate_for_client(uintptr_t *dev_mem_ptr, size_t mem_size) {
 	CUdeviceptr cuda_dev_ptr;
 
 	gdprintf("Allocating CUDA device memory of size %zuB...\n", mem_size);
-	
+
+	TIMER_START(&ps_maa);
 	res = cuda_err_print(cuMemAlloc(&cuda_dev_ptr, mem_size), 0);
+	TIMER_STOP(&ps_maa);
 	if (res == CUDA_SUCCESS) {
 		*dev_mem_ptr = cuda_dev_ptr;
 		gdprintf("allocated @0x%llx\n", cuda_dev_ptr);
@@ -563,7 +569,9 @@ int memory_free_for_client(uintptr_t dev_mem_ptr) {
 
 	gdprintf("Freeing CUDA device memory @0x%llx...\n", cuda_dev_ptr);
 
+	TIMER_START(&ps_mfa);
 	res = cuda_err_print(cuMemFree(cuda_dev_ptr), 0);
+	TIMER_STOP(&ps_mfa);
 
 	return res;
 }
@@ -574,7 +582,9 @@ int memcpy_host_to_dev_for_client(uintptr_t dev_mem_ptr, void *host_mem_ptr, siz
 
 	gdprintf("Memcpying %zuB from host to CUDA device @0x%llx...\n", mem_size, cuda_dev_ptr);
 
+	TIMER_START(&ps_mhda);
 	res = cuda_err_print(cuMemcpyHtoD(cuda_dev_ptr, host_mem_ptr, mem_size), 0);
+	TIMER_STOP(&ps_mhda);
 
 	return res;
 }
@@ -588,7 +598,9 @@ int memcpy_dev_to_host_for_client(void **host_mem_ptr, size_t *host_mem_size, ui
 	*host_mem_size = mem_size;
 	*host_mem_ptr = malloc_safe(mem_size);
 
+	TIMER_START(&ps_mdha);
 	res = cuda_err_print(cuMemcpyDtoH(*host_mem_ptr, cuda_dev_ptr, mem_size), 0);
+	TIMER_STOP(&ps_mdha);
 
 	return res;
 }
@@ -606,12 +618,12 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData
 	gdprintf("Executing kernel...\n");
 	if (n_params > 0) {
 		params = malloc_safe(sizeof(void *) * n_params);
-	
+
 		for(i = 0; i < n_params; i++) {
 			params[i] = (void *) uints[9 + i];
 		}
 		gdprintf("using <params>\n");
-	}	
+	}
 	if (n_extras > 0) {
 		extra = malloc_safe(sizeof(void *) * 5);
 
@@ -624,10 +636,12 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData
 	}
 	gdprintf("with grid (x, y, z) = (%u, %u, %u)\n", grid_x, grid_y, grid_z);
 	gdprintf("and block (x, y, z) = (%u, %u, %u)\n", block_x, block_y, block_z);
-	
+
+	TIMER_START(&ps_lka);
 	res = cuda_err_print(cuLaunchKernel(*func, grid_x, grid_y, grid_z,
 				block_x, block_y, block_z, shared_mem_size, h_stream,
 				params, extra), 0);
+	TIMER_STOP(&ps_lka);
 
 	if (params != NULL)
 		free(params);
@@ -638,7 +652,7 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData
 	return res;
 }
 
-int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, void **client_list, void **client_handle) {
+int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, client_node **client_list, client_node **client_handle) {
 	int cuda_result = 0, arg_count = 0;
 	CudaCmd *cmd = cmd_ptr;
 	uint64_t uint_res = 0, tmp_ptr = 0;
@@ -655,93 +669,121 @@ int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_l
 	gdprintf("Processing CUDA_CMD\n");
 	switch(cmd->type) {
 		case INIT:
+			TIMER_START(&ps_i);
 			gdprintf("Executing cuInit...\n");
 			get_client_handle(client_handle, client_list, cmd->int_args[0]);
-			uint_res = ((client_node *) *client_handle)->id;
-			// cuInit() should have already been executed by the server 
+			uint_res = (*client_handle)->id;
+			// cuInit() should have already been executed by the server
 			// by that point...
 			//cuda_result = cuda_err_print(cuInit(cmd->uint_args[0]), 0);
 			cuda_result = CUDA_SUCCESS;
 			res_type = UINT;
+			TIMER_STOP(&ps_i);
 			break;
 		case DEVICE_GET:
+			TIMER_START(&ps_dg);
 			gdprintf("Executing cuDeviceGet...\n");
-			if (update_device_of_client(&uint_res, free_list, cmd->int_args[0], *client_handle) < 0)
+			if (update_devices_of_client(&uint_res, free_list, cmd->int_args[0], *client_handle) < 0)
 				cuda_result = CUDA_ERROR_INVALID_DEVICE;
 			else
 				cuda_result = CUDA_SUCCESS;
 
 			res_type = UINT;
+			TIMER_STOP(&ps_dg);
 			break;
 		case DEVICE_GET_COUNT:
+			TIMER_START(&ps_dgc);
 			gdprintf("Executing cuDeviceGetCount...\n");
 			cuda_result = get_device_count_for_client(&uint_res);
 			res_type = UINT;
+			TIMER_STOP(&ps_dgc);
 			break;
 		case DEVICE_GET_NAME:
+			TIMER_START(&ps_dgn);
 			gdprintf("Executing cuDeviceGetName...\n");
 			cuda_result = get_device_name_for_client(&extra_args, &extra_args_size, cmd->int_args[0], cmd->uint_args[0]);
+			TIMER_STOP(&ps_dgn);
 			break;
 		case CONTEXT_CREATE:
+			TIMER_START(&ps_cc);
 			gdprintf("Executing cuCtxCreate...\n");
-			cuda_result = assign_device_to_client(cmd->uint_args[1], free_list, busy_list, *client_handle);
-			if (cuda_result	< 0)
+			cuda_result = attach_device_to_client(cmd->uint_args[1], free_list, busy_list, *client_handle);
+			if (cuda_result	< 0) {
+				TIMER_STOP(&ps_cc);
 				break; // Handle appropriately in client.
+			}
 
-			cuda_result = create_context_of_client(&uint_res, cmd->uint_args[0], cmd->uint_args[1], *client_handle);
+			cuda_result = get_context_of_client(&uint_res, cmd->uint_args[0], cmd->uint_args[1], *client_handle);
 			res_type = UINT;
+			TIMER_STOP(&ps_cc);
 			break;
 		case CONTEXT_DESTROY:
+			TIMER_START(&ps_cd);
 			gdprintf("Executing cuCtxDestroy...\n");
 			// We assume that only one context per device is created
-			cuda_result = destroy_context_of_client(&tmp_ptr, cmd->uint_args[0], *client_handle);
+			cuda_result = put_context_of_client(&tmp_ptr, cmd->uint_args[0], *client_handle);
 			if (cuda_result == CUDA_SUCCESS) {
-				free_device_from_client(tmp_ptr, free_list, busy_list, *client_handle);
+				detach_device_from_client(tmp_ptr, free_list, busy_list, *client_handle);
 				if (cmd->n_uint_args > 1 && cmd->uint_args[1] == 1) {
 					del_client_of_list(*client_handle);
 					*client_handle = NULL;
-					//((client_node *) *client_handle)->status = 0;
 				}
 			}
+			TIMER_STOP(&ps_cd);
 			break;
 		case MODULE_LOAD:
+			TIMER_START(&ps_ml);
 			gdprintf("Executing cuModuleLoad...\n");
 			//print_file_as_hex(cmd->extra_args[0].data, cmd->extra_args[0].len);
 			cuda_result = load_module_of_client(&uint_res, &(cmd->extra_args[0]), *client_handle);
 			res_type = UINT;
+			TIMER_STOP(&ps_ml);
 			break;
 		case MODULE_GET_FUNCTION:
+			TIMER_START(&ps_mgf);
 			gdprintf("Executing cuModuleGetFuction...\n");
 			cuda_result = get_module_function_of_client(&uint_res, cmd->uint_args[0], cmd->str_args[0], *client_handle);
 			res_type = UINT;
+			TIMER_STOP(&ps_mgf);
 			break;
 		case MEMORY_ALLOCATE:
+			TIMER_START(&ps_ma);
 			gdprintf("Executing cuMemAlloc...\n");
 			cuda_result = memory_allocate_for_client(&uint_res, cmd->uint_args[0]);
 			res_type = UINT;
+			TIMER_STOP(&ps_ma);
 			break;
 		case MEMORY_FREE:
+			TIMER_START(&ps_mf);
 			gdprintf("Executing cuMemFree...\n");
 			cuda_result = memory_free_for_client(cmd->uint_args[0]);
+			TIMER_STOP(&ps_mf);
 			break;
 		case MEMCPY_HOST_TO_DEV:
+			TIMER_START(&ps_mhd);
 			gdprintf("Executing cuMemcpyHtoD...\n");
 			cuda_result = memcpy_host_to_dev_for_client(cmd->uint_args[0], cmd->extra_args[0].data, cmd->extra_args[0].len);
+			TIMER_STOP(&ps_mhd);
 			break;
 		case MEMCPY_DEV_TO_HOST:
+			TIMER_START(&ps_mdh);
 			gdprintf("Executing cuMemcpyDtoH...\n");
 			cuda_result = memcpy_dev_to_host_for_client(&extra_args, &extra_args_size, cmd->uint_args[0], cmd->uint_args[1]);
+			TIMER_STOP(&ps_mdh);
 			break;
 		case LAUNCH_KERNEL:
+			TIMER_START(&ps_lk);
 			gdprintf("Executing cuLaunchKernel...\n");
 			cuda_result = launch_kernel_of_client(cmd->uint_args, cmd->n_uint_args, cmd->extra_args, cmd->n_extra_args);
+			TIMER_STOP(&ps_lk);
 			break;
 	}
 
+	TIMER_START(&ps_ext);
 	if (res_type == UINT) {
 		res_length = sizeof(uint64_t);
 		res_data = &uint_res;
-	} else if (extra_args_size != 0) {	
+	} else if (extra_args_size != 0) {
 		res_type = BYTES;
 		res_length = extra_args_size;
 		res_data = extra_args;
@@ -772,6 +814,7 @@ int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_l
 
 	if (extra_args != NULL)
 		free(extra_args);
+	TIMER_STOP(&ps_ext);
 
 	return arg_count;
 }
@@ -790,7 +833,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) {
 		cuda_dev_count++;
 	}
 	gdprintf("Available CUDA devices: %d\n", cuda_dev_count);
-	
+
 	// Init variables
 	cuda_devs = malloc_safe(sizeof(CudaDeviceList));
 	cuda_device_list__init(cuda_devs);
@@ -809,7 +852,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) {
 		i++;
 	}
 	cuda_devs->devices_free = i;
-	
+
 	// busy
 	i = 0;
 	list_for_each_entry(pos, &busy_list_p->node, node){
@@ -820,7 +863,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) {
 		cuda_devs_dev[i]->name = pos->cuda_device_name;
 		i++;
 	}
-	
+
 	cuda_devs->n_device = cuda_dev_count;
 	cuda_devs->device = cuda_devs_dev;
 	*result = cuda_devs;
@@ -840,7 +883,7 @@ int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type) {
 	cmd->type = type;
 	cmd->arg_count = arg_count;
 
-	for (i = 0; i < arg_count; i++) {	
+	for (i = 0; i < arg_count; i++) {
 		switch (args[i]->type) {
 			case INT:
 				cmd->n_int_args = args[i]->elements;
diff --git a/src/process.h b/src/process.h
index 601ed05..d6e8d05 100644
--- a/src/process.h
+++ b/src/process.h
@@ -4,15 +4,9 @@
 #include <cuda.h>
 #include "list.h"
 #include "common.h"
+#include "timer.h"
 
 #define CUDA_DEV_NAME_MAX 100
-typedef struct cuda_device_node_s {
-	CUdevice *cuda_device;
-	struct list_head node;
-	char cuda_device_name[CUDA_DEV_NAME_MAX];
-	int is_busy;
-} cuda_device_node;
-
 typedef struct param_node_s {
 	struct list_head node;
 	uint32_t id;
@@ -20,13 +14,21 @@ typedef struct param_node_s {
 	void *rel;
 } param_node;
 
+typedef struct cuda_device_node_s {
+	CUdevice *cuda_device;
+	struct list_head node;
+	char cuda_device_name[CUDA_DEV_NAME_MAX];
+	CUcontext *cuda_context;
+	unsigned int client_count;
+	int is_busy;
+} cuda_device_node;
+
 typedef struct client_node_s {
 	int id;
-	int dev_count;
+	unsigned int dev_count;
 	unsigned int status;
 	struct list_head node;
 	param_node *cuda_dev_node;
-	param_node *cuda_context;
 } client_node;
 
 
@@ -36,9 +38,9 @@ int discover_cuda_devices(void **free_list, void **busy_list);
 
 void print_cuda_devices(void *free_list, void *busy_list);
 
-void print_clients(void *client_list); 
+void print_clients(client_node *client_list);
 
-unsigned int get_client_status(void *client_handle);
+unsigned int get_client_status(client_node *client_handle);
 
 uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation);
 
@@ -46,12 +48,19 @@ int find_param_by_id(param_node **param, param_node *list, uint32_t param_id);
 
 int del_param_of_list(param_node *param);
 
-int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, void **client_list, void **client_handle);
+int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, client_node **client_list, client_node **client_handle);
 
 int process_cuda_device_query(void **result, void *free_list, void *busy_list);
 
 void free_cdn_list(void *list);
 
-int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type); 
+int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type);
+
+
+extern __thread gs_timer ps_i, ps_dg, ps_dgc, ps_dgca, ps_dgn, ps_dgna,
+						 ps_cc, ps_cca, ps_cd, ps_cda, ps_ml, ps_mla,
+						 ps_mgf, ps_mgfa, ps_ma, ps_maa, ps_mf, ps_mfa,
+						 ps_mhd, ps_mhda, ps_mdh, ps_mdha, ps_lk, ps_lka,
+						 ps_ext;
 
 #endif /* PROCESS_H */
diff --git a/src/protocol.c b/src/protocol.c
index b794ea0..78c3517 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -3,6 +3,9 @@
 #include <string.h>
 #include <arpa/inet.h>
 #include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+
 #include "protocol.h"
 #include "common.h"
 #include "common.pb-c.h"
@@ -21,7 +24,7 @@ ssize_t read_socket_msg(int fd, void *buffer, size_t bytes) {
 		exit(EXIT_FAILURE);
 	}
 	//} while (b_total < bytes);
-	
+
 	//printf("Bytes received: %zd\n", b_total);
 	iovlen = msghdr.msg_iovlen;
 	while (iovlen--) {
@@ -35,18 +38,51 @@ ssize_t read_socket_msg(int fd, void *buffer, size_t bytes) {
 }
 #endif
 
+#include "timer.h"
+__thread gs_timer pl_rds, pl_rdm, pl_rd1, pl_rd2, pl_m, pl_re;
+__thread unsigned long long pl_rd1a, pl_rd2a;
+static __thread gs_timer *rdt;
+static __thread unsigned long long *rda;
+
 ssize_t read_socket(int fd, void *buffer, size_t bytes) {
 	ssize_t	b_read, b_total = 0;
+	int flags, err;
+	unsigned long long tmp = TIMER_TOTAL(rdt);
 
+	flags = fcntl(fd, F_GETFL, 0);
+	if (flags < 0) {
+		perror("read socket fcntl failed");
+		exit(EXIT_FAILURE);
+	}
+	if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+		perror("read socket fcntl failed");
+		exit(EXIT_FAILURE);
+	}
+
+	TIMER_START(rdt);
 	do {
 		b_read = read(fd, buffer+b_total, bytes-b_total);
 		if (b_read < 0) {
-			perror("read socket failed");
-			exit(EXIT_FAILURE);
+			if (errno == EAGAIN || errno == EWOULDBLOCK) {
+				TIMER_GET(rdt);
+				tmp = TIMER_TOTAL(rdt);
+			} else {
+				perror("read socket failed");
+				exit(EXIT_FAILURE);
+			}
+		} else {
+			b_total += b_read;
 		}
-		b_total += b_read;
 	} while (b_total < bytes);
-	
+	TIMER_STOP(rdt);
+
+	*rda += TIMER_TOTAL(rdt) - tmp;
+
+	if (fcntl(fd, F_SETFL, flags) < 0) {
+		perror("read socket fcntl failed");
+		exit(EXIT_FAILURE);
+	}
+
 	gdprintf("Bytes received: %zd\n", b_total);
 	return b_total;
 }
@@ -63,7 +99,7 @@ ssize_t write_socket(int fd, void *buffer, size_t bytes) {
 		}
 		b_total += b_written;
 	} while (b_total < bytes);
-	
+
 	gdprintf("Bytes sent: %zd\n", b_total);
 	return b_total;
 }
@@ -78,19 +114,31 @@ uint32_t receive_message(void **enc_msg, int sock_fd) {
 	uint32_t msg_length;
 	int ret = 0;
 
+	TIMER_START(&pl_m);
 	buffer = malloc_safe(sizeof(uint32_t));
-	
+	TIMER_STOP(&pl_m);
+
+	rdt = &pl_rd1;
+	rda = &pl_rd1a;
+	TIMER_START(&pl_rds);
 	// read message length
 	read_socket(sock_fd, buffer, sizeof(uint32_t));
+	TIMER_STOP(&pl_rds);
 
 	msg_length = ntohl(*(uint32_t *)buffer);
 	gdprintf("Going to read a message of %u bytes...\n", msg_length);
-	
+
+	TIMER_START(&pl_re);
 	buffer = realloc(buffer, msg_length);
-	
+	TIMER_STOP(&pl_re);
+
+	rdt = &pl_rd2;
+	rda = &pl_rd2a;
+	TIMER_START(&pl_rdm);
 	// read message
 	read_socket(sock_fd, buffer, msg_length);
-	
+	TIMER_STOP(&pl_rdm);
+
 	*enc_msg = buffer;
 
 	return msg_length;
@@ -105,7 +153,7 @@ int decode_message(void **result, void **payload, void *enc_msg, uint32_t enc_ms
 		fprintf(stderr, "message unpacking failed\n");
 		return -1;
 	}
-	
+
 	switch (msg->type) {
 		case CUDA_CMD:
 			gdprintf("--------------\nIs CUDA_CMD\n");
@@ -124,11 +172,11 @@ int decode_message(void **result, void **payload, void *enc_msg, uint32_t enc_ms
 			*payload = msg->cuda_devices;
 			break;
 	}
-	
+
 	// We can't call this here unless we make a *deep* copy of the
 	// message payload...
 	//cookie__free_unpacked(msg, NULL);
-	*result = msg; 
+	*result = msg;
 
 	return msg->type;
 }
@@ -137,7 +185,7 @@ size_t encode_message(void **result, int msg_type, void *payload) {
 	size_t buf_size;
 	uint32_t msg_length, msg_len_n;
 	Cookie message = COOKIE__INIT;
-	void *buffer, *msg_buffer; 
+	void *buffer, *msg_buffer;
 
 	gdprintf("Encoding message data...\n");
 	message.type = msg_type;
diff --git a/src/protocol.h b/src/protocol.h
index 2fe88c4..3345075 100644
--- a/src/protocol.h
+++ b/src/protocol.h
@@ -1,6 +1,8 @@
 #ifndef PROTOCOL_H
 #define PROTOCOL_H
 
+#include "timer.h"
+
 ssize_t read_socket(int fd, void *buffer, size_t bytes);
 
 ssize_t write_socket(int fd, void *buffer, size_t bytes);
@@ -15,4 +17,7 @@ size_t encode_message(void **result, int msg_type, void *payload);
 
 void free_decoded_message(void *msg);
 
+extern __thread gs_timer pl_rds, pl_rdm, pl_rd1, pl_rd2, pl_m, pl_re;
+extern __thread unsigned long long pl_rd1a, pl_rd2a;
+
 #endif /* PROTOCOL_H */
diff --git a/src/server.c b/src/server.c
index 70e4026..5248e92 100644
--- a/src/server.c
+++ b/src/server.c
@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <pthread.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -8,11 +9,208 @@
 #include <unistd.h>
 #include <netdb.h>
 
-#define TIMERS_ENABLED
 #include "common.h"
 #include "common.pb-c.h"
 #include "protocol.h"
 #include "process.h"
+#include "timer.h"
+
+void *free_list=NULL, *busy_list=NULL;
+client_node *client_list=NULL;
+
+void *connection_handler(void *socket_desc) {
+	int client_sock_fd = *(int *) socket_desc, client_id = -1;
+	int msg_type, resp_type, arg_cnt;
+	void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL;
+	client_node *client_handle=NULL;
+	uint32_t msg_length;
+	gs_timer mt, rt, dt, pt;
+
+	pl_rd1a = 0;
+	pl_rd2a = 0;
+	TIMER_RESET(&mt);
+	TIMER_RESET(&rt);
+	TIMER_RESET(&dt);
+	TIMER_RESET(&pt);
+	TIMER_RESET(&pl_rds);
+	TIMER_RESET(&pl_rdm);
+	TIMER_RESET(&pl_rd1);
+	TIMER_RESET(&pl_rd2);
+	TIMER_RESET(&pl_m);
+	TIMER_RESET(&pl_re);
+	TIMER_RESET(&ps_i);
+	TIMER_RESET(&ps_dg);
+	TIMER_RESET(&ps_dgc);
+	TIMER_RESET(&ps_dgca);
+	TIMER_RESET(&ps_dgn);
+	TIMER_RESET(&ps_dgna);
+	TIMER_RESET(&ps_cc);
+	TIMER_RESET(&ps_cca);
+	TIMER_RESET(&ps_cd);
+	TIMER_RESET(&ps_cda);
+	TIMER_RESET(&ps_ml);
+	TIMER_RESET(&ps_mla);
+	TIMER_RESET(&ps_mgf);
+	TIMER_RESET(&ps_mgfa);
+	TIMER_RESET(&ps_ma);
+	TIMER_RESET(&ps_maa);
+	TIMER_RESET(&ps_mf);
+	TIMER_RESET(&ps_mfa);
+	TIMER_RESET(&ps_mhd);
+	TIMER_RESET(&ps_mhda);
+	TIMER_RESET(&ps_mdh);
+	TIMER_RESET(&ps_mdha);
+	TIMER_RESET(&ps_lk);
+	TIMER_RESET(&ps_lka);
+	TIMER_RESET(&ps_ext);
+
+	for (;;) {
+		TIMER_START(&mt);
+		TIMER_START(&rt);
+		msg_length = receive_message(&msg, client_sock_fd);
+		TIMER_STOP(&rt);
+		TIMER_START(&dt);
+		if (msg_length > 0)
+			msg_type = decode_message(&dec_msg, &payload,
+						  msg, msg_length);
+
+		TIMER_STOP(&dt);
+		gdprintf("Processing message\n");
+		TIMER_START(&pt);
+		switch (msg_type) {
+			case CUDA_CMD:
+				arg_cnt = process_cuda_cmd(&result,
+						payload, free_list,
+						busy_list, &client_list,
+						&client_handle);
+				cuCtxSynchronize();
+				resp_type = CUDA_CMD_RESULT;
+				break;
+			case CUDA_DEVICE_QUERY:
+				process_cuda_device_query(&result,
+						free_list, busy_list);
+				resp_type = CUDA_DEVICE_LIST;
+				break;
+		}
+
+		print_clients(client_list);
+		print_cuda_devices(free_list, busy_list);
+
+		if (msg != NULL) {
+			free(msg);
+			msg = NULL;
+		}
+		if (dec_msg != NULL) {
+			free_decoded_message(dec_msg);
+			dec_msg = NULL;
+			// payload should be invalid now
+			payload = NULL;
+		}
+		TIMER_STOP(&pt);
+
+		if (resp_type != -1) {
+			gdprintf("Sending result\n");
+			pack_cuda_cmd(&payload, result, arg_cnt,
+				      CUDA_CMD_RESULT);
+			msg_length = encode_message(&msg, resp_type, payload);
+			send_message(client_sock_fd, msg, msg_length);
+
+			if (result != NULL) {
+				// should be more freeing here...
+				free(result);
+				result = NULL;
+			}
+		}
+
+		gdprintf(">>\nMessage processed, cleaning up...\n<<\n");
+
+		if (msg != NULL) {
+			free(msg);
+			msg = NULL;
+		}
+
+		if (client_id == -1)
+			client_id = client_handle->id;
+
+		TIMER_STOP(&mt);
+		if (get_client_status(client_handle) == 0) {
+			// TODO: freeing
+			printf("\n--------------\nClient %d finished.\n", client_id);
+			break;
+		}
+	}
+	printf("Client elapsed time (usec): %lf\n\n",
+			TIMER_TO_USEC(TIMER_TOTAL(&mt)));
+
+	printf("Message average elapsed time (usec):\n- total: %lf\n"
+		   "- receive: %lf\n- decode: %lf\n"
+		   "- process: %lf\n- send: %lf\n",
+			TIMER_TO_USEC(TIMER_AVG(&mt)), TIMER_TO_USEC(TIMER_AVG(&rt)),
+			TIMER_TO_USEC(TIMER_AVG(&dt)), TIMER_TO_USEC(TIMER_AVG(&pt)),
+			TIMER_TO_USEC((TIMER_TOTAL(&mt) - TIMER_TOTAL(&rt)
+				- TIMER_TOTAL(&dt) - TIMER_TOTAL(&pt))
+				/ (double) TIMER_COUNT(&mt)));
+
+	printf("\nReceive average elapsed time (usec):\n- malloc: %lf\n"
+		   "- read size: %lf\n  |read socket:\n  |- total: %lf\n"
+		   "  |- actual: %lf\n  |- waiting: %lf\n- realloc: %lf\n"
+		   "- read message: %lf\n  |read socket:\n  |- total: %lf\n"
+		   "  |- actual: %lf\n  |- waiting: %lf\n",
+			TIMER_TO_USEC(TIMER_AVG(&pl_m)),
+			TIMER_TO_USEC(TIMER_AVG(&pl_rds)),
+			TIMER_TO_USEC(TIMER_AVG(&pl_rd1)),
+			TIMER_TO_USEC(pl_rd1a / (double) TIMER_COUNT(&pl_rd1)),
+			TIMER_TO_USEC((TIMER_TOTAL(&pl_rd1) - pl_rd1a)
+				/ (double) TIMER_COUNT(&pl_rd1)),
+			TIMER_TO_USEC(TIMER_AVG(&pl_re)),
+			TIMER_TO_USEC(TIMER_AVG(&pl_rdm)),
+			TIMER_TO_USEC(TIMER_AVG(&pl_rd2)),
+			TIMER_TO_USEC(pl_rd2a / (double) TIMER_COUNT(&pl_rd2)),
+			TIMER_TO_USEC((TIMER_TOTAL(&pl_rd2) - pl_rd2a)
+				/ (double) TIMER_COUNT(&pl_rd2)));
+
+	printf("Process average elapsed time (usec):\n"
+		   "- init: %lf\n- device_get: %lf\n"
+		   "- device_get_count:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- device_get_name:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- context_create:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- context_destroy:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- module_load:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- module_get_function:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- memory_allocate:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- memory_free:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- memcpy_h_to_d:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- memcpy_d_to_h:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- launch_kernel:\n  |- total: %lf\n  |- actual: %lf\n"
+		   "- extra: %lf\n\n",
+			TIMER_TO_USEC(TIMER_AVG(&ps_i)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_dg)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_dgc)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_dgca)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_dgn)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_dgna)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_cc)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_cca)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_cd)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_cda)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_ml)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mla)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mgf)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mgfa)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_ma)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_maa)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mf)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mfa)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mhd)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mhda)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mdh)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_mdha)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_lk)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_lka)),
+			TIMER_TO_USEC(TIMER_AVG(&ps_ext)));
+
+}
+
 
 int init_server_net(const char *port, struct addrinfo *addr) {
 	int socket_fd, ret;
@@ -63,16 +261,15 @@ int init_server(char *port, struct addrinfo *addr, void **free_list, void **busy
 }
 
 int main(int argc, char *argv[]) {
-	int server_sock_fd, client_sock_fd, msg_type, resp_type, arg_cnt;
+	int server_sock_fd, client_sock_fd, msg_type, resp_type, arg_cnt, *new_sock;
 	struct sockaddr_in client_addr;
 	struct addrinfo local_addr;
 	char server_ip[16] /* IPv4 */, server_port[6], *local_port,
 		 client_host[NI_MAXHOST], client_serv[NI_MAXSERV];
 	socklen_t s;
-	void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL,
-		 *free_list=NULL, *busy_list=NULL, *client_list=NULL, *client_handle=NULL;
+	void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL;
 	uint32_t msg_length;
-	timers_t tm;
+	pthread_t sniffer_thread;
 
 	if (argc > 2) {
 		printf("Usage: server <local_port>\n");
@@ -112,65 +309,12 @@ int main(int argc, char *argv[]) {
 		else
 			printf("from unidentified client");
 
-		TIMER_RESET(&tm);
-		for(;;) {
-			msg_length = receive_message(&msg, client_sock_fd);
-			if (msg_length > 0)
-				msg_type = decode_message(&dec_msg, &payload, msg, msg_length);
-
-			printf("Processing message\n");
-			TIMER_START(&tm);
-			switch (msg_type) {
-				case CUDA_CMD:
-					arg_cnt = process_cuda_cmd(&result, payload, free_list, busy_list, &client_list, &client_handle);
-					resp_type = CUDA_CMD_RESULT;
-					break;
-				case CUDA_DEVICE_QUERY:
-					process_cuda_device_query(&result, free_list, busy_list);
-					resp_type = CUDA_DEVICE_LIST;
-					break;
-			}
-
-			print_clients(client_list);
-			print_cuda_devices(free_list, busy_list);
-
-			if (msg != NULL) {
-				free(msg);
-				msg = NULL;
-			}
-			if (dec_msg != NULL) {
-				free_decoded_message(dec_msg);
-				dec_msg = NULL;
-				// payload should be invalid now
-				payload = NULL;
-			}
-
-			if (resp_type != -1) {
-				gdprintf("Sending result\n");
-				pack_cuda_cmd(&payload, result, arg_cnt, CUDA_CMD_RESULT);
-				msg_length = encode_message(&msg, resp_type, payload);
-				send_message(client_sock_fd, msg, msg_length);
-
-				if (result != NULL) {
-					// should be more freeing here...
-					free(result);
-					result = NULL;
-				}
-			}
-			printf(">>\nMessage processed, cleaning up...\n<<\n");
-			TIMER_STOP(&tm);
-			if (msg != NULL) {
-				free(msg);
-				msg = NULL;
-			}
-
-			if (get_client_status(client_handle) == 0) {
-				// TODO: freeing
-				printf("\n--------------\nClient finished.\n\n");
-				break;
-			}
+		new_sock = malloc_safe(sizeof(*new_sock));
+		*new_sock = client_sock_fd;
+		if (pthread_create(&sniffer_thread, NULL, connection_handler, (void *)new_sock) < 0) {
+			fprintf(stderr, "could not create thread\n");
+			return 1;
 		}
-	printf("message needed %lf to be processed\n", TICKS_TO_USEC(TIMER_TOTAL(&tm)));
 	}
 	close(client_sock_fd);
 
diff --git a/src/timer.h b/src/timer.h
new file mode 100644
index 0000000..c3c6452
--- /dev/null
+++ b/src/timer.h
@@ -0,0 +1,57 @@
+#ifndef GPUSOCK_TIMER_H
+#define GPUSOCK_TIMER_H
+
+typedef struct timer {
+	unsigned long long total;
+	unsigned long long val;
+	unsigned long cnt;
+} gs_timer;
+
+
+#ifdef TIMERS_ENABLED
+
+#include <time.h>
+
+static inline unsigned long long get_time()
+{
+	struct timespec ts;
+	unsigned long long time;
+
+	if(clock_gettime(CLOCK_MONOTONIC, &ts) == -1) {
+		perror("clock gettime");
+		exit(EXIT_FAILURE);
+	}
+	time = ts.tv_sec * 1000000000 + ts.tv_nsec;
+	//printf("time: %llu ns\n", time);
+
+	return time;
+}
+
+
+#define TIMER_START(tm)	do { (tm)->val = get_time(); } while (0)
+#define TIMER_STOP(tm)	do { (tm)->total += get_time() - (tm)->val; ++(tm)->cnt; } while (0)
+#define TIMER_GET(tm)	do { (tm)->total += get_time() - (tm)->val; (tm)->val = get_time(); } while (0)
+#define TIMER_RESET(tm)	do { (tm)->total = (tm)->val = 0; (tm)->cnt = 0; } while (0)
+#define TIMER_TOTAL(tm)	((tm)->total)
+#define TIMER_COUNT(tm)	((tm)->cnt)
+#define TIMER_AVG(tm)	((tm)->cnt ? ((tm)->total / (tm)->cnt) : 0)
+
+#define TIMER_TO_SEC(t) (t / 1000000000.0)
+#define TIMER_TO_USEC(t) (t / 1000.0)
+
+#else
+
+#define TIMER_START(a)
+#define TIMER_STOP(a)
+#define TIMER_GET(a)
+#define TIMER_RESET(a)
+#define TIMER_TOTAL(a) 0ULL
+#define TIMER_COUNT(a) 0UL
+#define TIMER_AVG(a) 0ULL
+
+#define TIMER_TO_SEC(a) 0
+#define TIMER_TO_USEC(a) 0
+
+#endif	/* TIMER_ENABLED */
+
+#endif	/* GPUSOCK_TIMER_H */
diff --git a/src/timers.h b/src/timers.h
deleted file mode 100644
index 7d13f42..0000000
--- a/src/timers.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (C) Anastassios Nanos 2012 (See AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details. You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#define _GNU_SOURCE
-
-#ifndef __gpusock_timers_h__
-#define __gpusock_timers_h__
-
-typedef struct timers {
-	unsigned long long total;
-	unsigned long long val;
-	unsigned long cnt;
-} timers_t;
-
-
-#ifdef TIMERS_ENABLED
-
-#include <asm/msr.h>
-//#include <linux/timex.h>
-
-unsigned long long get_cycles()
-{
-        struct timeval tv;
-	unsigned long long paparia;
-        gettimeofday(&tv,NULL);
-        paparia = tv.tv_sec * 1000000 + tv.tv_usec;
-	printf("paparia:%ld\n", paparia);
-	return paparia;
-}
-
-
-#define TIMER_START(tp)	do {( tp)->val = get_cycles(); } while (0)
-#define TIMER_STOP(tp)	do { (tp)->total += get_cycles() - (tp)->val; ++(tp)->cnt; } while (0)
-#define TIMER_RESET(tp)	do { (tp)->total = (tp)->val = 0; (tp)->cnt = 0; } while (0)
-#define TIMER_TOTAL(tp)	((tp)->total)
-#define TIMER_COUNT(tp)	((tp)->cnt)
-#define TIMER_AVG(tp)	((tp)->cnt ? ((tp)->total / (tp)->cnt) : -1)
-
-#define TICKS_TO_USEC(t)	(1000 * t/CYCLES_PER_SEC)
-
-#else
-
-#define TIMER_START(a)
-#define TIMER_STOP(a)
-#define TIMER_TOTAL(a) 0ULL
-#define TIMER_COUNT(a) 0UL
-#define TIMER_RESET(a)
-#define TICKS_TO_USEC(a) 0ULL
-
-#endif	/* TIMERS_ENABLED */
-
-#define var_name(x) #x
-#define gpusock_timer_reset(x) TIMER_RESET(x);
-#endif	/* __gpusock_timers_h__ */

From 9bb8f4e3fef85dce6e50b83f06843012fe5aeba0 Mon Sep 17 00:00:00 2001
From: papazof <papazof@gmail.com>
Date: Wed, 12 Nov 2014 00:21:40 +0200
Subject: [PATCH 3/3] Server send message timing breakdown.

---
 src/server.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/server.c b/src/server.c
index 5248e92..2e1dabb 100644
--- a/src/server.c
+++ b/src/server.c
@@ -24,7 +24,7 @@ void *connection_handler(void *socket_desc) {
 	void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL;
 	client_node *client_handle=NULL;
 	uint32_t msg_length;
-	gs_timer mt, rt, dt, pt;
+	gs_timer mt, rt, dt, pt, st, st1, st2, st3;
 
 	pl_rd1a = 0;
 	pl_rd2a = 0;
@@ -32,6 +32,10 @@ void *connection_handler(void *socket_desc) {
 	TIMER_RESET(&rt);
 	TIMER_RESET(&dt);
 	TIMER_RESET(&pt);
+	TIMER_RESET(&st);
+	TIMER_RESET(&st1);
+	TIMER_RESET(&st2);
+	TIMER_RESET(&st3);
 	TIMER_RESET(&pl_rds);
 	TIMER_RESET(&pl_rdm);
 	TIMER_RESET(&pl_rd1);
@@ -107,13 +111,19 @@ void *connection_handler(void *socket_desc) {
 			payload = NULL;
 		}
 		TIMER_STOP(&pt);
-
+		TIMER_START(&st);
 		if (resp_type != -1) {
 			gdprintf("Sending result\n");
+			TIMER_START(&st1);
 			pack_cuda_cmd(&payload, result, arg_cnt,
 				      CUDA_CMD_RESULT);
+			TIMER_STOP(&st1);
+			TIMER_START(&st2);
 			msg_length = encode_message(&msg, resp_type, payload);
+			TIMER_STOP(&st2);
+			TIMER_START(&st3);
 			send_message(client_sock_fd, msg, msg_length);
+			TIMER_STOP(&st3);
 
 			if (result != NULL) {
 				// should be more freeing here...
@@ -121,7 +131,7 @@ void *connection_handler(void *socket_desc) {
 				result = NULL;
 			}
 		}
-
+		TIMER_STOP(&st);
 		gdprintf(">>\nMessage processed, cleaning up...\n<<\n");
 
 		if (msg != NULL) {
@@ -144,12 +154,14 @@ void *connection_handler(void *socket_desc) {
 
 	printf("Message average elapsed time (usec):\n- total: %lf\n"
 		   "- receive: %lf\n- decode: %lf\n"
-		   "- process: %lf\n- send: %lf\n",
+		   "- process: %lf\n- send:\n  |- total: %lf\n  |- pack:% lf\n"
+		   "  |- encode: %lf\n  |- actual: %lf\n",
 			TIMER_TO_USEC(TIMER_AVG(&mt)), TIMER_TO_USEC(TIMER_AVG(&rt)),
 			TIMER_TO_USEC(TIMER_AVG(&dt)), TIMER_TO_USEC(TIMER_AVG(&pt)),
-			TIMER_TO_USEC((TIMER_TOTAL(&mt) - TIMER_TOTAL(&rt)
-				- TIMER_TOTAL(&dt) - TIMER_TOTAL(&pt))
-				/ (double) TIMER_COUNT(&mt)));
+			TIMER_TO_USEC(TIMER_AVG(&st)),
+			TIMER_TO_USEC(TIMER_AVG(&st1)),
+			TIMER_TO_USEC(TIMER_AVG(&st2)),
+			TIMER_TO_USEC(TIMER_AVG(&st3)));
 
 	printf("\nReceive average elapsed time (usec):\n- malloc: %lf\n"
 		   "- read size: %lf\n  |read socket:\n  |- total: %lf\n"