From 53d1fdb64495d166e815f5d83bb8622982930405 Mon Sep 17 00:00:00 2001 From: Anastassios Nanos Date: Mon, 13 Oct 2014 14:15:40 +0300 Subject: [PATCH 1/3] Add timer support. --- src/Makefile.am | 18 +++++++------ src/common.h | 1 + src/server.c | 6 +++++ src/timers.h | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 8 deletions(-) create mode 100644 src/timers.h diff --git a/src/Makefile.am b/src/Makefile.am index 6c1c7a4..5028bbf 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,18 +1,19 @@ -bin_PROGRAMS = server libcudawrapper #test-cuda test-client +bin_PROGRAMS = server libcudawrapper.so #test-cuda test-client -AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include +CYCLES_PER_SEC = `cat /proc/cpuinfo |grep cpu\ MHz | head -1 | cut -d\: -f2 | awk '{ print $$1 * 1000 }'` +AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -DCYCLES_PER_SEC=$(CYCLES_PER_SEC) BUILT_SOURCES = @srcdir@/common.pb-c.c @srcdir@/common.pb-c.h -server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h +server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timers.h server_SOURCES += common.pb-c.c common.pb-c.h -libcudawrapper_CFLAGS = -fPIC -shared -libcudawrapper_CFLAGS += -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -libcudawrapper_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c -libcudawrapper_SOURCES += common.pb-c.c common.pb-c.h +libcudawrapper_so_CFLAGS = -fPIC -shared +libcudawrapper_so_CFLAGS += -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include +libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c +libcudawrapper_so_SOURCES += common.pb-c.c common.pb-c.h common.pb-c.c: @srcdir@/common.proto $(PROTOC_C) --proto_path=@srcdir@ --c_out=. @srcdir@/common.proto @@ -26,9 +27,10 @@ common.pb-c.h: @srcdir@/common.pb-c.c CLEANFILES = @builddir@/common.pb-c.c @builddir@/common.pb-c.h server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -libcudawrapper_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl +libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl EXTRA_DIST = common.proto AM_CFLAGS += -I@builddir@ + diff --git a/src/common.h b/src/common.h index b666fe4..2ccf311 100644 --- a/src/common.h +++ b/src/common.h @@ -2,6 +2,7 @@ #define COMMON_H #include +#include "timers.h" /* typedef struct cuda_cmd_s { uint16_t type; diff --git a/src/server.c b/src/server.c index f2b09e5..70e4026 100644 --- a/src/server.c +++ b/src/server.c @@ -8,6 +8,7 @@ #include #include +#define TIMERS_ENABLED #include "common.h" #include "common.pb-c.h" #include "protocol.h" @@ -71,6 +72,7 @@ int main(int argc, char *argv[]) { void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL, *free_list=NULL, *busy_list=NULL, *client_list=NULL, *client_handle=NULL; uint32_t msg_length; + timers_t tm; if (argc > 2) { printf("Usage: server \n"); @@ -110,12 +112,14 @@ int main(int argc, char *argv[]) { else printf("from unidentified client"); + TIMER_RESET(&tm); for(;;) { msg_length = receive_message(&msg, client_sock_fd); if (msg_length > 0) msg_type = decode_message(&dec_msg, &payload, msg, msg_length); printf("Processing message\n"); + TIMER_START(&tm); switch (msg_type) { case CUDA_CMD: arg_cnt = process_cuda_cmd(&result, payload, free_list, busy_list, &client_list, &client_handle); @@ -154,6 +158,7 @@ int main(int argc, char *argv[]) { } } printf(">>\nMessage processed, cleaning up...\n<<\n"); + TIMER_STOP(&tm); if (msg != NULL) { free(msg); msg = NULL; @@ -165,6 +170,7 @@ int main(int argc, char *argv[]) { break; } } + printf("message needed %lf to be processed\n", TICKS_TO_USEC(TIMER_TOTAL(&tm))); } close(client_sock_fd); diff --git a/src/timers.h b/src/timers.h new file mode 100644 index 0000000..7d13f42 --- /dev/null +++ b/src/timers.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) Anastassios Nanos 2012 (See AUTHORS file) + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. You should have received a copy of the GNU General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#define _GNU_SOURCE + +#ifndef __gpusock_timers_h__ +#define __gpusock_timers_h__ + +typedef struct timers { + unsigned long long total; + unsigned long long val; + unsigned long cnt; +} timers_t; + + +#ifdef TIMERS_ENABLED + +#include +//#include + +unsigned long long get_cycles() +{ + struct timeval tv; + unsigned long long paparia; + gettimeofday(&tv,NULL); + paparia = tv.tv_sec * 1000000 + tv.tv_usec; + printf("paparia:%ld\n", paparia); + return paparia; +} + + +#define TIMER_START(tp) do {( tp)->val = get_cycles(); } while (0) +#define TIMER_STOP(tp) do { (tp)->total += get_cycles() - (tp)->val; ++(tp)->cnt; } while (0) +#define TIMER_RESET(tp) do { (tp)->total = (tp)->val = 0; (tp)->cnt = 0; } while (0) +#define TIMER_TOTAL(tp) ((tp)->total) +#define TIMER_COUNT(tp) ((tp)->cnt) +#define TIMER_AVG(tp) ((tp)->cnt ? ((tp)->total / (tp)->cnt) : -1) + +#define TICKS_TO_USEC(t) (1000 * t/CYCLES_PER_SEC) + +#else + +#define TIMER_START(a) +#define TIMER_STOP(a) +#define TIMER_TOTAL(a) 0ULL +#define TIMER_COUNT(a) 0UL +#define TIMER_RESET(a) +#define TICKS_TO_USEC(a) 0ULL + +#endif /* TIMERS_ENABLED */ + +#define var_name(x) #x +#define gpusock_timer_reset(x) TIMER_RESET(x); +#endif /* __gpusock_timers_h__ */ From 06a8e0b598fc6e3ee9332af1c1c84996a1576c40 Mon Sep 17 00:00:00 2001 From: papazof Date: Mon, 10 Nov 2014 19:50:26 +0200 Subject: [PATCH 2/3] Use newer clock_gettime. Time different operations. Merge server-client_con branch. --- configure.ac | 15 ++- src/Makefile.am | 14 +-- src/client.c | 14 +++ src/common.h | 1 - src/libcudawrapper.c | 1 - src/process.c | 287 +++++++++++++++++++++++++------------------ src/process.h | 35 ++++-- src/protocol.c | 76 +++++++++--- src/protocol.h | 5 + src/server.c | 270 ++++++++++++++++++++++++++++++---------- src/timer.h | 57 +++++++++ src/timers.h | 67 ---------- 12 files changed, 552 insertions(+), 290 deletions(-) create mode 100644 src/timer.h delete mode 100644 src/timers.h diff --git a/configure.ac b/configure.ac index a3d28d8..949c967 100644 --- a/configure.ac +++ b/configure.ac @@ -16,9 +16,19 @@ AC_ARG_WITH([protobuf-c-libdir], AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable debug print messages. This is a debugging feature which should not be usually enabled])], [DEBUG_CFLAGS="-DGPUSOCK_DEBUG"]) - AC_SUBST(DEBUG_CFLAGS) +AC_ARG_ENABLE([timers], + [AS_HELP_STRING([--enable-timers], [Enable performance timers])], + [timers=yes]) + +AS_IF([test "x$timers" == xyes], + [AC_CHECK_LIB([rt], [clock_gettime], + [AC_SUBST([TIMERS_LIBS], ["-lrt"]) + AC_SUBST([TIMERS_CFLAGS], ["-DTIMERS_ENABLED"]) + ]) + ]) + #AX_LIB_PROTOBUF_C([0.14]) AC_CHECK_LIB([protobuf-c], [main], [], [AC_MSG_ERROR([cannot find protobuf library])]) @@ -35,6 +45,9 @@ AC_ARG_WITH([protoc-c], [PROTOC_C='protoc-c']) AC_SUBST([PROTOC_C]) +AC_CHECK_LIB(pthread, pthread_create, [PTHREAD_LIBS+=-lpthread]) +AC_SUBST([PTHREAD_LIBS]) + AC_PROG_CXX AM_SILENT_RULES([no]) diff --git a/src/Makefile.am b/src/Makefile.am index 49c2244..831a8e7 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,19 +1,17 @@ bin_PROGRAMS = server libcudawrapper.so #test-cuda test-client -CYCLES_PER_SEC = `cat /proc/cpuinfo |grep cpu\ MHz | head -1 | cut -d\: -f2 | awk '{ print $$1 * 1000 }'` - -AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -DCYCLES_PER_SEC=$(CYCLES_PER_SEC) $(DEBUG_CFLAGS) +AM_CFLAGS = -I$(PROTOBUF_C_CFLAGS) -L$(PROTOBUF_C_LIBDIR) -L$(CUDA) -I@builddir@ -I/usr/include/google -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include $(TIMERS_CFLAGS) $(DEBUG_CFLAGS) BUILT_SOURCES = @srcdir@/common.pb-c.c @srcdir@/common.pb-c.h -server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timers.h +server_SOURCES = server.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h timer.h server_SOURCES += common.pb-c.c common.pb-c.h -libcudawrapper_so_CFLAGS = -fPIC -shared $(DEBUG_CFLAGS) +libcudawrapper_so_CFLAGS = -fPIC -shared $(TIMERS_CFLAGS) $(DEBUG_CFLAGS) libcudawrapper_so_CFLAGS += -L$(CUDA_INSTALL_PATH)/lib -I$(CUDA_INSTALL_PATH)/include -libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c timers.h +libcudawrapper_so_SOURCES = libcudawrapper.c process.c process.h common.h common.c protocol.c protocol.h list.h cuda_errors.h client.h client.c libcudawrapper_so_SOURCES += common.pb-c.c common.pb-c.h common.pb-c.c: @srcdir@/common.proto @@ -27,8 +25,8 @@ common.pb-c.h: @srcdir@/common.pb-c.c CLEANFILES = @builddir@/common.pb-c.c @builddir@/common.pb-c.h -server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl +server_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda $(PTHREAD_LIBS) $(TIMERS_LIBS) +libcudawrapper_so_LDADD = $(PROTOBUF_C_LIBS) $(CUDA_LIBS) -lcuda -ldl $(TIMERS_LIBS) EXTRA_DIST = common.proto diff --git a/src/client.c b/src/client.c index 293e539..02f70df 100644 --- a/src/client.c +++ b/src/client.c @@ -14,6 +14,7 @@ #include "common.pb-c.h" #include "protocol.h" #include "process.h" +#include "timer.h" int init_client(const char *s_ip, const char *s_port, struct addrinfo *s_addr) { int socket_fd, ret; @@ -100,6 +101,9 @@ int64_t get_cuda_cmd_result(void **result, int sock_fd) { void *buffer=NULL, *payload=NULL, *dec_msg=NULL; int res_code; + gs_timer tm; + TIMER_RESET(&tm); + TIMER_START(&tm); gdprintf("Waiting for response:\n"); msg_length = receive_message(&buffer, sock_fd); if (msg_length > 0) { @@ -131,6 +135,9 @@ int64_t get_cuda_cmd_result(void **result, int sock_fd) { if (buffer != NULL) free(buffer); + TIMER_STOP(&tm); + gdprintf("\nClient Receive: %lf\n", TIMER_TO_SEC(TIMER_TOTAL(&tm))); + return res_code; } @@ -173,6 +180,10 @@ int send_cuda_cmd(int sock_fd, var **args, size_t arg_count, int type) { void *buffer = NULL, *payload = NULL; size_t buf_size; + gs_timer tm; + TIMER_RESET(&tm); + TIMER_START(&tm); + gdprintf("Sendind CUDA cmd...\n"); pack_cuda_cmd(&payload, args, arg_count, type); @@ -184,6 +195,9 @@ int send_cuda_cmd(int sock_fd, var **args, size_t arg_count, int type) { free(buffer); + TIMER_STOP(&tm); + gdprintf("\nClient Send: %lf\n", TIMER_TO_SEC(TIMER_TOTAL(&tm))); + return 0; } diff --git a/src/common.h b/src/common.h index 2ccf311..b666fe4 100644 --- a/src/common.h +++ b/src/common.h @@ -2,7 +2,6 @@ #define COMMON_H #include -#include "timers.h" /* typedef struct cuda_cmd_s { uint16_t type; diff --git a/src/libcudawrapper.c b/src/libcudawrapper.c index a100354..55613d3 100644 --- a/src/libcudawrapper.c +++ b/src/libcudawrapper.c @@ -13,7 +13,6 @@ #include "common.pb-c.h" #include "client.h" - //TODO: assert stored results' size fits... static params c_params; diff --git a/src/process.c b/src/process.c index 4d4785e..ae6eef4 100644 --- a/src/process.c +++ b/src/process.c @@ -6,12 +6,14 @@ #include #include #include +#include #include "process.h" #include "common.h" #include "common.pb-c.h" #include "cuda.h" #include "list.h" +#include "timer.h" // cuGetErrorName() doesn't exist for CUDA < 6.0 ... #if defined(CUDA_VERSION) && CUDA_VERSION < 6000 @@ -26,22 +28,28 @@ CUresult cuGetErrorName(CUresult error, const char** pStr) { *pStr = CUDA_RESULT_STRING[error]; if (*pStr == NULL) return CUDA_ERROR_INVALID_VALUE; - + return CUDA_SUCCESS; } #endif +static pthread_mutex_t client_mutex = PTHREAD_MUTEX_INITIALIZER; +__thread gs_timer ps_i, ps_dg, ps_dgc, ps_dgca, ps_dgn, ps_dgna, ps_cc, + ps_cca, ps_cd, ps_cda, ps_ml, ps_mla, ps_mgf, ps_mgfa, + ps_ma, ps_maa, ps_mf, ps_mfa, ps_mhd, ps_mhda, ps_mdh, + ps_mdha, ps_lk, ps_lka, ps_ext; + #define cuda_err_print(res, ef) \ cuda_error_print(res, ef, __FILE__, __LINE__) inline CUresult cuda_error_print(CUresult result, int exit_flag, const char *file, const int line) { const char *cuda_err_str = NULL; - + if (result != CUDA_SUCCESS) { cuGetErrorName(result, &cuda_err_str); fprintf(stderr, "-\nCUDA Driver API error: %04d - %s [%s, %i]\n-\n", result, cuda_err_str, file, line); - + if (exit_flag != 0) exit(EXIT_FAILURE); } @@ -55,13 +63,13 @@ size_t read_cuda_module_file(void **buffer, const char *filename) { struct stat st; void *buf = NULL; - gdprintf("Reading from file <%s> ... ", filename); + gdprintf("Reading from file <%s> ... ", filename); fd = fopen(filename, "rb"); if (fd == NULL) { perror("fopen failed"); exit(EXIT_FAILURE); } - + if (fstat(fileno(fd), &st) != 0) { fprintf(stderr, "Reading file size failed: %s\n", strerror(errno)); @@ -85,7 +93,7 @@ size_t read_cuda_module_file(void **buffer, const char *filename) { fprintf(stderr, "Reading file failed: read %zu vs %jd expected\n", b_read, st.st_size); exit(EXIT_FAILURE); } - + *buffer = buf; fclose(fd); @@ -96,7 +104,7 @@ size_t read_cuda_module_file(void **buffer, const char *filename) { void print_file_as_hex(uint8_t *file, size_t file_size) { int i; - + gdprintf("File size: %zu\n", file_size); for (i = 0; i < file_size; i++) { if (i % 14 == 0) @@ -112,7 +120,7 @@ void init_device_list(cuda_device_node **list) { empty_list = malloc_safe(sizeof(cuda_device_node)); INIT_LIST_HEAD(&empty_list->node); - + *list = empty_list; } @@ -122,7 +130,7 @@ void init_client_list(client_node **list) { empty_list = malloc_safe(sizeof(client_node)); INIT_LIST_HEAD(&empty_list->node); - + *list = empty_list; } @@ -154,9 +162,9 @@ int add_device_to_list(cuda_device_node *dev_list, int dev_id) { cuda_device_node *cuda_dev_node; char cuda_dev_name[CUDA_DEV_NAME_MAX]; CUdevice *cuda_device; + CUcontext *cuda_context; cuda_device = malloc_safe(sizeof(CUdevice)); - cuda_dev_node = malloc_safe(sizeof(*cuda_dev_node)); if (cuda_err_print(cuDeviceGet(cuda_device, dev_id), 0) != CUDA_SUCCESS) @@ -165,11 +173,20 @@ int add_device_to_list(cuda_device_node *dev_list, int dev_id) { if (cuda_err_print(cuDeviceGetName(cuda_dev_name, CUDA_DEV_NAME_MAX, *cuda_device), 0) != CUDA_SUCCESS) return -1; + cuda_context = malloc_safe(sizeof(*cuda_context)); + // Initializing per device context. + // FIXME(?): We are ignoring client flags... + if (cuda_err_print(cuCtxCreate(cuda_context, 0, *cuda_device), 0) != CUDA_SUCCESS) + return -1; + cuda_dev_node->cuda_device = cuda_device; - memcpy(cuda_dev_node->cuda_device_name, cuda_dev_name, CUDA_DEV_NAME_MAX); + strcpy(cuda_dev_node->cuda_device_name, cuda_dev_name); + cuda_dev_node->cuda_context = cuda_context; cuda_dev_node->is_busy = 0; + cuda_dev_node->client_count = 0; + - fprintf(stdout, "Adding device [%d]@%p -> %s\n", dev_id, cuda_dev_node->cuda_device, cuda_dev_node->cuda_device_name); + fprintf(stdout, "Adding device [%d]@%p -> %s, with context @%p\n", dev_id, cuda_dev_node->cuda_device, cuda_dev_node->cuda_device_name, cuda_dev_node->cuda_context); list_add_tail(&cuda_dev_node->node, &dev_list->node); @@ -190,7 +207,7 @@ int discover_cuda_devices(void **free_list, void **busy_list) { exit(EXIT_FAILURE); } gdprintf("Available CUDA devices: %d\n", cuda_dev_count); - + // Init free and busy CUDA device lists init_device_list(&free_cuda_devs); init_device_list(&busy_cuda_devs); @@ -225,7 +242,7 @@ void print_cuda_devices(void *free_list, void *busy_list) { } } -int add_client_to_list(void **client_handle, client_node *client_list) { +int add_client_to_list(client_node **client_handle, client_node *client_list) { client_node *new_node, *tmp; new_node = malloc_safe(sizeof(*new_node)); @@ -236,21 +253,23 @@ int add_client_to_list(void **client_handle, client_node *client_list) { } else { new_node->id = 0; } + new_node->dev_count = 0; new_node->status = 1; new_node->cuda_dev_node = NULL; - new_node->cuda_context = NULL; gdprintf("Adding client <%d> to list\n", new_node->id); list_add_tail(&new_node->node, &client_list->node); *client_handle = new_node; + return 0; } -int get_client_handle(void **client_handle, void **client_list, int client_id) { +int get_client_handle(client_node **client_handle, client_node **client_list, int client_id) { client_node *client_list_p = *client_list, *pos; + pthread_mutex_lock(&client_mutex); if (client_list_p == NULL) { init_client_list(&client_list_p); *client_list = client_list_p; @@ -258,39 +277,44 @@ int get_client_handle(void **client_handle, void **client_list, int client_id) { // check if client exists in list list_for_each_entry(pos, &client_list_p->node, node) { if (pos->id == client_id) { - printf("Client <%d> is already in the list\n", client_id); + gdprintf("Client <%d> is already in the list\n", client_id); *client_handle = pos; return 0; } } } - + add_client_to_list(client_handle, client_list_p); - + pthread_mutex_unlock(&client_mutex); + return 0; } -int del_client_of_list(void *client_handle) { +int del_client_of_list(client_node *client_handle) { client_node *client = client_handle; - + + pthread_mutex_lock(&client_mutex); gdprintf("Deleting client <%d> from list\n", client->id); list_del(&client->node); + pthread_mutex_unlock(&client_mutex); free(client_handle); return 0; } -void print_clients(void *client_list) { +void print_clients(client_node *client_list) { client_node *client_list_p=client_list, *pos; int i = 0; + pthread_mutex_lock(&client_mutex); gdprintf("\nClients:\n"); list_for_each_entry(pos, &client_list_p->node, node) { gdprintf("| [%d] <%d>\n", i++, pos->id); } + pthread_mutex_unlock(&client_mutex); } -unsigned int get_client_status(void *client_handle) { +unsigned int get_client_status(client_node *client_handle) { client_node *client = client_handle; return (client_handle == NULL) ? 0 : client->status; @@ -299,11 +323,11 @@ unsigned int get_client_status(void *client_handle) { uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation) { uint32_t param_id = 0; param_node *new_node, *tmp; - + if (*list == NULL) init_param_list(list); else { - // TODO: (?) generate unique random id + // TODO(?): generate unique random id tmp = list_last_entry(&((*list)->node), param_node, node); param_id = tmp->id + 1; } @@ -320,7 +344,7 @@ uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation) int find_param_by_id(param_node **param, param_node *list, uint32_t param_id) { param_node *pos; - + list_for_each_entry(pos, &list->node, node) { if (pos->id == param_id) { *param = pos; @@ -333,7 +357,7 @@ int find_param_by_id(param_node **param, param_node *list, uint32_t param_id) { int find_param_by_ptr(param_node **param, param_node *list, uint64_t param_ptr) { param_node *pos; - + list_for_each_entry(pos, &list->node, node) { if (pos->ptr == param_ptr) { *param = pos; @@ -351,11 +375,10 @@ int del_param_of_list(param_node *param) { return 0; } -int update_device_of_client(uintptr_t *dev_ptr, cuda_device_node *free_list, int dev_ordinal, client_node *client) { +int update_devices_of_client(uintptr_t *dev_node_ptr, cuda_device_node *free_list, int dev_ordinal, client_node *client) { cuda_device_node *tmp; int i = 0, true_ordinal; - // TODO: support more than one devices per client. gdprintf("Updating devices of client <%d>...\n", client->id); tmp = list_first_entry_or_null(&free_list->node, cuda_device_node, node); @@ -363,72 +386,67 @@ int update_device_of_client(uintptr_t *dev_ptr, cuda_device_node *free_list, int fprintf(stderr, "No CUDA devices available for assignment\n"); return -1; } - true_ordinal = dev_ordinal - client->dev_count; - while (i++ < true_ordinal) { + //true_ordinal = dev_ordinal - client->dev_count; + while (i++ < dev_ordinal) { tmp = list_next_entry(tmp, node); if (&tmp->node == &free_list->node) { - fprintf(stderr, "No CUDA devices available for assignment with the desired ordinal\n"); + fprintf(stderr, "No CUDA devices available with the desired ordinal\n"); return -1; } } - - *dev_ptr = (uintptr_t) tmp->cuda_device; + + *dev_node_ptr = (uintptr_t) tmp; // TODO: What if client deviceGets the same ordinal twice? add_param_to_list(&client->cuda_dev_node, (uintptr_t) tmp, NULL); return 0; } -int assign_device_to_client(uintptr_t dev_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) { - cuda_device_node *dev_node; +int attach_device_to_client(uintptr_t dev_node_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) { + cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr; param_node *pos; - CUdevice *cuda_device = (CUdevice *) dev_ptr; - gdprintf("Assigning device @%p to client <%d> ...\n", cuda_device, client->id); + pthread_mutex_lock(&client_mutex); + gdprintf("Attaching device @%p to client <%d> ...\n", dev_node->cuda_device, client->id); list_for_each_entry(pos, &client->cuda_dev_node->node, node) { - dev_node = (cuda_device_node *) pos->ptr; - if (dev_node->cuda_device == cuda_device) { - if (dev_node->is_busy == 1) { - fprintf(stderr, "Requested CUDA device is busy\n"); - return -2; - } - gdprintf("Moving device <%s>@%p to busy list\n", - dev_node->cuda_device_name, dev_node->cuda_device); + if ((cuda_device_node *) pos->ptr == dev_node) { dev_node->is_busy = 1; - list_move_tail(&dev_node->node, &busy_list->node); ++client->dev_count; + ++dev_node->client_count; + pthread_mutex_unlock(&client_mutex); return 0; - } + } } - - fprintf(stderr, "Requested CUDA device not in client's list!\n"); + pthread_mutex_unlock(&client_mutex); + + fprintf(stderr, "Requested CUDA device not in client's list!\n"); return -1; } -int free_device_from_client(uintptr_t dev_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) { - cuda_device_node *dev_node; +int detach_device_from_client(uintptr_t dev_node_ptr, cuda_device_node *free_list, cuda_device_node *busy_list, client_node *client) { + cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr; param_node *pos, *tmp; - CUdevice *cuda_device = (CUdevice *) dev_ptr; - - gdprintf("Freeing device @%p from client <%d>...\n", cuda_device, client->id); + + pthread_mutex_lock(&client_mutex); + gdprintf("Detaching device @%p from client <%d>...\n", dev_node->cuda_device, client->id); list_for_each_entry_safe(pos, tmp, &client->cuda_dev_node->node, node) { - dev_node = (cuda_device_node *) pos->ptr; - if (dev_node->cuda_device == cuda_device) { - gdprintf("Moving device <%s>@%p to free list\n", - dev_node->cuda_device_name, dev_node->cuda_device); - dev_node->is_busy = 0; - list_move_tail(&dev_node->node, &free_list->node); + if ((cuda_device_node *) pos->ptr == dev_node) { del_param_of_list(pos); --client->dev_count; + --dev_node->client_count; + if (dev_node->client_count == 0) + dev_node->is_busy = 0; + pthread_mutex_unlock(&client_mutex); return 0; - } + } } + pthread_mutex_unlock(&client_mutex); - fprintf(stderr, "Requested CUDA device not in client's list!\n"); + fprintf(stderr, "Requested CUDA device not in client's list!\n"); return -1; } @@ -437,8 +455,10 @@ int get_device_count_for_client(uint64_t *host_count) { int count; gdprintf("Getting CUDA device count...\n"); - + + TIMER_START(&ps_dgca); res = cuda_err_print(cuDeviceGetCount(&count), 0); + TIMER_STOP(&ps_dgca); if (res == CUDA_SUCCESS) *host_count = count; @@ -454,55 +474,37 @@ int get_device_name_for_client(void **host_name_ptr, size_t *host_name_size, int *host_name_size = name_size; *host_name_ptr = malloc_safe(name_size); + TIMER_START(&ps_dgna); res = cuda_err_print(cuDeviceGetName(*host_name_ptr, name_size, *cuda_device), 0); - - printf("\n\n>>>>> %s\n\n", *host_name_ptr); + TIMER_STOP(&ps_dgna); return res; } -int create_context_of_client(uintptr_t *ctx_ptr, unsigned int flags, uintptr_t dev_ptr, client_node *client) { - CUcontext *cuda_context; - CUdevice *cuda_device = (CUdevice *) dev_ptr; - CUresult res = 0; - - cuda_context = malloc_safe(sizeof(CUcontext)); - - // TODO: support more than one contexts per client. - gdprintf("Creating CUDA context of client <%d> ... ", client->id); - - res = cuda_err_print(cuCtxCreate(cuda_context, flags, *cuda_device), 0); +int get_context_of_client(uintptr_t *ctx_ptr, unsigned int flags, uintptr_t dev_node_ptr, client_node *client) { + cuda_device_node *dev_node = (cuda_device_node *) dev_node_ptr; + CUresult res; - if (res == CUDA_SUCCESS) { - *ctx_ptr = (uintptr_t) cuda_context; - add_param_to_list(&client->cuda_context, (uintptr_t) cuda_context, cuda_device); - gdprintf("created @%p ... Done\n", cuda_context); - } else { - gdprintf("failed ... Done\n"); - } + gdprintf("Getting CUDA context @%p of client <%d> ...\n", dev_node->cuda_context, client->id); + TIMER_START(&ps_cca); + res = cuda_err_print(cuCtxSetCurrent(*dev_node->cuda_context), 0); + TIMER_STOP(&ps_cca); + *ctx_ptr = dev_node_ptr; return res; } -int destroy_context_of_client(uintptr_t *dev_ptr, uintptr_t ctx_ptr, client_node *client) { - CUresult res = 0; - CUcontext *cuda_context = (CUcontext *) ctx_ptr; - param_node *param = NULL; +int put_context_of_client(uintptr_t *dev_node_ptr, uintptr_t ctx_ptr, client_node *client) { + CUresult res; + cuda_device_node *dev_node = (cuda_device_node *) ctx_ptr; - // TODO: free modules/functions allocated handles (?) - gdprintf("Destroying CUDA context @%p of client <%d> ...\n", cuda_context, client->id); - - res = cuda_err_print(cuCtxDestroy(*cuda_context), 0); - - if (res == CUDA_SUCCESS) { - if (find_param_by_ptr(¶m, client->cuda_context, ctx_ptr) != 0) { - fprintf(stderr, "Requested param not in given list!\n"); - } else { - *dev_ptr = (uintptr_t) param->rel; - del_param_of_list(param); - } - } + // TODO: free modules/functions allocated handles (?) + gdprintf("Putting CUDA context @%p of client <%d> ...\n", dev_node->cuda_context, client->id); + TIMER_START(&ps_cda); + res = cuda_err_print(cuCtxSetCurrent(NULL), 0); + TIMER_STOP(&ps_cda); + *dev_node_ptr = ctx_ptr; return res; } @@ -511,12 +513,13 @@ int load_module_of_client(uintptr_t *mod_ptr, ProtobufCBinaryData *image, client CUresult res; CUmodule *cuda_module; - // TODO: support more than one modules per client. gdprintf("Loading CUDA module of client <%d> ... ", client->id); cuda_module = malloc_safe(sizeof(*cuda_module)); + TIMER_START(&ps_mla); res = cuda_err_print(cuModuleLoadData(cuda_module, image->data), 0); + TIMER_STOP(&ps_mla); if (res == CUDA_SUCCESS) *mod_ptr = (uintptr_t) cuda_module; @@ -529,12 +532,13 @@ int get_module_function_of_client(uintptr_t *fun_ptr, uintptr_t mod_ptr, char *f CUfunction *cuda_func; CUmodule *cuda_module = (CUmodule *) mod_ptr; - // TODO: support more than one functions per client. gdprintf("Loading CUDA module function of client <%d> ... ", client->id); cuda_func = malloc_safe(sizeof(*cuda_func)); + TIMER_START(&ps_mgfa); res = cuda_err_print(cuModuleGetFunction(cuda_func, *cuda_module, func_name), 0); + TIMER_STOP(&ps_mgfa); if (res == CUDA_SUCCESS) *fun_ptr = (uintptr_t) cuda_func; @@ -547,8 +551,10 @@ int memory_allocate_for_client(uintptr_t *dev_mem_ptr, size_t mem_size) { CUdeviceptr cuda_dev_ptr; gdprintf("Allocating CUDA device memory of size %zuB...\n", mem_size); - + + TIMER_START(&ps_maa); res = cuda_err_print(cuMemAlloc(&cuda_dev_ptr, mem_size), 0); + TIMER_STOP(&ps_maa); if (res == CUDA_SUCCESS) { *dev_mem_ptr = cuda_dev_ptr; gdprintf("allocated @0x%llx\n", cuda_dev_ptr); @@ -563,7 +569,9 @@ int memory_free_for_client(uintptr_t dev_mem_ptr) { gdprintf("Freeing CUDA device memory @0x%llx...\n", cuda_dev_ptr); + TIMER_START(&ps_mfa); res = cuda_err_print(cuMemFree(cuda_dev_ptr), 0); + TIMER_STOP(&ps_mfa); return res; } @@ -574,7 +582,9 @@ int memcpy_host_to_dev_for_client(uintptr_t dev_mem_ptr, void *host_mem_ptr, siz gdprintf("Memcpying %zuB from host to CUDA device @0x%llx...\n", mem_size, cuda_dev_ptr); + TIMER_START(&ps_mhda); res = cuda_err_print(cuMemcpyHtoD(cuda_dev_ptr, host_mem_ptr, mem_size), 0); + TIMER_STOP(&ps_mhda); return res; } @@ -588,7 +598,9 @@ int memcpy_dev_to_host_for_client(void **host_mem_ptr, size_t *host_mem_size, ui *host_mem_size = mem_size; *host_mem_ptr = malloc_safe(mem_size); + TIMER_START(&ps_mdha); res = cuda_err_print(cuMemcpyDtoH(*host_mem_ptr, cuda_dev_ptr, mem_size), 0); + TIMER_STOP(&ps_mdha); return res; } @@ -606,12 +618,12 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData gdprintf("Executing kernel...\n"); if (n_params > 0) { params = malloc_safe(sizeof(void *) * n_params); - + for(i = 0; i < n_params; i++) { params[i] = (void *) uints[9 + i]; } gdprintf("using \n"); - } + } if (n_extras > 0) { extra = malloc_safe(sizeof(void *) * 5); @@ -624,10 +636,12 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData } gdprintf("with grid (x, y, z) = (%u, %u, %u)\n", grid_x, grid_y, grid_z); gdprintf("and block (x, y, z) = (%u, %u, %u)\n", block_x, block_y, block_z); - + + TIMER_START(&ps_lka); res = cuda_err_print(cuLaunchKernel(*func, grid_x, grid_y, grid_z, block_x, block_y, block_z, shared_mem_size, h_stream, params, extra), 0); + TIMER_STOP(&ps_lka); if (params != NULL) free(params); @@ -638,7 +652,7 @@ int launch_kernel_of_client(uint64_t *uints, size_t n_uints, ProtobufCBinaryData return res; } -int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, void **client_list, void **client_handle) { +int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, client_node **client_list, client_node **client_handle) { int cuda_result = 0, arg_count = 0; CudaCmd *cmd = cmd_ptr; uint64_t uint_res = 0, tmp_ptr = 0; @@ -655,93 +669,121 @@ int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_l gdprintf("Processing CUDA_CMD\n"); switch(cmd->type) { case INIT: + TIMER_START(&ps_i); gdprintf("Executing cuInit...\n"); get_client_handle(client_handle, client_list, cmd->int_args[0]); - uint_res = ((client_node *) *client_handle)->id; - // cuInit() should have already been executed by the server + uint_res = (*client_handle)->id; + // cuInit() should have already been executed by the server // by that point... //cuda_result = cuda_err_print(cuInit(cmd->uint_args[0]), 0); cuda_result = CUDA_SUCCESS; res_type = UINT; + TIMER_STOP(&ps_i); break; case DEVICE_GET: + TIMER_START(&ps_dg); gdprintf("Executing cuDeviceGet...\n"); - if (update_device_of_client(&uint_res, free_list, cmd->int_args[0], *client_handle) < 0) + if (update_devices_of_client(&uint_res, free_list, cmd->int_args[0], *client_handle) < 0) cuda_result = CUDA_ERROR_INVALID_DEVICE; else cuda_result = CUDA_SUCCESS; res_type = UINT; + TIMER_STOP(&ps_dg); break; case DEVICE_GET_COUNT: + TIMER_START(&ps_dgc); gdprintf("Executing cuDeviceGetCount...\n"); cuda_result = get_device_count_for_client(&uint_res); res_type = UINT; + TIMER_STOP(&ps_dgc); break; case DEVICE_GET_NAME: + TIMER_START(&ps_dgn); gdprintf("Executing cuDeviceGetName...\n"); cuda_result = get_device_name_for_client(&extra_args, &extra_args_size, cmd->int_args[0], cmd->uint_args[0]); + TIMER_STOP(&ps_dgn); break; case CONTEXT_CREATE: + TIMER_START(&ps_cc); gdprintf("Executing cuCtxCreate...\n"); - cuda_result = assign_device_to_client(cmd->uint_args[1], free_list, busy_list, *client_handle); - if (cuda_result < 0) + cuda_result = attach_device_to_client(cmd->uint_args[1], free_list, busy_list, *client_handle); + if (cuda_result < 0) { + TIMER_STOP(&ps_cc); break; // Handle appropriately in client. + } - cuda_result = create_context_of_client(&uint_res, cmd->uint_args[0], cmd->uint_args[1], *client_handle); + cuda_result = get_context_of_client(&uint_res, cmd->uint_args[0], cmd->uint_args[1], *client_handle); res_type = UINT; + TIMER_STOP(&ps_cc); break; case CONTEXT_DESTROY: + TIMER_START(&ps_cd); gdprintf("Executing cuCtxDestroy...\n"); // We assume that only one context per device is created - cuda_result = destroy_context_of_client(&tmp_ptr, cmd->uint_args[0], *client_handle); + cuda_result = put_context_of_client(&tmp_ptr, cmd->uint_args[0], *client_handle); if (cuda_result == CUDA_SUCCESS) { - free_device_from_client(tmp_ptr, free_list, busy_list, *client_handle); + detach_device_from_client(tmp_ptr, free_list, busy_list, *client_handle); if (cmd->n_uint_args > 1 && cmd->uint_args[1] == 1) { del_client_of_list(*client_handle); *client_handle = NULL; - //((client_node *) *client_handle)->status = 0; } } + TIMER_STOP(&ps_cd); break; case MODULE_LOAD: + TIMER_START(&ps_ml); gdprintf("Executing cuModuleLoad...\n"); //print_file_as_hex(cmd->extra_args[0].data, cmd->extra_args[0].len); cuda_result = load_module_of_client(&uint_res, &(cmd->extra_args[0]), *client_handle); res_type = UINT; + TIMER_STOP(&ps_ml); break; case MODULE_GET_FUNCTION: + TIMER_START(&ps_mgf); gdprintf("Executing cuModuleGetFuction...\n"); cuda_result = get_module_function_of_client(&uint_res, cmd->uint_args[0], cmd->str_args[0], *client_handle); res_type = UINT; + TIMER_STOP(&ps_mgf); break; case MEMORY_ALLOCATE: + TIMER_START(&ps_ma); gdprintf("Executing cuMemAlloc...\n"); cuda_result = memory_allocate_for_client(&uint_res, cmd->uint_args[0]); res_type = UINT; + TIMER_STOP(&ps_ma); break; case MEMORY_FREE: + TIMER_START(&ps_mf); gdprintf("Executing cuMemFree...\n"); cuda_result = memory_free_for_client(cmd->uint_args[0]); + TIMER_STOP(&ps_mf); break; case MEMCPY_HOST_TO_DEV: + TIMER_START(&ps_mhd); gdprintf("Executing cuMemcpyHtoD...\n"); cuda_result = memcpy_host_to_dev_for_client(cmd->uint_args[0], cmd->extra_args[0].data, cmd->extra_args[0].len); + TIMER_STOP(&ps_mhd); break; case MEMCPY_DEV_TO_HOST: + TIMER_START(&ps_mdh); gdprintf("Executing cuMemcpyDtoH...\n"); cuda_result = memcpy_dev_to_host_for_client(&extra_args, &extra_args_size, cmd->uint_args[0], cmd->uint_args[1]); + TIMER_STOP(&ps_mdh); break; case LAUNCH_KERNEL: + TIMER_START(&ps_lk); gdprintf("Executing cuLaunchKernel...\n"); cuda_result = launch_kernel_of_client(cmd->uint_args, cmd->n_uint_args, cmd->extra_args, cmd->n_extra_args); + TIMER_STOP(&ps_lk); break; } + TIMER_START(&ps_ext); if (res_type == UINT) { res_length = sizeof(uint64_t); res_data = &uint_res; - } else if (extra_args_size != 0) { + } else if (extra_args_size != 0) { res_type = BYTES; res_length = extra_args_size; res_data = extra_args; @@ -772,6 +814,7 @@ int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_l if (extra_args != NULL) free(extra_args); + TIMER_STOP(&ps_ext); return arg_count; } @@ -790,7 +833,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) { cuda_dev_count++; } gdprintf("Available CUDA devices: %d\n", cuda_dev_count); - + // Init variables cuda_devs = malloc_safe(sizeof(CudaDeviceList)); cuda_device_list__init(cuda_devs); @@ -809,7 +852,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) { i++; } cuda_devs->devices_free = i; - + // busy i = 0; list_for_each_entry(pos, &busy_list_p->node, node){ @@ -820,7 +863,7 @@ int process_cuda_device_query(void **result, void *free_list, void *busy_list) { cuda_devs_dev[i]->name = pos->cuda_device_name; i++; } - + cuda_devs->n_device = cuda_dev_count; cuda_devs->device = cuda_devs_dev; *result = cuda_devs; @@ -840,7 +883,7 @@ int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type) { cmd->type = type; cmd->arg_count = arg_count; - for (i = 0; i < arg_count; i++) { + for (i = 0; i < arg_count; i++) { switch (args[i]->type) { case INT: cmd->n_int_args = args[i]->elements; diff --git a/src/process.h b/src/process.h index 601ed05..d6e8d05 100644 --- a/src/process.h +++ b/src/process.h @@ -4,15 +4,9 @@ #include #include "list.h" #include "common.h" +#include "timer.h" #define CUDA_DEV_NAME_MAX 100 -typedef struct cuda_device_node_s { - CUdevice *cuda_device; - struct list_head node; - char cuda_device_name[CUDA_DEV_NAME_MAX]; - int is_busy; -} cuda_device_node; - typedef struct param_node_s { struct list_head node; uint32_t id; @@ -20,13 +14,21 @@ typedef struct param_node_s { void *rel; } param_node; +typedef struct cuda_device_node_s { + CUdevice *cuda_device; + struct list_head node; + char cuda_device_name[CUDA_DEV_NAME_MAX]; + CUcontext *cuda_context; + unsigned int client_count; + int is_busy; +} cuda_device_node; + typedef struct client_node_s { int id; - int dev_count; + unsigned int dev_count; unsigned int status; struct list_head node; param_node *cuda_dev_node; - param_node *cuda_context; } client_node; @@ -36,9 +38,9 @@ int discover_cuda_devices(void **free_list, void **busy_list); void print_cuda_devices(void *free_list, void *busy_list); -void print_clients(void *client_list); +void print_clients(client_node *client_list); -unsigned int get_client_status(void *client_handle); +unsigned int get_client_status(client_node *client_handle); uint32_t add_param_to_list(param_node **list, uint64_t uintptr, void *relation); @@ -46,12 +48,19 @@ int find_param_by_id(param_node **param, param_node *list, uint32_t param_id); int del_param_of_list(param_node *param); -int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, void **client_list, void **client_handle); +int process_cuda_cmd(void **result, void *cmd_ptr, void *free_list, void *busy_list, client_node **client_list, client_node **client_handle); int process_cuda_device_query(void **result, void *free_list, void *busy_list); void free_cdn_list(void *list); -int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type); +int pack_cuda_cmd(void **payload, var **args, size_t arg_count, int type); + + +extern __thread gs_timer ps_i, ps_dg, ps_dgc, ps_dgca, ps_dgn, ps_dgna, + ps_cc, ps_cca, ps_cd, ps_cda, ps_ml, ps_mla, + ps_mgf, ps_mgfa, ps_ma, ps_maa, ps_mf, ps_mfa, + ps_mhd, ps_mhda, ps_mdh, ps_mdha, ps_lk, ps_lka, + ps_ext; #endif /* PROCESS_H */ diff --git a/src/protocol.c b/src/protocol.c index b794ea0..78c3517 100644 --- a/src/protocol.c +++ b/src/protocol.c @@ -3,6 +3,9 @@ #include #include #include +#include +#include + #include "protocol.h" #include "common.h" #include "common.pb-c.h" @@ -21,7 +24,7 @@ ssize_t read_socket_msg(int fd, void *buffer, size_t bytes) { exit(EXIT_FAILURE); } //} while (b_total < bytes); - + //printf("Bytes received: %zd\n", b_total); iovlen = msghdr.msg_iovlen; while (iovlen--) { @@ -35,18 +38,51 @@ ssize_t read_socket_msg(int fd, void *buffer, size_t bytes) { } #endif +#include "timer.h" +__thread gs_timer pl_rds, pl_rdm, pl_rd1, pl_rd2, pl_m, pl_re; +__thread unsigned long long pl_rd1a, pl_rd2a; +static __thread gs_timer *rdt; +static __thread unsigned long long *rda; + ssize_t read_socket(int fd, void *buffer, size_t bytes) { ssize_t b_read, b_total = 0; + int flags, err; + unsigned long long tmp = TIMER_TOTAL(rdt); + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + perror("read socket fcntl failed"); + exit(EXIT_FAILURE); + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) { + perror("read socket fcntl failed"); + exit(EXIT_FAILURE); + } + + TIMER_START(rdt); do { b_read = read(fd, buffer+b_total, bytes-b_total); if (b_read < 0) { - perror("read socket failed"); - exit(EXIT_FAILURE); + if (errno == EAGAIN || errno == EWOULDBLOCK) { + TIMER_GET(rdt); + tmp = TIMER_TOTAL(rdt); + } else { + perror("read socket failed"); + exit(EXIT_FAILURE); + } + } else { + b_total += b_read; } - b_total += b_read; } while (b_total < bytes); - + TIMER_STOP(rdt); + + *rda += TIMER_TOTAL(rdt) - tmp; + + if (fcntl(fd, F_SETFL, flags) < 0) { + perror("read socket fcntl failed"); + exit(EXIT_FAILURE); + } + gdprintf("Bytes received: %zd\n", b_total); return b_total; } @@ -63,7 +99,7 @@ ssize_t write_socket(int fd, void *buffer, size_t bytes) { } b_total += b_written; } while (b_total < bytes); - + gdprintf("Bytes sent: %zd\n", b_total); return b_total; } @@ -78,19 +114,31 @@ uint32_t receive_message(void **enc_msg, int sock_fd) { uint32_t msg_length; int ret = 0; + TIMER_START(&pl_m); buffer = malloc_safe(sizeof(uint32_t)); - + TIMER_STOP(&pl_m); + + rdt = &pl_rd1; + rda = &pl_rd1a; + TIMER_START(&pl_rds); // read message length read_socket(sock_fd, buffer, sizeof(uint32_t)); + TIMER_STOP(&pl_rds); msg_length = ntohl(*(uint32_t *)buffer); gdprintf("Going to read a message of %u bytes...\n", msg_length); - + + TIMER_START(&pl_re); buffer = realloc(buffer, msg_length); - + TIMER_STOP(&pl_re); + + rdt = &pl_rd2; + rda = &pl_rd2a; + TIMER_START(&pl_rdm); // read message read_socket(sock_fd, buffer, msg_length); - + TIMER_STOP(&pl_rdm); + *enc_msg = buffer; return msg_length; @@ -105,7 +153,7 @@ int decode_message(void **result, void **payload, void *enc_msg, uint32_t enc_ms fprintf(stderr, "message unpacking failed\n"); return -1; } - + switch (msg->type) { case CUDA_CMD: gdprintf("--------------\nIs CUDA_CMD\n"); @@ -124,11 +172,11 @@ int decode_message(void **result, void **payload, void *enc_msg, uint32_t enc_ms *payload = msg->cuda_devices; break; } - + // We can't call this here unless we make a *deep* copy of the // message payload... //cookie__free_unpacked(msg, NULL); - *result = msg; + *result = msg; return msg->type; } @@ -137,7 +185,7 @@ size_t encode_message(void **result, int msg_type, void *payload) { size_t buf_size; uint32_t msg_length, msg_len_n; Cookie message = COOKIE__INIT; - void *buffer, *msg_buffer; + void *buffer, *msg_buffer; gdprintf("Encoding message data...\n"); message.type = msg_type; diff --git a/src/protocol.h b/src/protocol.h index 2fe88c4..3345075 100644 --- a/src/protocol.h +++ b/src/protocol.h @@ -1,6 +1,8 @@ #ifndef PROTOCOL_H #define PROTOCOL_H +#include "timer.h" + ssize_t read_socket(int fd, void *buffer, size_t bytes); ssize_t write_socket(int fd, void *buffer, size_t bytes); @@ -15,4 +17,7 @@ size_t encode_message(void **result, int msg_type, void *payload); void free_decoded_message(void *msg); +extern __thread gs_timer pl_rds, pl_rdm, pl_rd1, pl_rd2, pl_m, pl_re; +extern __thread unsigned long long pl_rd1a, pl_rd2a; + #endif /* PROTOCOL_H */ diff --git a/src/server.c b/src/server.c index 70e4026..5248e92 100644 --- a/src/server.c +++ b/src/server.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -8,11 +9,208 @@ #include #include -#define TIMERS_ENABLED #include "common.h" #include "common.pb-c.h" #include "protocol.h" #include "process.h" +#include "timer.h" + +void *free_list=NULL, *busy_list=NULL; +client_node *client_list=NULL; + +void *connection_handler(void *socket_desc) { + int client_sock_fd = *(int *) socket_desc, client_id = -1; + int msg_type, resp_type, arg_cnt; + void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL; + client_node *client_handle=NULL; + uint32_t msg_length; + gs_timer mt, rt, dt, pt; + + pl_rd1a = 0; + pl_rd2a = 0; + TIMER_RESET(&mt); + TIMER_RESET(&rt); + TIMER_RESET(&dt); + TIMER_RESET(&pt); + TIMER_RESET(&pl_rds); + TIMER_RESET(&pl_rdm); + TIMER_RESET(&pl_rd1); + TIMER_RESET(&pl_rd2); + TIMER_RESET(&pl_m); + TIMER_RESET(&pl_re); + TIMER_RESET(&ps_i); + TIMER_RESET(&ps_dg); + TIMER_RESET(&ps_dgc); + TIMER_RESET(&ps_dgca); + TIMER_RESET(&ps_dgn); + TIMER_RESET(&ps_dgna); + TIMER_RESET(&ps_cc); + TIMER_RESET(&ps_cca); + TIMER_RESET(&ps_cd); + TIMER_RESET(&ps_cda); + TIMER_RESET(&ps_ml); + TIMER_RESET(&ps_mla); + TIMER_RESET(&ps_mgf); + TIMER_RESET(&ps_mgfa); + TIMER_RESET(&ps_ma); + TIMER_RESET(&ps_maa); + TIMER_RESET(&ps_mf); + TIMER_RESET(&ps_mfa); + TIMER_RESET(&ps_mhd); + TIMER_RESET(&ps_mhda); + TIMER_RESET(&ps_mdh); + TIMER_RESET(&ps_mdha); + TIMER_RESET(&ps_lk); + TIMER_RESET(&ps_lka); + TIMER_RESET(&ps_ext); + + for (;;) { + TIMER_START(&mt); + TIMER_START(&rt); + msg_length = receive_message(&msg, client_sock_fd); + TIMER_STOP(&rt); + TIMER_START(&dt); + if (msg_length > 0) + msg_type = decode_message(&dec_msg, &payload, + msg, msg_length); + + TIMER_STOP(&dt); + gdprintf("Processing message\n"); + TIMER_START(&pt); + switch (msg_type) { + case CUDA_CMD: + arg_cnt = process_cuda_cmd(&result, + payload, free_list, + busy_list, &client_list, + &client_handle); + cuCtxSynchronize(); + resp_type = CUDA_CMD_RESULT; + break; + case CUDA_DEVICE_QUERY: + process_cuda_device_query(&result, + free_list, busy_list); + resp_type = CUDA_DEVICE_LIST; + break; + } + + print_clients(client_list); + print_cuda_devices(free_list, busy_list); + + if (msg != NULL) { + free(msg); + msg = NULL; + } + if (dec_msg != NULL) { + free_decoded_message(dec_msg); + dec_msg = NULL; + // payload should be invalid now + payload = NULL; + } + TIMER_STOP(&pt); + + if (resp_type != -1) { + gdprintf("Sending result\n"); + pack_cuda_cmd(&payload, result, arg_cnt, + CUDA_CMD_RESULT); + msg_length = encode_message(&msg, resp_type, payload); + send_message(client_sock_fd, msg, msg_length); + + if (result != NULL) { + // should be more freeing here... + free(result); + result = NULL; + } + } + + gdprintf(">>\nMessage processed, cleaning up...\n<<\n"); + + if (msg != NULL) { + free(msg); + msg = NULL; + } + + if (client_id == -1) + client_id = client_handle->id; + + TIMER_STOP(&mt); + if (get_client_status(client_handle) == 0) { + // TODO: freeing + printf("\n--------------\nClient %d finished.\n", client_id); + break; + } + } + printf("Client elapsed time (usec): %lf\n\n", + TIMER_TO_USEC(TIMER_TOTAL(&mt))); + + printf("Message average elapsed time (usec):\n- total: %lf\n" + "- receive: %lf\n- decode: %lf\n" + "- process: %lf\n- send: %lf\n", + TIMER_TO_USEC(TIMER_AVG(&mt)), TIMER_TO_USEC(TIMER_AVG(&rt)), + TIMER_TO_USEC(TIMER_AVG(&dt)), TIMER_TO_USEC(TIMER_AVG(&pt)), + TIMER_TO_USEC((TIMER_TOTAL(&mt) - TIMER_TOTAL(&rt) + - TIMER_TOTAL(&dt) - TIMER_TOTAL(&pt)) + / (double) TIMER_COUNT(&mt))); + + printf("\nReceive average elapsed time (usec):\n- malloc: %lf\n" + "- read size: %lf\n |read socket:\n |- total: %lf\n" + " |- actual: %lf\n |- waiting: %lf\n- realloc: %lf\n" + "- read message: %lf\n |read socket:\n |- total: %lf\n" + " |- actual: %lf\n |- waiting: %lf\n", + TIMER_TO_USEC(TIMER_AVG(&pl_m)), + TIMER_TO_USEC(TIMER_AVG(&pl_rds)), + TIMER_TO_USEC(TIMER_AVG(&pl_rd1)), + TIMER_TO_USEC(pl_rd1a / (double) TIMER_COUNT(&pl_rd1)), + TIMER_TO_USEC((TIMER_TOTAL(&pl_rd1) - pl_rd1a) + / (double) TIMER_COUNT(&pl_rd1)), + TIMER_TO_USEC(TIMER_AVG(&pl_re)), + TIMER_TO_USEC(TIMER_AVG(&pl_rdm)), + TIMER_TO_USEC(TIMER_AVG(&pl_rd2)), + TIMER_TO_USEC(pl_rd2a / (double) TIMER_COUNT(&pl_rd2)), + TIMER_TO_USEC((TIMER_TOTAL(&pl_rd2) - pl_rd2a) + / (double) TIMER_COUNT(&pl_rd2))); + + printf("Process average elapsed time (usec):\n" + "- init: %lf\n- device_get: %lf\n" + "- device_get_count:\n |- total: %lf\n |- actual: %lf\n" + "- device_get_name:\n |- total: %lf\n |- actual: %lf\n" + "- context_create:\n |- total: %lf\n |- actual: %lf\n" + "- context_destroy:\n |- total: %lf\n |- actual: %lf\n" + "- module_load:\n |- total: %lf\n |- actual: %lf\n" + "- module_get_function:\n |- total: %lf\n |- actual: %lf\n" + "- memory_allocate:\n |- total: %lf\n |- actual: %lf\n" + "- memory_free:\n |- total: %lf\n |- actual: %lf\n" + "- memcpy_h_to_d:\n |- total: %lf\n |- actual: %lf\n" + "- memcpy_d_to_h:\n |- total: %lf\n |- actual: %lf\n" + "- launch_kernel:\n |- total: %lf\n |- actual: %lf\n" + "- extra: %lf\n\n", + TIMER_TO_USEC(TIMER_AVG(&ps_i)), + TIMER_TO_USEC(TIMER_AVG(&ps_dg)), + TIMER_TO_USEC(TIMER_AVG(&ps_dgc)), + TIMER_TO_USEC(TIMER_AVG(&ps_dgca)), + TIMER_TO_USEC(TIMER_AVG(&ps_dgn)), + TIMER_TO_USEC(TIMER_AVG(&ps_dgna)), + TIMER_TO_USEC(TIMER_AVG(&ps_cc)), + TIMER_TO_USEC(TIMER_AVG(&ps_cca)), + TIMER_TO_USEC(TIMER_AVG(&ps_cd)), + TIMER_TO_USEC(TIMER_AVG(&ps_cda)), + TIMER_TO_USEC(TIMER_AVG(&ps_ml)), + TIMER_TO_USEC(TIMER_AVG(&ps_mla)), + TIMER_TO_USEC(TIMER_AVG(&ps_mgf)), + TIMER_TO_USEC(TIMER_AVG(&ps_mgfa)), + TIMER_TO_USEC(TIMER_AVG(&ps_ma)), + TIMER_TO_USEC(TIMER_AVG(&ps_maa)), + TIMER_TO_USEC(TIMER_AVG(&ps_mf)), + TIMER_TO_USEC(TIMER_AVG(&ps_mfa)), + TIMER_TO_USEC(TIMER_AVG(&ps_mhd)), + TIMER_TO_USEC(TIMER_AVG(&ps_mhda)), + TIMER_TO_USEC(TIMER_AVG(&ps_mdh)), + TIMER_TO_USEC(TIMER_AVG(&ps_mdha)), + TIMER_TO_USEC(TIMER_AVG(&ps_lk)), + TIMER_TO_USEC(TIMER_AVG(&ps_lka)), + TIMER_TO_USEC(TIMER_AVG(&ps_ext))); + +} + int init_server_net(const char *port, struct addrinfo *addr) { int socket_fd, ret; @@ -63,16 +261,15 @@ int init_server(char *port, struct addrinfo *addr, void **free_list, void **busy } int main(int argc, char *argv[]) { - int server_sock_fd, client_sock_fd, msg_type, resp_type, arg_cnt; + int server_sock_fd, client_sock_fd, msg_type, resp_type, arg_cnt, *new_sock; struct sockaddr_in client_addr; struct addrinfo local_addr; char server_ip[16] /* IPv4 */, server_port[6], *local_port, client_host[NI_MAXHOST], client_serv[NI_MAXSERV]; socklen_t s; - void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL, - *free_list=NULL, *busy_list=NULL, *client_list=NULL, *client_handle=NULL; + void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL; uint32_t msg_length; - timers_t tm; + pthread_t sniffer_thread; if (argc > 2) { printf("Usage: server \n"); @@ -112,65 +309,12 @@ int main(int argc, char *argv[]) { else printf("from unidentified client"); - TIMER_RESET(&tm); - for(;;) { - msg_length = receive_message(&msg, client_sock_fd); - if (msg_length > 0) - msg_type = decode_message(&dec_msg, &payload, msg, msg_length); - - printf("Processing message\n"); - TIMER_START(&tm); - switch (msg_type) { - case CUDA_CMD: - arg_cnt = process_cuda_cmd(&result, payload, free_list, busy_list, &client_list, &client_handle); - resp_type = CUDA_CMD_RESULT; - break; - case CUDA_DEVICE_QUERY: - process_cuda_device_query(&result, free_list, busy_list); - resp_type = CUDA_DEVICE_LIST; - break; - } - - print_clients(client_list); - print_cuda_devices(free_list, busy_list); - - if (msg != NULL) { - free(msg); - msg = NULL; - } - if (dec_msg != NULL) { - free_decoded_message(dec_msg); - dec_msg = NULL; - // payload should be invalid now - payload = NULL; - } - - if (resp_type != -1) { - gdprintf("Sending result\n"); - pack_cuda_cmd(&payload, result, arg_cnt, CUDA_CMD_RESULT); - msg_length = encode_message(&msg, resp_type, payload); - send_message(client_sock_fd, msg, msg_length); - - if (result != NULL) { - // should be more freeing here... - free(result); - result = NULL; - } - } - printf(">>\nMessage processed, cleaning up...\n<<\n"); - TIMER_STOP(&tm); - if (msg != NULL) { - free(msg); - msg = NULL; - } - - if (get_client_status(client_handle) == 0) { - // TODO: freeing - printf("\n--------------\nClient finished.\n\n"); - break; - } + new_sock = malloc_safe(sizeof(*new_sock)); + *new_sock = client_sock_fd; + if (pthread_create(&sniffer_thread, NULL, connection_handler, (void *)new_sock) < 0) { + fprintf(stderr, "could not create thread\n"); + return 1; } - printf("message needed %lf to be processed\n", TICKS_TO_USEC(TIMER_TOTAL(&tm))); } close(client_sock_fd); diff --git a/src/timer.h b/src/timer.h new file mode 100644 index 0000000..c3c6452 --- /dev/null +++ b/src/timer.h @@ -0,0 +1,57 @@ +#ifndef GPUSOCK_TIMER_H +#define GPUSOCK_TIMER_H + +typedef struct timer { + unsigned long long total; + unsigned long long val; + unsigned long cnt; +} gs_timer; + + +#ifdef TIMERS_ENABLED + +#include + +static inline unsigned long long get_time() +{ + struct timespec ts; + unsigned long long time; + + if(clock_gettime(CLOCK_MONOTONIC, &ts) == -1) { + perror("clock gettime"); + exit(EXIT_FAILURE); + } + time = ts.tv_sec * 1000000000 + ts.tv_nsec; + //printf("time: %llu ns\n", time); + + return time; +} + + +#define TIMER_START(tm) do { (tm)->val = get_time(); } while (0) +#define TIMER_STOP(tm) do { (tm)->total += get_time() - (tm)->val; ++(tm)->cnt; } while (0) +#define TIMER_GET(tm) do { (tm)->total += get_time() - (tm)->val; (tm)->val = get_time(); } while (0) +#define TIMER_RESET(tm) do { (tm)->total = (tm)->val = 0; (tm)->cnt = 0; } while (0) +#define TIMER_TOTAL(tm) ((tm)->total) +#define TIMER_COUNT(tm) ((tm)->cnt) +#define TIMER_AVG(tm) ((tm)->cnt ? ((tm)->total / (tm)->cnt) : 0) + +#define TIMER_TO_SEC(t) (t / 1000000000.0) +#define TIMER_TO_USEC(t) (t / 1000.0) + +#else + +#define TIMER_START(a) +#define TIMER_STOP(a) +#define TIMER_GET(a) +#define TIMER_RESET(a) +#define TIMER_TOTAL(a) 0ULL +#define TIMER_COUNT(a) 0UL +#define TIMER_AVG(a) 0ULL + +#define TIMER_TO_SEC(a) 0 +#define TIMER_TO_USEC(a) 0 + +#endif /* TIMER_ENABLED */ + +#endif /* GPUSOCK_TIMER_H */ diff --git a/src/timers.h b/src/timers.h deleted file mode 100644 index 7d13f42..0000000 --- a/src/timers.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (C) Anastassios Nanos 2012 (See AUTHORS file) - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the Free - * Software Foundation; either version 2 of the License, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. You should have received a copy of the GNU General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#define _GNU_SOURCE - -#ifndef __gpusock_timers_h__ -#define __gpusock_timers_h__ - -typedef struct timers { - unsigned long long total; - unsigned long long val; - unsigned long cnt; -} timers_t; - - -#ifdef TIMERS_ENABLED - -#include -//#include - -unsigned long long get_cycles() -{ - struct timeval tv; - unsigned long long paparia; - gettimeofday(&tv,NULL); - paparia = tv.tv_sec * 1000000 + tv.tv_usec; - printf("paparia:%ld\n", paparia); - return paparia; -} - - -#define TIMER_START(tp) do {( tp)->val = get_cycles(); } while (0) -#define TIMER_STOP(tp) do { (tp)->total += get_cycles() - (tp)->val; ++(tp)->cnt; } while (0) -#define TIMER_RESET(tp) do { (tp)->total = (tp)->val = 0; (tp)->cnt = 0; } while (0) -#define TIMER_TOTAL(tp) ((tp)->total) -#define TIMER_COUNT(tp) ((tp)->cnt) -#define TIMER_AVG(tp) ((tp)->cnt ? ((tp)->total / (tp)->cnt) : -1) - -#define TICKS_TO_USEC(t) (1000 * t/CYCLES_PER_SEC) - -#else - -#define TIMER_START(a) -#define TIMER_STOP(a) -#define TIMER_TOTAL(a) 0ULL -#define TIMER_COUNT(a) 0UL -#define TIMER_RESET(a) -#define TICKS_TO_USEC(a) 0ULL - -#endif /* TIMERS_ENABLED */ - -#define var_name(x) #x -#define gpusock_timer_reset(x) TIMER_RESET(x); -#endif /* __gpusock_timers_h__ */ From 9bb8f4e3fef85dce6e50b83f06843012fe5aeba0 Mon Sep 17 00:00:00 2001 From: papazof Date: Wed, 12 Nov 2014 00:21:40 +0200 Subject: [PATCH 3/3] Server send message timing breakdown. --- src/server.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/server.c b/src/server.c index 5248e92..2e1dabb 100644 --- a/src/server.c +++ b/src/server.c @@ -24,7 +24,7 @@ void *connection_handler(void *socket_desc) { void *msg=NULL, *payload=NULL, *result=NULL, *dec_msg=NULL; client_node *client_handle=NULL; uint32_t msg_length; - gs_timer mt, rt, dt, pt; + gs_timer mt, rt, dt, pt, st, st1, st2, st3; pl_rd1a = 0; pl_rd2a = 0; @@ -32,6 +32,10 @@ void *connection_handler(void *socket_desc) { TIMER_RESET(&rt); TIMER_RESET(&dt); TIMER_RESET(&pt); + TIMER_RESET(&st); + TIMER_RESET(&st1); + TIMER_RESET(&st2); + TIMER_RESET(&st3); TIMER_RESET(&pl_rds); TIMER_RESET(&pl_rdm); TIMER_RESET(&pl_rd1); @@ -107,13 +111,19 @@ void *connection_handler(void *socket_desc) { payload = NULL; } TIMER_STOP(&pt); - + TIMER_START(&st); if (resp_type != -1) { gdprintf("Sending result\n"); + TIMER_START(&st1); pack_cuda_cmd(&payload, result, arg_cnt, CUDA_CMD_RESULT); + TIMER_STOP(&st1); + TIMER_START(&st2); msg_length = encode_message(&msg, resp_type, payload); + TIMER_STOP(&st2); + TIMER_START(&st3); send_message(client_sock_fd, msg, msg_length); + TIMER_STOP(&st3); if (result != NULL) { // should be more freeing here... @@ -121,7 +131,7 @@ void *connection_handler(void *socket_desc) { result = NULL; } } - + TIMER_STOP(&st); gdprintf(">>\nMessage processed, cleaning up...\n<<\n"); if (msg != NULL) { @@ -144,12 +154,14 @@ void *connection_handler(void *socket_desc) { printf("Message average elapsed time (usec):\n- total: %lf\n" "- receive: %lf\n- decode: %lf\n" - "- process: %lf\n- send: %lf\n", + "- process: %lf\n- send:\n |- total: %lf\n |- pack:% lf\n" + " |- encode: %lf\n |- actual: %lf\n", TIMER_TO_USEC(TIMER_AVG(&mt)), TIMER_TO_USEC(TIMER_AVG(&rt)), TIMER_TO_USEC(TIMER_AVG(&dt)), TIMER_TO_USEC(TIMER_AVG(&pt)), - TIMER_TO_USEC((TIMER_TOTAL(&mt) - TIMER_TOTAL(&rt) - - TIMER_TOTAL(&dt) - TIMER_TOTAL(&pt)) - / (double) TIMER_COUNT(&mt))); + TIMER_TO_USEC(TIMER_AVG(&st)), + TIMER_TO_USEC(TIMER_AVG(&st1)), + TIMER_TO_USEC(TIMER_AVG(&st2)), + TIMER_TO_USEC(TIMER_AVG(&st3))); printf("\nReceive average elapsed time (usec):\n- malloc: %lf\n" "- read size: %lf\n |read socket:\n |- total: %lf\n"