From 42387e8405dc7217c4f5a3a12370464d9dd106bc Mon Sep 17 00:00:00 2001 From: Scott Francis Date: Wed, 18 Mar 2020 20:27:32 -0400 Subject: [PATCH 01/21] Initial C extension implementation --- .gitignore | 1 + Rakefile | 11 +++++++++ ext/statsd/extconf.rb | 5 ++++ ext/statsd/statsd.c | 43 +++++++++++++++++++++++++++++++++++ lib/statsd/instrument.rb | 1 + statsd-instrument.gemspec | 2 ++ test/datagram_builder_test.rb | 1 + 7 files changed, 64 insertions(+) create mode 100644 ext/statsd/extconf.rb create mode 100644 ext/statsd/statsd.c diff --git a/.gitignore b/.gitignore index 804eafeb..36ce0d79 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ doc *.gem .bundle Gemfile.lock +lib/statsd/instrument/ext/* pkg/* vendor/ tmp/* diff --git a/Rakefile b/Rakefile index 068e3e2e..f016ddfb 100644 --- a/Rakefile +++ b/Rakefile @@ -3,10 +3,21 @@ require 'bundler/gem_tasks' require 'rake/testtask' +GEMSPEC = eval(File.read('statsd-instrument.gemspec')) + +require 'rake/extensiontask' +Rake::ExtensionTask.new('statsd', GEMSPEC) do |ext| + ext.ext_dir = 'ext/statsd' + ext.lib_dir = 'lib/statsd/instrument/ext' +end +task :build => :compile + Rake::TestTask.new('test') do |t| t.ruby_opts << '-r rubygems' t.libs << 'lib' << 'test' t.test_files = FileList['test/**/*_test.rb'] end +task :test => :build + task default: :test diff --git a/ext/statsd/extconf.rb b/ext/statsd/extconf.rb new file mode 100644 index 00000000..a4fa4a7d --- /dev/null +++ b/ext/statsd/extconf.rb @@ -0,0 +1,5 @@ +require 'mkmf' + +$CFLAGS = "-O3" + +create_makefile('statsd/statsd') diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c new file mode 100644 index 00000000..08036df4 --- /dev/null +++ b/ext/statsd/statsd.c @@ -0,0 +1,43 @@ +#include +#include + +static VALUE idTr; +static VALUE strNormalizeChars, strNormalizeReplacement; + +static VALUE +normalize_name(VALUE self, VALUE name) { + Check_Type(name, T_STRING); + + char *name_start = RSTRING_PTR(name); + char *name_end = RSTRING_END(name); + + while (name_start < name_end) { + if (*name_start == ':' || *name_start == '|' || *name_start == '@') { + break; + } + name_start++; + } + + if (name_start == name_end) { + return name; + } + return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); +} + +void Init_statsd() +{ + VALUE mStatsd, mInstrument, cDatagramBuilder; + + mStatsd = rb_define_module("StatsD"); + mInstrument = rb_define_module_under(mStatsd, "Instrument"); + cDatagramBuilder = rb_define_class_under(mInstrument, "DatagramBuilder", rb_cObject); + + idTr = rb_intern("tr"); + strNormalizeChars = rb_str_new_cstr(":|@"); + strNormalizeReplacement = rb_str_new_cstr("_"); + + rb_global_variable(&strNormalizeChars); + rb_global_variable(&strNormalizeReplacement); + + rb_define_protected_method(cDatagramBuilder, "normalize_name", normalize_name, 1); +} diff --git a/lib/statsd/instrument.rb b/lib/statsd/instrument.rb index 20d00436..7001669b 100644 --- a/lib/statsd/instrument.rb +++ b/lib/statsd/instrument.rb @@ -359,6 +359,7 @@ def singleton_client require 'statsd/instrument/helpers' require 'statsd/instrument/assertions' require 'statsd/instrument/expectation' +require 'statsd/instrument/ext/statsd' require 'statsd/instrument/matchers' if defined?(::RSpec) require 'statsd/instrument/railtie' if defined?(::Rails::Railtie) require 'statsd/instrument/strict' if ENV['STATSD_STRICT_MODE'] diff --git a/statsd-instrument.gemspec b/statsd-instrument.gemspec index e6c51900..05a950a7 100644 --- a/statsd-instrument.gemspec +++ b/statsd-instrument.gemspec @@ -18,6 +18,7 @@ Gem::Specification.new do |spec| spec.files = `git ls-files`.split($/) spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) + spec.extensions = ['ext/statsd/extconf.rb'] spec.require_paths = ["lib"] spec.add_development_dependency 'rake' @@ -27,4 +28,5 @@ Gem::Specification.new do |spec| spec.add_development_dependency 'yard' spec.add_development_dependency 'rubocop' spec.add_development_dependency 'benchmark-ips' + spec.add_development_dependency 'rake-compiler' end diff --git a/test/datagram_builder_test.rb b/test/datagram_builder_test.rb index fe3ec3db..b47e65a8 100644 --- a/test/datagram_builder_test.rb +++ b/test/datagram_builder_test.rb @@ -12,6 +12,7 @@ def test_normalize_name assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo|o') assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo@o') assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo:o') + assert_equal 'foo_', @datagram_builder.send(:normalize_name, 'foo:') end def test_normalize_unsupported_tag_names From ddfce73006155f13e10e4cdfcdea13bd571a9cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sat, 21 Mar 2020 19:34:41 +0000 Subject: [PATCH 02/21] Enable warnings + pedantic flag and allow for easier debugging by setting ENV var STATSD_EXT_DEBUG --- ext/statsd/extconf.rb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ext/statsd/extconf.rb b/ext/statsd/extconf.rb index a4fa4a7d..5ab47bf2 100644 --- a/ext/statsd/extconf.rb +++ b/ext/statsd/extconf.rb @@ -1,5 +1,12 @@ require 'mkmf' -$CFLAGS = "-O3" +append_cflags '-pedantic' +append_cflags '-Wall' +if ENV['STATSD_EXT_DEBUG'] + append_cflags "-Og" + append_cflags '-ggdb3' +else + append_cflags "-O3" +end create_makefile('statsd/statsd') From 37f71d2a5f96b2d985a85ad2d6ad22ca3cd9bbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sat, 21 Mar 2020 19:44:29 +0000 Subject: [PATCH 03/21] Implement DatagramBuilder#generate_generic_datagram as a C function with a static buffer --- ext/statsd/statsd.c | 93 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 5 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 08036df4..ac83f188 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -1,15 +1,19 @@ #include #include -static VALUE idTr; +#define MAX_DATAGRAM_SIZE 4096 + +static ID idTr, idNormalizeTags, idDefaultTags, idPrefix; static VALUE strNormalizeChars, strNormalizeReplacement; static VALUE normalize_name(VALUE self, VALUE name) { + char *name_start = NULL; + char *name_end = NULL; Check_Type(name, T_STRING); - char *name_start = RSTRING_PTR(name); - char *name_end = RSTRING_END(name); + name_start = RSTRING_PTR(name); + name_end = RSTRING_END(name); while (name_start < name_end) { if (*name_start == ':' || *name_start == '|' || *name_start == '@') { @@ -24,6 +28,82 @@ normalize_name(VALUE self, VALUE name) { return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); } +static VALUE +generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { + VALUE prefix, normalized_name, str_value, str_sample_rate, default_tags, tag; + VALUE normalized_tags = Qnil; + char datagram[MAX_DATAGRAM_SIZE]; + int empty_default_tags = 1, empty_tags = 1; + int len = 0, tags_len = 0, i = 0; + + MEMZERO(&datagram, char, MAX_DATAGRAM_SIZE); + + prefix = rb_ivar_get(self, idPrefix); + if (RSTRING_LEN(prefix) != 0) { + memcpy(datagram, StringValuePtr(prefix), RSTRING_LEN(prefix)); + len += RSTRING_LEN(prefix); + } + + normalized_name = normalize_name(self, name); + memcpy(datagram + len, StringValuePtr(normalized_name), RSTRING_LEN(normalized_name)); + len += RSTRING_LEN(normalized_name); + + memcpy(datagram + len, ":", 1); + len += 1; + str_value = rb_obj_as_string(value); + memcpy(datagram + len, StringValuePtr(str_value), RSTRING_LEN(str_value)); + len += RSTRING_LEN(str_value); + + memcpy(datagram + len, "|", 1); + len += 1; + memcpy(datagram + len, StringValuePtr(type), RSTRING_LEN(type)); + len += RSTRING_LEN(type); + + if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { + memcpy(datagram + len, "|@", 2); + len += 2; + str_sample_rate = rb_obj_as_string(sample_rate); + memcpy(datagram + len, StringValuePtr(str_sample_rate), RSTRING_LEN(str_sample_rate)); + len += RSTRING_LEN(str_sample_rate); + } + + default_tags = rb_ivar_get(self, idDefaultTags); + + empty_default_tags = (RTEST(default_tags) ? RARRAY_LEN(default_tags) == 0 : 0); + if (RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) { + empty_tags = 0; + } else if (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0) { + empty_tags = 0; + } + + if (empty_default_tags && !empty_tags) { + normalized_tags = rb_funcall(self, idNormalizeTags, 1, tags); + } else if (!empty_default_tags && !empty_tags) { + normalized_tags = rb_ary_concat(rb_funcall(self, idNormalizeTags, 1, tags), default_tags); + } else if (!empty_default_tags && empty_tags) { + normalized_tags = default_tags; + } + + if (RTEST(normalized_tags)) { + memcpy(datagram + len, "|#", 2); + len += 2; + + tags_len = RARRAY_LEN(normalized_tags); + for (i = 0; i < tags_len; ++i) { + tag = RARRAY_AREF(normalized_tags, i); + memcpy(datagram + len, StringValuePtr(tag), RSTRING_LEN(tag)); + len += RSTRING_LEN(tag); + if (i < tags_len - 1) { + memcpy(datagram + len, ",", 1); + len += 1; + } + } + } + + return rb_str_new(datagram, len); + RB_GC_GUARD(normalized_tags); +} + void Init_statsd() { VALUE mStatsd, mInstrument, cDatagramBuilder; @@ -33,11 +113,14 @@ void Init_statsd() cDatagramBuilder = rb_define_class_under(mInstrument, "DatagramBuilder", rb_cObject); idTr = rb_intern("tr"); + idNormalizeTags = rb_intern("normalize_tags"); + idDefaultTags = rb_intern("@default_tags"); + idPrefix = rb_intern("@prefix"); strNormalizeChars = rb_str_new_cstr(":|@"); - strNormalizeReplacement = rb_str_new_cstr("_"); - rb_global_variable(&strNormalizeChars); + strNormalizeReplacement = rb_str_new_cstr("_"); rb_global_variable(&strNormalizeReplacement); rb_define_protected_method(cDatagramBuilder, "normalize_name", normalize_name, 1); + rb_define_protected_method(cDatagramBuilder, "generate_generic_datagram", generate_generic_datagram, 5); } From 2c46dba30afd41b49ad53f4e6d47ad378c49b0e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sat, 21 Mar 2020 21:08:07 +0000 Subject: [PATCH 04/21] Introduce support for conditionally enabling GC.stress for the test suite with env var GC_STRESS --- test/test_helper.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_helper.rb b/test/test_helper.rb index 96a676f3..353b6ea8 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -17,3 +17,4 @@ def self.strict_mode_enabled? end StatsD.logger = Logger.new(File::NULL) +GC.stress = true if ENV['GC_STRESS'] From 5913e0387a7f02a76d513fe5b728ab34c832c548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 01:00:36 +0000 Subject: [PATCH 05/21] Prevent buffer overflows in DatagramBuilder#generate_generic_datagram, favoring broken packets to undefined behavior --- ext/statsd/statsd.c | 43 +++++++++++++++++++++++++---------- test/datagram_builder_test.rb | 28 +++++++++++++++++++++++ 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index ac83f188..af746ac7 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -35,36 +35,50 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE char datagram[MAX_DATAGRAM_SIZE]; int empty_default_tags = 1, empty_tags = 1; int len = 0, tags_len = 0, i = 0; + long chunk_len = 0; MEMZERO(&datagram, char, MAX_DATAGRAM_SIZE); prefix = rb_ivar_get(self, idPrefix); if (RSTRING_LEN(prefix) != 0) { - memcpy(datagram, StringValuePtr(prefix), RSTRING_LEN(prefix)); - len += RSTRING_LEN(prefix); + chunk_len = RSTRING_LEN(prefix); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram, StringValuePtr(prefix), chunk_len); + len += chunk_len; } normalized_name = normalize_name(self, name); - memcpy(datagram + len, StringValuePtr(normalized_name), RSTRING_LEN(normalized_name)); - len += RSTRING_LEN(normalized_name); + chunk_len = RSTRING_LEN(normalized_name); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram + len, StringValuePtr(normalized_name), chunk_len); + len += chunk_len; + if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; memcpy(datagram + len, ":", 1); len += 1; str_value = rb_obj_as_string(value); - memcpy(datagram + len, StringValuePtr(str_value), RSTRING_LEN(str_value)); - len += RSTRING_LEN(str_value); + chunk_len = RSTRING_LEN(str_value); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram + len, StringValuePtr(str_value), chunk_len); + len += chunk_len; + if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; memcpy(datagram + len, "|", 1); len += 1; - memcpy(datagram + len, StringValuePtr(type), RSTRING_LEN(type)); - len += RSTRING_LEN(type); + chunk_len = RSTRING_LEN(type); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram + len, StringValuePtr(type), chunk_len); + len += chunk_len; if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { + if (len + 2 > MAX_DATAGRAM_SIZE) goto finalize_datagram; memcpy(datagram + len, "|@", 2); len += 2; str_sample_rate = rb_obj_as_string(sample_rate); - memcpy(datagram + len, StringValuePtr(str_sample_rate), RSTRING_LEN(str_sample_rate)); - len += RSTRING_LEN(str_sample_rate); + chunk_len = RSTRING_LEN(str_sample_rate); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram + len, StringValuePtr(str_sample_rate), chunk_len); + len += chunk_len; } default_tags = rb_ivar_get(self, idDefaultTags); @@ -85,21 +99,26 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE } if (RTEST(normalized_tags)) { + if (len + 2 > MAX_DATAGRAM_SIZE) goto finalize_datagram; memcpy(datagram + len, "|#", 2); len += 2; tags_len = RARRAY_LEN(normalized_tags); for (i = 0; i < tags_len; ++i) { tag = RARRAY_AREF(normalized_tags, i); - memcpy(datagram + len, StringValuePtr(tag), RSTRING_LEN(tag)); - len += RSTRING_LEN(tag); + chunk_len = RSTRING_LEN(tag); + if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + memcpy(datagram + len, StringValuePtr(tag), chunk_len); + len += chunk_len; if (i < tags_len - 1) { + if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; memcpy(datagram + len, ",", 1); len += 1; } } } +finalize_datagram: return rb_str_new(datagram, len); RB_GC_GUARD(normalized_tags); } diff --git a/test/datagram_builder_test.rb b/test/datagram_builder_test.rb index b47e65a8..67053b97 100644 --- a/test/datagram_builder_test.rb +++ b/test/datagram_builder_test.rb @@ -116,4 +116,32 @@ def test_default_tags datagram = datagram_builder.c('bar', 1, nil, nil) assert_equal 'bar:1|c|#foo', datagram end + + def test_builder_buffer_overflow + # prefix + name overflow + datagram_builder = StatsD::Instrument::DatagramBuilder.new(prefix: 'a' * 2047) + datagram = datagram_builder.c('b' * 2048, 1, nil, nil) + assert_equal ('a' * 2047) + "." + ('b' * 2048), datagram + # name overflows + datagram_builder = StatsD::Instrument::DatagramBuilder.new + datagram = datagram_builder.c('a' * 4096, 1, nil, nil) + assert_equal 'a' * 4096, datagram + # value overflows + datagram_builder = StatsD::Instrument::DatagramBuilder.new + datagram = datagram_builder.c('a' * 4093, 100, nil, nil) + assert_equal ('a' * 4093) + ":", datagram + # type overflows + datagram_builder = StatsD::Instrument::DatagramBuilder.new + datagram = datagram_builder.c('a' * 4093, 1, nil, nil) + assert_equal ('a' * 4093) + ":1|", datagram + # sample rate overflows + datagram_builder = StatsD::Instrument::DatagramBuilder.new + datagram = datagram_builder.c('a' * 4088, 1, 0.5, nil) + assert_equal ('a' * 4088) + ":1|c|@", datagram + # tag overflows + tags = 1000.times {|i| "tag:#{i}" } + datagram_builder = StatsD::Instrument::DatagramBuilder.new + datagram = datagram_builder.c('a' * 2048, 1, 0.5, tags) + assert_equal ('a' * 2048) + ":1|c|@0.5", datagram + end end From f0c9cec622e15971acec27a107e6eff431d6cffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 01:43:32 +0000 Subject: [PATCH 06/21] Introduce a bounded cache for DatagramBuilder#normalize_tags used from the C ext only (normalize_tags is a pure function) --- ext/statsd/statsd.c | 56 ++++++++++++++++------- lib/statsd/instrument/datagram_builder.rb | 1 + 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index af746ac7..7d935c07 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -1,9 +1,10 @@ #include #include -#define MAX_DATAGRAM_SIZE 4096 +#define DATAGRAM_SIZE_MAX 4096 +#define TAGS_CACHE_MAX 1024 -static ID idTr, idNormalizeTags, idDefaultTags, idPrefix; +static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idTagsCache; static VALUE strNormalizeChars, strNormalizeReplacement; static VALUE @@ -28,55 +29,75 @@ normalize_name(VALUE self, VALUE name) { return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); } +/* pure function not exposed to ruby with an intermediate bounded cache */ +static VALUE +normalized_tags_cached(VALUE self, VALUE tags) +{ + VALUE cached; + VALUE cache = rb_ivar_get(self, idTagsCache); + VALUE key = rb_hash(tags); + cached = rb_hash_aref(cache, key); + if (RTEST(cached)) { + return cached; + } else if (rb_hash_size_num(cache) < TAGS_CACHE_MAX) { + cached = rb_funcall(self, idNormalizeTags, 1, tags); + rb_hash_aset(cache, key, cached); + return cached; + } + return rb_funcall(self, idNormalizeTags, 1, tags); + RB_GC_GUARD(key); + RB_GC_GUARD(cached); +} + static VALUE generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { VALUE prefix, normalized_name, str_value, str_sample_rate, default_tags, tag; VALUE normalized_tags = Qnil; - char datagram[MAX_DATAGRAM_SIZE]; + char datagram[DATAGRAM_SIZE_MAX]; int empty_default_tags = 1, empty_tags = 1; int len = 0, tags_len = 0, i = 0; long chunk_len = 0; - MEMZERO(&datagram, char, MAX_DATAGRAM_SIZE); + MEMZERO(&datagram, char, DATAGRAM_SIZE_MAX); prefix = rb_ivar_get(self, idPrefix); if (RSTRING_LEN(prefix) != 0) { chunk_len = RSTRING_LEN(prefix); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram, StringValuePtr(prefix), chunk_len); len += chunk_len; } normalized_name = normalize_name(self, name); chunk_len = RSTRING_LEN(normalized_name); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(normalized_name), chunk_len); len += chunk_len; - if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, ":", 1); len += 1; str_value = rb_obj_as_string(value); chunk_len = RSTRING_LEN(str_value); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(str_value), chunk_len); len += chunk_len; - if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, "|", 1); len += 1; chunk_len = RSTRING_LEN(type); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(type), chunk_len); len += chunk_len; if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { - if (len + 2 > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, "|@", 2); len += 2; str_sample_rate = rb_obj_as_string(sample_rate); chunk_len = RSTRING_LEN(str_sample_rate); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(str_sample_rate), chunk_len); len += chunk_len; } @@ -91,15 +112,15 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE } if (empty_default_tags && !empty_tags) { - normalized_tags = rb_funcall(self, idNormalizeTags, 1, tags); + normalized_tags = normalized_tags_cached(self, tags); } else if (!empty_default_tags && !empty_tags) { - normalized_tags = rb_ary_concat(rb_funcall(self, idNormalizeTags, 1, tags), default_tags); + normalized_tags = rb_ary_concat(normalized_tags_cached(self, tags), default_tags); } else if (!empty_default_tags && empty_tags) { normalized_tags = default_tags; } if (RTEST(normalized_tags)) { - if (len + 2 > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, "|#", 2); len += 2; @@ -107,11 +128,11 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE for (i = 0; i < tags_len; ++i) { tag = RARRAY_AREF(normalized_tags, i); chunk_len = RSTRING_LEN(tag); - if (len + chunk_len > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(tag), chunk_len); len += chunk_len; if (i < tags_len - 1) { - if (len + 1 > MAX_DATAGRAM_SIZE) goto finalize_datagram; + if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, ",", 1); len += 1; } @@ -135,6 +156,7 @@ void Init_statsd() idNormalizeTags = rb_intern("normalize_tags"); idDefaultTags = rb_intern("@default_tags"); idPrefix = rb_intern("@prefix"); + idTagsCache = rb_intern("@tags_cache"); strNormalizeChars = rb_str_new_cstr(":|@"); rb_global_variable(&strNormalizeChars); strNormalizeReplacement = rb_str_new_cstr("_"); diff --git a/lib/statsd/instrument/datagram_builder.rb b/lib/statsd/instrument/datagram_builder.rb index c7272718..dbc849fc 100644 --- a/lib/statsd/instrument/datagram_builder.rb +++ b/lib/statsd/instrument/datagram_builder.rb @@ -30,6 +30,7 @@ def self.datagram_class def initialize(prefix: nil, default_tags: nil) @prefix = prefix.nil? ? "" : "#{normalize_name(prefix)}." @default_tags = normalize_tags(default_tags) + @tags_cache = {} end def c(name, value, sample_rate, tags) From 86a26feb7bdefc8daf66849ff2020ba8f4574c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 15:11:21 +0000 Subject: [PATCH 07/21] Let the C ext be a prepended module as opposed to redefinition which also allows the normalized tags cache to be decoupled from Ruby land --- ext/statsd/statsd.c | 24 +++++++++++++++++------ lib/statsd/instrument/datagram_builder.rb | 1 - 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 7d935c07..9e257554 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -4,9 +4,17 @@ #define DATAGRAM_SIZE_MAX 4096 #define TAGS_CACHE_MAX 1024 -static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idTagsCache; +static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idNormalizedTagsCache; static VALUE strNormalizeChars, strNormalizeReplacement; +static VALUE +initialize(int argc, VALUE *argv, VALUE self) +{ + rb_call_super(argc, argv); + rb_ivar_set(self, idNormalizedTagsCache, rb_hash_new()); + return self; +} + static VALUE normalize_name(VALUE self, VALUE name) { char *name_start = NULL; @@ -34,7 +42,7 @@ static VALUE normalized_tags_cached(VALUE self, VALUE tags) { VALUE cached; - VALUE cache = rb_ivar_get(self, idTagsCache); + VALUE cache = rb_ivar_get(self, idNormalizedTagsCache); VALUE key = rb_hash(tags); cached = rb_hash_aref(cache, key); if (RTEST(cached)) { @@ -146,22 +154,26 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE void Init_statsd() { - VALUE mStatsd, mInstrument, cDatagramBuilder; + VALUE mStatsd, mInstrument, cDatagramBuilder, mCDatagramBuilder; mStatsd = rb_define_module("StatsD"); mInstrument = rb_define_module_under(mStatsd, "Instrument"); cDatagramBuilder = rb_define_class_under(mInstrument, "DatagramBuilder", rb_cObject); + mCDatagramBuilder = rb_define_module_under(mInstrument, "CDatagramBuilder"); idTr = rb_intern("tr"); idNormalizeTags = rb_intern("normalize_tags"); idDefaultTags = rb_intern("@default_tags"); idPrefix = rb_intern("@prefix"); - idTagsCache = rb_intern("@tags_cache"); + idNormalizedTagsCache = rb_intern("@__normalized_tags_cache"); strNormalizeChars = rb_str_new_cstr(":|@"); rb_global_variable(&strNormalizeChars); strNormalizeReplacement = rb_str_new_cstr("_"); rb_global_variable(&strNormalizeReplacement); - rb_define_protected_method(cDatagramBuilder, "normalize_name", normalize_name, 1); - rb_define_protected_method(cDatagramBuilder, "generate_generic_datagram", generate_generic_datagram, 5); + rb_define_method(mCDatagramBuilder, "initialize", initialize, -1); + rb_define_protected_method(mCDatagramBuilder, "normalize_name", normalize_name, 1); + rb_define_protected_method(mCDatagramBuilder, "generate_generic_datagram", generate_generic_datagram, 5); + + rb_prepend_module(cDatagramBuilder, mCDatagramBuilder); } diff --git a/lib/statsd/instrument/datagram_builder.rb b/lib/statsd/instrument/datagram_builder.rb index dbc849fc..c7272718 100644 --- a/lib/statsd/instrument/datagram_builder.rb +++ b/lib/statsd/instrument/datagram_builder.rb @@ -30,7 +30,6 @@ def self.datagram_class def initialize(prefix: nil, default_tags: nil) @prefix = prefix.nil? ? "" : "#{normalize_name(prefix)}." @default_tags = normalize_tags(default_tags) - @tags_cache = {} end def c(name, value, sample_rate, tags) From 2272013b3ec47d563649fe57d84ec966089c9a1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 18:09:49 +0000 Subject: [PATCH 08/21] Let the normalized tags cache be a compile time option --- ext/statsd/statsd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 9e257554..a4b1ac20 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -2,6 +2,7 @@ #include #define DATAGRAM_SIZE_MAX 4096 +#define TAGS_CACHE_ENABLED 1 #define TAGS_CACHE_MAX 1024 static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idNormalizedTagsCache; @@ -41,6 +42,7 @@ normalize_name(VALUE self, VALUE name) { static VALUE normalized_tags_cached(VALUE self, VALUE tags) { +#ifdef TAGS_CACHE_ENABLED VALUE cached; VALUE cache = rb_ivar_get(self, idNormalizedTagsCache); VALUE key = rb_hash(tags); @@ -55,6 +57,9 @@ normalized_tags_cached(VALUE self, VALUE tags) return rb_funcall(self, idNormalizeTags, 1, tags); RB_GC_GUARD(key); RB_GC_GUARD(cached); +#else + return rb_funcall(self, idNormalizeTags, 1, tags); +#endif } static VALUE @@ -69,8 +74,7 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE MEMZERO(&datagram, char, DATAGRAM_SIZE_MAX); prefix = rb_ivar_get(self, idPrefix); - if (RSTRING_LEN(prefix) != 0) { - chunk_len = RSTRING_LEN(prefix); + if ((chunk_len = RSTRING_LEN(prefix)) != 0) { if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram, StringValuePtr(prefix), chunk_len); len += chunk_len; From ab7b420fc6da650fdf0d77df9d79560296e79987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 18:26:38 +0000 Subject: [PATCH 09/21] No need to memset the buffer to 0s for 4096 bytes on every call as we are very careful with offsets, even when preventing overflows --- ext/statsd/statsd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index a4b1ac20..0fccfb19 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -71,8 +71,6 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE int len = 0, tags_len = 0, i = 0; long chunk_len = 0; - MEMZERO(&datagram, char, DATAGRAM_SIZE_MAX); - prefix = rb_ivar_get(self, idPrefix); if ((chunk_len = RSTRING_LEN(prefix)) != 0) { if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; From ce81bbb0a8b53d7d2509b844fad345f2a5240fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Sun, 22 Mar 2020 19:16:28 +0000 Subject: [PATCH 10/21] Also extract a bounded normalized metrics name cache (perhaps normalized cache limits should be configurable at runtime, idk?) --- ext/statsd/statsd.c | 46 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 0fccfb19..d5ab60c4 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -2,17 +2,20 @@ #include #define DATAGRAM_SIZE_MAX 4096 -#define TAGS_CACHE_ENABLED 1 -#define TAGS_CACHE_MAX 1024 +#define NORMALIZED_TAGS_CACHE_ENABLED 1 +#define NORMALIZED_TAGS_CACHE_MAX 512 +#define NORMALIZED_NAMES_CACHE_ENABLED 1 +#define NORMALIZED_NAMES_CACHE_MAX 512 -static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idNormalizedTagsCache; +static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idNormalizedTagsCache, idNormalizedNamesCache; static VALUE strNormalizeChars, strNormalizeReplacement; static VALUE initialize(int argc, VALUE *argv, VALUE self) { rb_call_super(argc, argv); - rb_ivar_set(self, idNormalizedTagsCache, rb_hash_new()); + rb_ivar_set(self, idNormalizedTagsCache, rb_obj_hide(rb_hash_new())); + rb_ivar_set(self, idNormalizedNamesCache, rb_obj_hide(rb_hash_new())); return self; } @@ -38,24 +41,44 @@ normalize_name(VALUE self, VALUE name) { return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); } +/* pure function not exposed to ruby with an intermediate bounded cache */ +static VALUE +normalized_names_cached(VALUE self, VALUE name) +{ +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + VALUE cached; + VALUE cache = rb_ivar_get(self, idNormalizedNamesCache); + cached = rb_hash_aref(cache, name); + if (RTEST(cached)) { + return cached; + } else if (rb_hash_size_num(cache) < NORMALIZED_NAMES_CACHE_MAX) { + cached = normalize_name(self, name); + rb_hash_aset(cache, name, cached); + return cached; + } + return normalize_name(self, name); + RB_GC_GUARD(cached); +#else + return normalize_name(self, name); +#endif +} + /* pure function not exposed to ruby with an intermediate bounded cache */ static VALUE normalized_tags_cached(VALUE self, VALUE tags) { -#ifdef TAGS_CACHE_ENABLED +#ifdef NORMALIZED_TAGS_CACHE_ENABLED VALUE cached; VALUE cache = rb_ivar_get(self, idNormalizedTagsCache); - VALUE key = rb_hash(tags); - cached = rb_hash_aref(cache, key); + cached = rb_hash_aref(cache, tags); if (RTEST(cached)) { return cached; - } else if (rb_hash_size_num(cache) < TAGS_CACHE_MAX) { + } else if (rb_hash_size_num(cache) < NORMALIZED_TAGS_CACHE_MAX) { cached = rb_funcall(self, idNormalizeTags, 1, tags); - rb_hash_aset(cache, key, cached); + rb_hash_aset(cache, tags, cached); return cached; } return rb_funcall(self, idNormalizeTags, 1, tags); - RB_GC_GUARD(key); RB_GC_GUARD(cached); #else return rb_funcall(self, idNormalizeTags, 1, tags); @@ -78,7 +101,7 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE len += chunk_len; } - normalized_name = normalize_name(self, name); + normalized_name = normalized_names_cached(self, name); chunk_len = RSTRING_LEN(normalized_name); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(datagram + len, StringValuePtr(normalized_name), chunk_len); @@ -167,6 +190,7 @@ void Init_statsd() idNormalizeTags = rb_intern("normalize_tags"); idDefaultTags = rb_intern("@default_tags"); idPrefix = rb_intern("@prefix"); + idNormalizedNamesCache = rb_intern("@__normalized_names_cache"); idNormalizedTagsCache = rb_intern("@__normalized_tags_cache"); strNormalizeChars = rb_str_new_cstr(":|@"); rb_global_variable(&strNormalizeChars); From 84e28f1c988b86a4bf91c06139d60f3a658d55fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Mon, 23 Mar 2020 16:35:19 +0000 Subject: [PATCH 11/21] The gist of this change is moving normalized caches to symbol tables instead of hashes. Along with that came several requirements which also illustrates how to implement wrapped structs and plug them clean into the GC. * The ivar lookups for caches were not efficient as the total ivar count for the builder went up to 4, meaning 1 of them would incur 2 symbol table lookups for a cached element as ivar count over 3 overflows to a symbol table * 'struct datagram_builder' now becomes a wrapped struct for tracking state such as optional normalized cache symbol tables, the datagram encode buffer, the initial offset of the first chunk of that buffer and also caches the default tags ivar * 'rb_data_type_t datagram_builder_type' and functions datagram_builder_mark, datagram_builder_free and datagram_builder_size illustrates the callbacks required for proper GC integration of wrapped structs * The primary reason for the wrapped struct is to not have global state as if there's multiple instances ever instantiated from Ruby land, the global state can be clobbered. * With the struct we are able to now pre seed the initial part of the buffer with the given prefix ivar and track the offset into the buffer for #generate_generic_datagram to pick up from so we don't have to do the ivar lookup and memcpy for EVERY datagram built * Normalized caches are now backed by symbol tables. I am happy with the normalized name implementation as rb_str_hash is exposed and computes a numeric hash from the String contents. Unfortunately rb_ary_hash and rb_hash_hash are not exposed and rb_hash defaults to a method call instead. I flagged to revisit. --- ext/statsd/statsd.c | 197 +++++++++++++++++++++++++++++++++----------- 1 file changed, 149 insertions(+), 48 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index d5ab60c4..d0e6e4c8 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -1,5 +1,6 @@ #include #include +#include #define DATAGRAM_SIZE_MAX 4096 #define NORMALIZED_TAGS_CACHE_ENABLED 1 @@ -7,15 +8,117 @@ #define NORMALIZED_NAMES_CACHE_ENABLED 1 #define NORMALIZED_NAMES_CACHE_MAX 512 -static ID idTr, idNormalizeTags, idDefaultTags, idPrefix, idNormalizedTagsCache, idNormalizedNamesCache; +static ID idTr, idNormalizeTags, idDefaultTags, idPrefix; static VALUE strNormalizeChars, strNormalizeReplacement; +struct datagram_builder { +#ifdef NORMALIZED_TAGS_CACHE_ENABLED + st_table *normalized_tags_cache; +#endif +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + st_table *normalized_names_cache; +#endif + // cached default tags ivar to skip a lookup + VALUE default_tags; + long prefix_len; + // last member to not glob up cache lines to access other struct members + char datagram[DATAGRAM_SIZE_MAX]; +}; + +// GC callback to mark the wrapper struct. Conditionally symbol tables if caching is enabled (values only) +// and the cached default tags as well. +void +datagram_builder_mark(void *ptr) +{ + const struct datagram_builder *builder = (struct datagram_builder *)ptr; +#ifdef NORMALIZED_TAGS_CACHE_ENABLED + rb_mark_tbl(builder->normalized_tags_cache); +#endif +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + rb_mark_tbl(builder->normalized_names_cache); +#endif + rb_gc_mark(builder->default_tags); +} + +// GC callback to free the wrapper struct. Conditionally symbol tables if caching is enabled +// and the struct itself. +void +datagram_builder_free(void *ptr) +{ + struct datagram_builder *builder = (struct datagram_builder *)ptr; + if (!builder) return; +#ifdef NORMALIZED_TAGS_CACHE_ENABLED + st_free_table(builder->normalized_tags_cache); +#endif +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + st_free_table(builder->normalized_names_cache); +#endif + xfree(builder); +} + +// Be nice to ObjectSpace and let the size be known. We'd likely want some feedback on +// this with various normalized cache size values. +size_t +datagram_builder_size(const void *ptr) +{ + size_t size; + const struct datagram_builder *builder = (struct datagram_builder *)ptr; + size = sizeof(struct datagram_builder); +#ifdef NORMALIZED_TAGS_CACHE_ENABLED + size += st_memsize(builder->normalized_tags_cache); +#endif +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + size += st_memsize(builder->normalized_names_cache); +#endif + return size; +} + +const rb_data_type_t datagram_builder_type = { + .wrap_struct_name = "datagram_builder", + .function = { + .dmark = datagram_builder_mark, + .dfree = datagram_builder_free, + .dsize = datagram_builder_size, + }, + .data = NULL, + .flags = RUBY_TYPED_FREE_IMMEDIATELY, +}; + +#define get_datagram_builder_struct(self) \ + struct datagram_builder *builder = NULL; \ + TypedData_Get_Struct(self, struct datagram_builder, &datagram_builder_type, builder); \ + +static VALUE +datagram_builder_alloc(VALUE self) +{ + struct datagram_builder *builder = ZALLOC(struct datagram_builder); +#ifdef NORMALIZED_TAGS_CACHE_ENABLED + builder->normalized_tags_cache = st_init_numtable_with_size(NORMALIZED_TAGS_CACHE_MAX); +#endif +#ifdef NORMALIZED_NAMES_CACHE_ENABLED + builder->normalized_names_cache = st_init_numtable_with_size(NORMALIZED_NAMES_CACHE_MAX); +#endif + return TypedData_Wrap_Struct(self, &datagram_builder_type, builder); +} + static VALUE initialize(int argc, VALUE *argv, VALUE self) { + VALUE prefix; + long chunk_len = 0; + get_datagram_builder_struct(self); rb_call_super(argc, argv); - rb_ivar_set(self, idNormalizedTagsCache, rb_obj_hide(rb_hash_new())); - rb_ivar_set(self, idNormalizedNamesCache, rb_obj_hide(rb_hash_new())); + + // pre seed the buffer with the prefix and advance the offset as it's fixed for the lifetime of + // the builder + prefix = rb_ivar_get(self, idPrefix); + if ((chunk_len = RSTRING_LEN(prefix)) != 0 && chunk_len < DATAGRAM_SIZE_MAX) { + memcpy(builder->datagram, StringValuePtr(prefix), chunk_len); + builder->prefix_len = chunk_len; + } + + // Cache the defaukt tags ivar on the lookup struct + builder->default_tags = rb_ivar_get(self, idDefaultTags); return self; } @@ -43,17 +146,20 @@ normalize_name(VALUE self, VALUE name) { /* pure function not exposed to ruby with an intermediate bounded cache */ static VALUE -normalized_names_cached(VALUE self, VALUE name) +normalized_names_cached(struct datagram_builder *builder, VALUE self, VALUE name) { #ifdef NORMALIZED_NAMES_CACHE_ENABLED + st_index_t key; + st_data_t val; VALUE cached; - VALUE cache = rb_ivar_get(self, idNormalizedNamesCache); - cached = rb_hash_aref(cache, name); - if (RTEST(cached)) { - return cached; - } else if (rb_hash_size_num(cache) < NORMALIZED_NAMES_CACHE_MAX) { + Check_Type(name, T_STRING); + // Can hash on string contents directly as type has already been checked + key = rb_str_hash(name); + if (st_lookup(builder->normalized_names_cache, key, &val)){ + return (VALUE)val; + } else if (builder->normalized_names_cache->num_entries < NORMALIZED_NAMES_CACHE_MAX) { cached = normalize_name(self, name); - rb_hash_aset(cache, name, cached); + st_insert(builder->normalized_names_cache, key, (st_data_t)cached); return cached; } return normalize_name(self, name); @@ -65,17 +171,20 @@ normalized_names_cached(VALUE self, VALUE name) /* pure function not exposed to ruby with an intermediate bounded cache */ static VALUE -normalized_tags_cached(VALUE self, VALUE tags) +normalized_tags_cached(struct datagram_builder *builder, VALUE self, VALUE tags) { #ifdef NORMALIZED_TAGS_CACHE_ENABLED + st_index_t key; + st_data_t val; VALUE cached; - VALUE cache = rb_ivar_get(self, idNormalizedTagsCache); - cached = rb_hash_aref(cache, tags); - if (RTEST(cached)) { - return cached; - } else if (rb_hash_size_num(cache) < NORMALIZED_TAGS_CACHE_MAX) { + // More involved hashing as we need to hash on the content of the container too + // XXX: revisit + key = (st_index_t)(FIX2LONG(rb_hash(tags))); + if (st_lookup(builder->normalized_tags_cache, key, &val)){ + return (VALUE)val; + } else if (builder->normalized_tags_cache->num_entries < NORMALIZED_TAGS_CACHE_MAX) { cached = rb_funcall(self, idNormalizeTags, 1, tags); - rb_hash_aset(cache, tags, cached); + st_insert(builder->normalized_tags_cache, key, (st_data_t)cached); return cached; } return rb_funcall(self, idNormalizeTags, 1, tags); @@ -87,74 +196,65 @@ normalized_tags_cached(VALUE self, VALUE tags) static VALUE generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { - VALUE prefix, normalized_name, str_value, str_sample_rate, default_tags, tag; + VALUE normalized_name, str_value, str_sample_rate, tag; VALUE normalized_tags = Qnil; - char datagram[DATAGRAM_SIZE_MAX]; int empty_default_tags = 1, empty_tags = 1; int len = 0, tags_len = 0, i = 0; long chunk_len = 0; + get_datagram_builder_struct(self); - prefix = rb_ivar_get(self, idPrefix); - if ((chunk_len = RSTRING_LEN(prefix)) != 0) { - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram, StringValuePtr(prefix), chunk_len); - len += chunk_len; - } + len = builder->prefix_len; - normalized_name = normalized_names_cached(self, name); + normalized_name = normalized_names_cached(builder, self, name); chunk_len = RSTRING_LEN(normalized_name); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, StringValuePtr(normalized_name), chunk_len); + memcpy(builder->datagram + len, StringValuePtr(normalized_name), chunk_len); len += chunk_len; if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, ":", 1); + memcpy(builder->datagram + len, ":", 1); len += 1; str_value = rb_obj_as_string(value); chunk_len = RSTRING_LEN(str_value); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, StringValuePtr(str_value), chunk_len); + memcpy(builder->datagram + len, StringValuePtr(str_value), chunk_len); len += chunk_len; if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, "|", 1); + memcpy(builder->datagram + len, "|", 1); len += 1; chunk_len = RSTRING_LEN(type); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, StringValuePtr(type), chunk_len); + memcpy(builder->datagram + len, StringValuePtr(type), chunk_len); len += chunk_len; if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, "|@", 2); + memcpy(builder->datagram + len, "|@", 2); len += 2; str_sample_rate = rb_obj_as_string(sample_rate); chunk_len = RSTRING_LEN(str_sample_rate); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, StringValuePtr(str_sample_rate), chunk_len); + memcpy(builder->datagram + len, StringValuePtr(str_sample_rate), chunk_len); len += chunk_len; } - default_tags = rb_ivar_get(self, idDefaultTags); - - empty_default_tags = (RTEST(default_tags) ? RARRAY_LEN(default_tags) == 0 : 0); - if (RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) { - empty_tags = 0; - } else if (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0) { + empty_default_tags = (RTEST(builder->default_tags) ? RARRAY_LEN(builder->default_tags) == 0 : 0); + if ((RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) || (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0)) { empty_tags = 0; } if (empty_default_tags && !empty_tags) { - normalized_tags = normalized_tags_cached(self, tags); + normalized_tags = normalized_tags_cached(builder, self, tags); } else if (!empty_default_tags && !empty_tags) { - normalized_tags = rb_ary_concat(normalized_tags_cached(self, tags), default_tags); + normalized_tags = rb_ary_concat(normalized_tags_cached(builder, self, tags), builder->default_tags); } else if (!empty_default_tags && empty_tags) { - normalized_tags = default_tags; + normalized_tags = builder->default_tags; } if (RTEST(normalized_tags)) { if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, "|#", 2); + memcpy(builder->datagram + len, "|#", 2); len += 2; tags_len = RARRAY_LEN(normalized_tags); @@ -162,18 +262,18 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE tag = RARRAY_AREF(normalized_tags, i); chunk_len = RSTRING_LEN(tag); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, StringValuePtr(tag), chunk_len); + memcpy(builder->datagram + len, StringValuePtr(tag), chunk_len); len += chunk_len; if (i < tags_len - 1) { if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(datagram + len, ",", 1); + memcpy(builder->datagram + len, ",", 1); len += 1; } } } finalize_datagram: - return rb_str_new(datagram, len); + return rb_str_new(builder->datagram, len); RB_GC_GUARD(normalized_tags); } @@ -184,14 +284,15 @@ void Init_statsd() mStatsd = rb_define_module("StatsD"); mInstrument = rb_define_module_under(mStatsd, "Instrument"); cDatagramBuilder = rb_define_class_under(mInstrument, "DatagramBuilder", rb_cObject); + + rb_define_alloc_func(cDatagramBuilder, datagram_builder_alloc); + mCDatagramBuilder = rb_define_module_under(mInstrument, "CDatagramBuilder"); idTr = rb_intern("tr"); idNormalizeTags = rb_intern("normalize_tags"); idDefaultTags = rb_intern("@default_tags"); idPrefix = rb_intern("@prefix"); - idNormalizedNamesCache = rb_intern("@__normalized_names_cache"); - idNormalizedTagsCache = rb_intern("@__normalized_tags_cache"); strNormalizeChars = rb_str_new_cstr(":|@"); rb_global_variable(&strNormalizeChars); strNormalizeReplacement = rb_str_new_cstr("_"); From b7a19e872ad16825fe321c0649c8f1551a5255e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Tue, 24 Mar 2020 15:11:05 +0000 Subject: [PATCH 12/21] Extract normalize_name_fast_path which is guaranteed to not do a function call, still has O(n) complexity but cheaper than str hash + symbol table lookup --- ext/statsd/statsd.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index d0e6e4c8..75671f2a 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -122,8 +122,9 @@ initialize(int argc, VALUE *argv, VALUE self) return self; } -static VALUE -normalize_name(VALUE self, VALUE name) { +inline static VALUE +normalize_name_fast_path(VALUE self, VALUE name) +{ char *name_start = NULL; char *name_end = NULL; Check_Type(name, T_STRING); @@ -141,6 +142,13 @@ normalize_name(VALUE self, VALUE name) { if (name_start == name_end) { return name; } + return Qnil; +} + +static VALUE +normalize_name(VALUE self, VALUE name) { + VALUE _name = normalize_name_fast_path(self, name); + if (!NIL_P(_name)) return _name; return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); } @@ -205,7 +213,10 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE len = builder->prefix_len; - normalized_name = normalized_names_cached(builder, self, name); + if (NIL_P(normalized_name = normalize_name_fast_path(self, name))) { + normalized_name = normalized_names_cached(builder, self, name); + } + chunk_len = RSTRING_LEN(normalized_name); if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(builder->datagram + len, StringValuePtr(normalized_name), chunk_len); From 430e72baab8a931760908563d98e8ad82be2ffc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Thu, 26 Mar 2020 13:04:24 +0000 Subject: [PATCH 13/21] Fix warnings about losing integer precision --- ext/statsd/statsd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 75671f2a..f15b51fe 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -211,7 +211,7 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE long chunk_len = 0; get_datagram_builder_struct(self); - len = builder->prefix_len; + len = (int)builder->prefix_len; if (NIL_P(normalized_name = normalize_name_fast_path(self, name))) { normalized_name = normalized_names_cached(builder, self, name); @@ -268,7 +268,7 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE memcpy(builder->datagram + len, "|#", 2); len += 2; - tags_len = RARRAY_LEN(normalized_tags); + tags_len = (int)RARRAY_LEN(normalized_tags); for (i = 0; i < tags_len; ++i) { tag = RARRAY_AREF(normalized_tags, i); chunk_len = RSTRING_LEN(tag); From be655a5b24fe30273bc3d8df696e1cea685eb307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Thu, 26 Mar 2020 14:56:20 +0000 Subject: [PATCH 14/21] Introduce a fast path for sample rate Float and Fixnum types that prevents a Ruby String allocation per datagram --- ext/statsd/statsd.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index f15b51fe..31fc00a8 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -3,6 +3,7 @@ #include #define DATAGRAM_SIZE_MAX 4096 +#define SAMPLE_RATE_SIZE_MAX 16 #define NORMALIZED_TAGS_CACHE_ENABLED 1 #define NORMALIZED_TAGS_CACHE_MAX 512 #define NORMALIZED_NAMES_CACHE_ENABLED 1 @@ -206,6 +207,7 @@ static VALUE generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { VALUE normalized_name, str_value, str_sample_rate, tag; VALUE normalized_tags = Qnil; + char sr_buf[SAMPLE_RATE_SIZE_MAX]; int empty_default_tags = 1, empty_tags = 1; int len = 0, tags_len = 0, i = 0; long chunk_len = 0; @@ -243,18 +245,29 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(builder->datagram + len, "|@", 2); len += 2; - str_sample_rate = rb_obj_as_string(sample_rate); - chunk_len = RSTRING_LEN(str_sample_rate); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(str_sample_rate), chunk_len); - len += chunk_len; + if (RB_FIXNUM_P(sample_rate)) { + chunk_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%d", FIX2INT(sample_rate)); + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + len, sr_buf, chunk_len); + len += chunk_len; + } else if (RB_FLOAT_TYPE_P(sample_rate)) { + chunk_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%g", RFLOAT_VALUE(sample_rate)); + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + len, sr_buf, chunk_len); + len += chunk_len; + } else { + str_sample_rate = rb_obj_as_string(sample_rate); + chunk_len = RSTRING_LEN(str_sample_rate); + if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + len, StringValuePtr(str_sample_rate), chunk_len); + len += chunk_len; + } } empty_default_tags = (RTEST(builder->default_tags) ? RARRAY_LEN(builder->default_tags) == 0 : 0); if ((RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) || (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0)) { empty_tags = 0; } - if (empty_default_tags && !empty_tags) { normalized_tags = normalized_tags_cached(builder, self, tags); } else if (!empty_default_tags && !empty_tags) { From 2200dec43bbe902807edaeff93e94767be2e2c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Thu, 26 Mar 2020 18:52:11 +0000 Subject: [PATCH 15/21] Avoid Array alloc + concat for when both default and given tags are specified --- ext/statsd/statsd.c | 118 +++++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 56 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 31fc00a8..4a1c9db1 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -21,7 +21,8 @@ struct datagram_builder { #endif // cached default tags ivar to skip a lookup VALUE default_tags; - long prefix_len; + int prefix_len; + int len; // last member to not glob up cache lines to access other struct members char datagram[DATAGRAM_SIZE_MAX]; }; @@ -115,7 +116,7 @@ initialize(int argc, VALUE *argv, VALUE self) prefix = rb_ivar_get(self, idPrefix); if ((chunk_len = RSTRING_LEN(prefix)) != 0 && chunk_len < DATAGRAM_SIZE_MAX) { memcpy(builder->datagram, StringValuePtr(prefix), chunk_len); - builder->prefix_len = chunk_len; + builder->prefix_len = (int)chunk_len; } // Cache the defaukt tags ivar on the lookup struct @@ -203,64 +204,83 @@ normalized_tags_cached(struct datagram_builder *builder, VALUE self, VALUE tags) #endif } +inline static int append_normalized_tags(struct datagram_builder *builder, VALUE normalized_tags, int trim_trailing_comma) +{ + VALUE tag; + int tags_len = 0, chunk_len = 0, i = 0; + tags_len = (int)RARRAY_LEN(normalized_tags); + for (i = 0; i < tags_len; ++i) { + tag = RARRAY_AREF(normalized_tags, i); + chunk_len = (int)RSTRING_LEN(tag); + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) return 0; + memcpy(builder->datagram + builder->len, StringValuePtr(tag), chunk_len); + builder->len += chunk_len; + if (!trim_trailing_comma || i < tags_len - 1) { + if (builder->len + 1 > DATAGRAM_SIZE_MAX) return 0; + memcpy(builder->datagram + builder->len, ",", 1); + builder->len += 1; + } + } + return 1; +} + static VALUE generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { - VALUE normalized_name, str_value, str_sample_rate, tag; + VALUE normalized_name, str_value, str_sample_rate; VALUE normalized_tags = Qnil; char sr_buf[SAMPLE_RATE_SIZE_MAX]; int empty_default_tags = 1, empty_tags = 1; - int len = 0, tags_len = 0, i = 0; long chunk_len = 0; get_datagram_builder_struct(self); - len = (int)builder->prefix_len; + builder->len = builder->prefix_len; if (NIL_P(normalized_name = normalize_name_fast_path(self, name))) { normalized_name = normalized_names_cached(builder, self, name); } chunk_len = RSTRING_LEN(normalized_name); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(normalized_name), chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, StringValuePtr(normalized_name), chunk_len); + builder->len += chunk_len; - if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, ":", 1); - len += 1; + if (builder->len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, ":", 1); + builder->len += 1; str_value = rb_obj_as_string(value); chunk_len = RSTRING_LEN(str_value); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(str_value), chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, StringValuePtr(str_value), chunk_len); + builder->len += chunk_len; - if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, "|", 1); - len += 1; + if (builder->len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, "|", 1); + builder->len += 1; chunk_len = RSTRING_LEN(type); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(type), chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, StringValuePtr(type), chunk_len); + builder->len += chunk_len; if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { - if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, "|@", 2); - len += 2; + if (builder->len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, "|@", 2); + builder->len += 2; if (RB_FIXNUM_P(sample_rate)) { chunk_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%d", FIX2INT(sample_rate)); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, sr_buf, chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, sr_buf, chunk_len); + builder->len += chunk_len; } else if (RB_FLOAT_TYPE_P(sample_rate)) { chunk_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%g", RFLOAT_VALUE(sample_rate)); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, sr_buf, chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, sr_buf, chunk_len); + builder->len += chunk_len; } else { str_sample_rate = rb_obj_as_string(sample_rate); chunk_len = RSTRING_LEN(str_sample_rate); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(str_sample_rate), chunk_len); - len += chunk_len; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, StringValuePtr(str_sample_rate), chunk_len); + builder->len += chunk_len; } } @@ -268,36 +288,22 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE if ((RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) || (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0)) { empty_tags = 0; } + if (!(empty_default_tags && empty_tags)) { + if (builder->len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; + memcpy(builder->datagram + builder->len, "|#", 2); + builder->len += 2; + } if (empty_default_tags && !empty_tags) { - normalized_tags = normalized_tags_cached(builder, self, tags); + if (!append_normalized_tags(builder, normalized_tags_cached(builder, self, tags), 1)) goto finalize_datagram; } else if (!empty_default_tags && !empty_tags) { - normalized_tags = rb_ary_concat(normalized_tags_cached(builder, self, tags), builder->default_tags); + if (!append_normalized_tags(builder, normalized_tags_cached(builder, self, tags), 0)) goto finalize_datagram; + if (!append_normalized_tags(builder, builder->default_tags, 1)) goto finalize_datagram; } else if (!empty_default_tags && empty_tags) { - normalized_tags = builder->default_tags; - } - - if (RTEST(normalized_tags)) { - if (len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, "|#", 2); - len += 2; - - tags_len = (int)RARRAY_LEN(normalized_tags); - for (i = 0; i < tags_len; ++i) { - tag = RARRAY_AREF(normalized_tags, i); - chunk_len = RSTRING_LEN(tag); - if (len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, StringValuePtr(tag), chunk_len); - len += chunk_len; - if (i < tags_len - 1) { - if (len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + len, ",", 1); - len += 1; - } - } + if (!append_normalized_tags(builder, builder->default_tags, 1)) goto finalize_datagram; } finalize_datagram: - return rb_str_new(builder->datagram, len); + return rb_str_new(builder->datagram, builder->len); RB_GC_GUARD(normalized_tags); } From ef094cd9b0d620c6440daefecd8ace92ecbd7383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Thu, 26 Mar 2020 19:24:00 +0000 Subject: [PATCH 16/21] Hide #generate_generic_datagram and #normalize_name from Ruby and direct dispatch to them from the respective metric helper functions --- ext/statsd/statsd.c | 50 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 4a1c9db1..e2ec3289 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -225,7 +225,7 @@ inline static int append_normalized_tags(struct datagram_builder *builder, VALUE } static VALUE -generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE sample_rate, VALUE tags) { +generate_generic_datagram(VALUE self, VALUE name, VALUE value, const char *type, VALUE sample_rate, VALUE tags) { VALUE normalized_name, str_value, str_sample_rate; VALUE normalized_tags = Qnil; char sr_buf[SAMPLE_RATE_SIZE_MAX]; @@ -256,9 +256,9 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE if (builder->len + 1 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(builder->datagram + builder->len, "|", 1); builder->len += 1; - chunk_len = RSTRING_LEN(type); + chunk_len = strlen(type); if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; - memcpy(builder->datagram + builder->len, StringValuePtr(type), chunk_len); + memcpy(builder->datagram + builder->len, type, chunk_len); builder->len += chunk_len; if (RTEST(sample_rate) && NUM2INT(sample_rate) < 1) { @@ -307,6 +307,41 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, VALUE type, VALUE RB_GC_GUARD(normalized_tags); } +static VALUE metric_c(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "c", sample_rate, tags); +} + +static VALUE metric_g(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "g", sample_rate, tags); +} + +static VALUE metric_ms(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "ms", sample_rate, tags); +} + +static VALUE metric_s(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "s", sample_rate, tags); +} + +static VALUE metric_h(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "h", sample_rate, tags); +} + +static VALUE metric_d(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "d", sample_rate, tags); +} + +static VALUE metric_kv(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags) +{ + return generate_generic_datagram(self, name, value, "ms", sample_rate, tags); +} + void Init_statsd() { VALUE mStatsd, mInstrument, cDatagramBuilder, mCDatagramBuilder; @@ -329,8 +364,13 @@ void Init_statsd() rb_global_variable(&strNormalizeReplacement); rb_define_method(mCDatagramBuilder, "initialize", initialize, -1); - rb_define_protected_method(mCDatagramBuilder, "normalize_name", normalize_name, 1); - rb_define_protected_method(mCDatagramBuilder, "generate_generic_datagram", generate_generic_datagram, 5); + rb_define_method(mCDatagramBuilder, "c", metric_c, 4); + rb_define_method(mCDatagramBuilder, "g", metric_g, 4); + rb_define_method(mCDatagramBuilder, "ms", metric_ms, 4); + rb_define_method(mCDatagramBuilder, "s", metric_s, 4); + rb_define_method(mCDatagramBuilder, "h", metric_h, 4); + rb_define_method(mCDatagramBuilder, "d", metric_d, 4); + rb_define_method(mCDatagramBuilder, "kv", metric_kv, 4); rb_prepend_module(cDatagramBuilder, mCDatagramBuilder); } From 16647893ad56ae3e6fad5053b42ea438367d4644 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Mon, 30 Mar 2020 19:26:50 +0100 Subject: [PATCH 17/21] Prefer a strhash table for the normalized names cache --- ext/statsd/statsd.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index e2ec3289..12cbd613 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -98,7 +98,7 @@ datagram_builder_alloc(VALUE self) builder->normalized_tags_cache = st_init_numtable_with_size(NORMALIZED_TAGS_CACHE_MAX); #endif #ifdef NORMALIZED_NAMES_CACHE_ENABLED - builder->normalized_names_cache = st_init_numtable_with_size(NORMALIZED_NAMES_CACHE_MAX); + builder->normalized_names_cache = st_init_strtable_with_size(NORMALIZED_NAMES_CACHE_MAX); #endif return TypedData_Wrap_Struct(self, &datagram_builder_type, builder); } @@ -162,9 +162,7 @@ normalized_names_cached(struct datagram_builder *builder, VALUE self, VALUE name st_index_t key; st_data_t val; VALUE cached; - Check_Type(name, T_STRING); - // Can hash on string contents directly as type has already been checked - key = rb_str_hash(name); + key = (st_index_t)RSTRING_PTR(name); if (st_lookup(builder->normalized_names_cache, key, &val)){ return (VALUE)val; } else if (builder->normalized_names_cache->num_entries < NORMALIZED_NAMES_CACHE_MAX) { @@ -238,7 +236,6 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, const char *type, if (NIL_P(normalized_name = normalize_name_fast_path(self, name))) { normalized_name = normalized_names_cached(builder, self, name); } - chunk_len = RSTRING_LEN(normalized_name); if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(builder->datagram + builder->len, StringValuePtr(normalized_name), chunk_len); From d60dbe1b0fee5f9443e2d6a0c30af8e637c88029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Mon, 30 Mar 2020 19:41:24 +0100 Subject: [PATCH 18/21] Move the global string allocations to builder struct members instead --- ext/statsd/statsd.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 12cbd613..43ea8def 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -10,7 +10,6 @@ #define NORMALIZED_NAMES_CACHE_MAX 512 static ID idTr, idNormalizeTags, idDefaultTags, idPrefix; -static VALUE strNormalizeChars, strNormalizeReplacement; struct datagram_builder { #ifdef NORMALIZED_TAGS_CACHE_ENABLED @@ -19,6 +18,8 @@ struct datagram_builder { #ifdef NORMALIZED_NAMES_CACHE_ENABLED st_table *normalized_names_cache; #endif + VALUE str_normalize_chars; + VALUE str_normalize_replacement; // cached default tags ivar to skip a lookup VALUE default_tags; int prefix_len; @@ -39,6 +40,8 @@ datagram_builder_mark(void *ptr) #ifdef NORMALIZED_NAMES_CACHE_ENABLED rb_mark_tbl(builder->normalized_names_cache); #endif + rb_gc_mark(builder->str_normalize_chars); + rb_gc_mark(builder->str_normalize_replacement); rb_gc_mark(builder->default_tags); } @@ -100,6 +103,8 @@ datagram_builder_alloc(VALUE self) #ifdef NORMALIZED_NAMES_CACHE_ENABLED builder->normalized_names_cache = st_init_strtable_with_size(NORMALIZED_NAMES_CACHE_MAX); #endif + builder->str_normalize_chars = rb_str_new_cstr(":|@"); + builder->str_normalize_replacement = rb_str_new_cstr("_"); return TypedData_Wrap_Struct(self, &datagram_builder_type, builder); } @@ -148,10 +153,10 @@ normalize_name_fast_path(VALUE self, VALUE name) } static VALUE -normalize_name(VALUE self, VALUE name) { +normalize_name(struct datagram_builder *builder, VALUE self, VALUE name) { VALUE _name = normalize_name_fast_path(self, name); if (!NIL_P(_name)) return _name; - return rb_funcall(name, idTr, 2, strNormalizeChars, strNormalizeReplacement); + return rb_funcall(name, idTr, 2, builder->str_normalize_chars, builder->str_normalize_replacement); } /* pure function not exposed to ruby with an intermediate bounded cache */ @@ -166,11 +171,11 @@ normalized_names_cached(struct datagram_builder *builder, VALUE self, VALUE name if (st_lookup(builder->normalized_names_cache, key, &val)){ return (VALUE)val; } else if (builder->normalized_names_cache->num_entries < NORMALIZED_NAMES_CACHE_MAX) { - cached = normalize_name(self, name); + cached = normalize_name(builder, self, name); st_insert(builder->normalized_names_cache, key, (st_data_t)cached); return cached; } - return normalize_name(self, name); + return normalize_name(builder, self, name); RB_GC_GUARD(cached); #else return normalize_name(self, name); @@ -355,10 +360,6 @@ void Init_statsd() idNormalizeTags = rb_intern("normalize_tags"); idDefaultTags = rb_intern("@default_tags"); idPrefix = rb_intern("@prefix"); - strNormalizeChars = rb_str_new_cstr(":|@"); - rb_global_variable(&strNormalizeChars); - strNormalizeReplacement = rb_str_new_cstr("_"); - rb_global_variable(&strNormalizeReplacement); rb_define_method(mCDatagramBuilder, "initialize", initialize, -1); rb_define_method(mCDatagramBuilder, "c", metric_c, 4); From a40d369fa95172942e2021a78e1355e57b98231c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Mon, 30 Mar 2020 20:01:41 +0100 Subject: [PATCH 19/21] Perform the empty default tags predicate check on init --- ext/statsd/statsd.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 43ea8def..0d70d30d 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -22,6 +23,7 @@ struct datagram_builder { VALUE str_normalize_replacement; // cached default tags ivar to skip a lookup VALUE default_tags; + bool empty_default_tags; int prefix_len; int len; // last member to not glob up cache lines to access other struct members @@ -126,6 +128,7 @@ initialize(int argc, VALUE *argv, VALUE self) // Cache the defaukt tags ivar on the lookup struct builder->default_tags = rb_ivar_get(self, idDefaultTags); + builder->empty_default_tags = (RTEST(builder->default_tags) ? RARRAY_LEN(builder->default_tags) == 0 : false); return self; } @@ -232,7 +235,7 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, const char *type, VALUE normalized_name, str_value, str_sample_rate; VALUE normalized_tags = Qnil; char sr_buf[SAMPLE_RATE_SIZE_MAX]; - int empty_default_tags = 1, empty_tags = 1; + bool empty_tags = true; long chunk_len = 0; get_datagram_builder_struct(self); @@ -286,21 +289,20 @@ generate_generic_datagram(VALUE self, VALUE name, VALUE value, const char *type, } } - empty_default_tags = (RTEST(builder->default_tags) ? RARRAY_LEN(builder->default_tags) == 0 : 0); if ((RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) || (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0)) { - empty_tags = 0; + empty_tags = false; } - if (!(empty_default_tags && empty_tags)) { + if (!(builder->empty_default_tags && empty_tags)) { if (builder->len + 2 > DATAGRAM_SIZE_MAX) goto finalize_datagram; memcpy(builder->datagram + builder->len, "|#", 2); builder->len += 2; } - if (empty_default_tags && !empty_tags) { + if (builder->empty_default_tags && !empty_tags) { if (!append_normalized_tags(builder, normalized_tags_cached(builder, self, tags), 1)) goto finalize_datagram; - } else if (!empty_default_tags && !empty_tags) { + } else if (!builder->empty_default_tags && !empty_tags) { if (!append_normalized_tags(builder, normalized_tags_cached(builder, self, tags), 0)) goto finalize_datagram; if (!append_normalized_tags(builder, builder->default_tags, 1)) goto finalize_datagram; - } else if (!empty_default_tags && empty_tags) { + } else if (!builder->empty_default_tags && empty_tags) { if (!append_normalized_tags(builder, builder->default_tags, 1)) goto finalize_datagram; } From 16d6f145b417babae980f198465fa840dbb63a11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Mon, 30 Mar 2020 20:15:16 +0100 Subject: [PATCH 20/21] Let append_normalized_tags prefer a bool return too --- ext/statsd/statsd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index 0d70d30d..deb1cc46 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -210,7 +210,7 @@ normalized_tags_cached(struct datagram_builder *builder, VALUE self, VALUE tags) #endif } -inline static int append_normalized_tags(struct datagram_builder *builder, VALUE normalized_tags, int trim_trailing_comma) +inline static bool append_normalized_tags(struct datagram_builder *builder, VALUE normalized_tags, int trim_trailing_comma) { VALUE tag; int tags_len = 0, chunk_len = 0, i = 0; @@ -218,16 +218,16 @@ inline static int append_normalized_tags(struct datagram_builder *builder, VALUE for (i = 0; i < tags_len; ++i) { tag = RARRAY_AREF(normalized_tags, i); chunk_len = (int)RSTRING_LEN(tag); - if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) return 0; + if (builder->len + chunk_len > DATAGRAM_SIZE_MAX) return false; memcpy(builder->datagram + builder->len, StringValuePtr(tag), chunk_len); builder->len += chunk_len; if (!trim_trailing_comma || i < tags_len - 1) { - if (builder->len + 1 > DATAGRAM_SIZE_MAX) return 0; + if (builder->len + 1 > DATAGRAM_SIZE_MAX) return false; memcpy(builder->datagram + builder->len, ",", 1); builder->len += 1; } } - return 1; + return true; } static VALUE From 666ff836dd4f05fdd0c92b8ab267e10f3fd91e0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lourens=20Naud=C3=A9?= Date: Wed, 1 Apr 2020 11:58:28 +0100 Subject: [PATCH 21/21] Explicitly set freed struct members and the builder to NULL in datagram_builder_free --- ext/statsd/statsd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c index deb1cc46..32343eb6 100644 --- a/ext/statsd/statsd.c +++ b/ext/statsd/statsd.c @@ -56,11 +56,14 @@ datagram_builder_free(void *ptr) if (!builder) return; #ifdef NORMALIZED_TAGS_CACHE_ENABLED st_free_table(builder->normalized_tags_cache); + builder->normalized_tags_cache = NULL; #endif #ifdef NORMALIZED_NAMES_CACHE_ENABLED st_free_table(builder->normalized_names_cache); + builder->normalized_names_cache = NULL; #endif xfree(builder); + builder = NULL; } // Be nice to ObjectSpace and let the size be known. We'd likely want some feedback on