diff --git a/.gitignore b/.gitignore
index 804eafeb..36ce0d79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@ doc
 *.gem
 .bundle
 Gemfile.lock
+lib/statsd/instrument/ext/*
 pkg/*
 vendor/
 tmp/*
diff --git a/Rakefile b/Rakefile
index 068e3e2e..f016ddfb 100644
--- a/Rakefile
+++ b/Rakefile
@@ -3,10 +3,21 @@
 require 'bundler/gem_tasks'
 require 'rake/testtask'
 
+GEMSPEC = Gem::Specification.load('statsd-instrument.gemspec')
+
+require 'rake/extensiontask'
+Rake::ExtensionTask.new('statsd', GEMSPEC) do |ext|
+  ext.ext_dir = 'ext/statsd'
+  ext.lib_dir = 'lib/statsd/instrument/ext'
+end
+task build: :compile
+
 Rake::TestTask.new('test') do |t|
   t.ruby_opts << '-r rubygems'
   t.libs << 'lib' << 'test'
   t.test_files = FileList['test/**/*_test.rb']
 end
 
+task test: :build
+
 task default: :test
diff --git a/ext/statsd/extconf.rb b/ext/statsd/extconf.rb
new file mode 100644
index 00000000..5ab47bf2
--- /dev/null
+++ b/ext/statsd/extconf.rb
@@ -0,0 +1,12 @@
+require 'mkmf'
+
+append_cflags '-pedantic'
+append_cflags '-Wall'
+if ENV['STATSD_EXT_DEBUG']
+  append_cflags '-Og'
+  append_cflags '-ggdb3'
+else
+  append_cflags '-O3'
+end
+
+create_makefile('statsd/statsd')
diff --git a/ext/statsd/statsd.c b/ext/statsd/statsd.c
new file mode 100644
index 00000000..32343eb6
--- /dev/null
+++ b/ext/statsd/statsd.c
@@ -0,0 +1,360 @@
+#include <ruby.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+#define DATAGRAM_SIZE_MAX 4096
+#define SAMPLE_RATE_SIZE_MAX 16
+#define NORMALIZED_TAGS_CACHE_ENABLED 1
+#define NORMALIZED_TAGS_CACHE_MAX 512
+#define NORMALIZED_NAMES_CACHE_ENABLED 1
+#define NORMALIZED_NAMES_CACHE_MAX 512
+
+static ID idTr, idNormalizeTags, idDefaultTags, idPrefix;
+
+struct datagram_builder {
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  // rb_hash(tags) => normalized tags; only the values are marked for the GC
+  st_table *normalized_tags_cache;
+#endif
+#ifdef NORMALIZED_NAMES_CACHE_ENABLED
+  // Ruby Hash of name => normalized name. The Hash keeps its String keys alive, so no
+  // pointer into a caller owned string buffer is ever retained.
+  VALUE normalized_names_cache;
+#endif
+  VALUE str_normalize_chars;
+  VALUE str_normalize_replacement;
+  // cached default tags ivar to skip a lookup
+  VALUE default_tags;
+  bool empty_default_tags;
+  int prefix_len;
+  int len;
+  // last member to not glob up cache lines to access other struct members
+  char datagram[DATAGRAM_SIZE_MAX];
+};
+
+// GC callback to mark the Ruby objects referenced by the wrapper struct: the tags cache
+// values and the names cache (when caching is enabled), the normalization strings and the
+// cached default tags.
+void
+datagram_builder_mark(void *ptr)
+{
+  const struct datagram_builder *builder = (struct datagram_builder *)ptr;
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  rb_mark_tbl(builder->normalized_tags_cache);
+#endif
+#ifdef NORMALIZED_NAMES_CACHE_ENABLED
+  rb_gc_mark(builder->normalized_names_cache);
+#endif
+  rb_gc_mark(builder->str_normalize_chars);
+  rb_gc_mark(builder->str_normalize_replacement);
+  rb_gc_mark(builder->default_tags);
+}
+
+// GC callback to free the wrapper struct and, when caching is enabled, the tags cache table.
+// The names cache is a Ruby object and is reclaimed by the GC itself.
+void
+datagram_builder_free(void *ptr)
+{
+  struct datagram_builder *builder = (struct datagram_builder *)ptr;
+  if (!builder) return;
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  if (builder->normalized_tags_cache) st_free_table(builder->normalized_tags_cache);
+#endif
+  xfree(builder);
+}
+
+// Be nice to ObjectSpace and let the size be known. We'd likely want some feedback on
+// this with various normalized cache size values. The names cache is a separate Ruby
+// object and is not counted here.
+size_t
+datagram_builder_size(const void *ptr)
+{
+  size_t size = sizeof(struct datagram_builder);
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  size += st_memsize(((const struct datagram_builder *)ptr)->normalized_tags_cache);
+#endif
+  return size;
+}
+
+const rb_data_type_t datagram_builder_type = {
+  .wrap_struct_name = "datagram_builder",
+  .function = {
+    .dmark = datagram_builder_mark,
+    .dfree = datagram_builder_free,
+    .dsize = datagram_builder_size,
+  },
+  .data = NULL,
+  .flags = RUBY_TYPED_FREE_IMMEDIATELY,
+};
+
+#define get_datagram_builder_struct(self) \
+  struct datagram_builder *builder = NULL; \
+  TypedData_Get_Struct(self, struct datagram_builder, &datagram_builder_type, builder);
+
+// Allocation function: wraps a zeroed datagram_builder struct in a new instance of klass.
+static VALUE
+datagram_builder_alloc(VALUE klass)
+{
+  VALUE obj;
+  struct datagram_builder *builder = ZALLOC(struct datagram_builder);
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  builder->normalized_tags_cache = st_init_numtable_with_size(NORMALIZED_TAGS_CACHE_MAX);
+#endif
+  // Wrap the struct before allocating any Ruby object: the objects stored below are only
+  // reachable through the struct, so it has to be markable in case an allocation runs the GC.
+  obj = TypedData_Wrap_Struct(klass, &datagram_builder_type, builder);
+#ifdef NORMALIZED_NAMES_CACHE_ENABLED
+  builder->normalized_names_cache = rb_hash_new();
+#endif
+  builder->str_normalize_chars = rb_str_new_cstr(":|@");
+  builder->str_normalize_replacement = rb_str_new_cstr("_");
+  return obj;
+}
+
+// DatagramBuilder#initialize: calls super with the given arguments, then caches the @prefix
+// bytes and the @default_tags ivar on the struct.
+static VALUE
+initialize(int argc, VALUE *argv, VALUE self)
+{
+  VALUE prefix;
+  long chunk_len = 0;
+  get_datagram_builder_struct(self);
+  rb_call_super(argc, argv);
+
+  // pre seed the buffer with the prefix and advance the offset as it's fixed for the lifetime of
+  // the builder
+  prefix = rb_ivar_get(self, idPrefix);
+  if (RB_TYPE_P(prefix, T_STRING) && (chunk_len = RSTRING_LEN(prefix)) != 0 && chunk_len < DATAGRAM_SIZE_MAX) {
+    memcpy(builder->datagram, RSTRING_PTR(prefix), chunk_len);
+    builder->prefix_len = (int)chunk_len;
+  }
+
+  // Cache the default tags ivar on the lookup struct. Anything but a non-empty Array counts as
+  // "no default tags" so that a nil ivar is never read as an Array later on.
+  builder->default_tags = rb_ivar_get(self, idDefaultTags);
+  builder->empty_default_tags = !RB_TYPE_P(builder->default_tags, T_ARRAY) || RARRAY_LEN(builder->default_tags) == 0;
+  return self;
+}
+
+// Returns name itself when it contains none of ':', '|' or '@', and nil when it needs
+// normalizing. Raises TypeError unless name is a String.
+inline static VALUE
+normalize_name_fast_path(VALUE self, VALUE name)
+{
+  const char *cursor, *name_end;
+  Check_Type(name, T_STRING);
+
+  name_end = RSTRING_END(name);
+  for (cursor = RSTRING_PTR(name); cursor < name_end; cursor++) {
+    if (*cursor == ':' || *cursor == '|' || *cursor == '@') return Qnil;
+  }
+  return name;
+}
+
+// Normalizes a metric name: returns it unchanged when it is already clean, otherwise the
+// result of name.tr(":|@", "_").
+static VALUE
+normalize_name(struct datagram_builder *builder, VALUE self, VALUE name) {
+  VALUE _name = normalize_name_fast_path(self, name);
+  if (!NIL_P(_name)) return _name;
+  return rb_funcall(name, idTr, 2, builder->str_normalize_chars, builder->str_normalize_replacement);
+}
+
+/* pure function not exposed to ruby with an intermediate bounded cache: at most
+   NORMALIZED_NAMES_CACHE_MAX names are remembered, later ones are normalized on every call */
+static VALUE
+normalized_names_cached(struct datagram_builder *builder, VALUE self, VALUE name)
+{
+#ifdef NORMALIZED_NAMES_CACHE_ENABLED
+  VALUE cached = rb_hash_lookup2(builder->normalized_names_cache, name, Qnil);
+  if (!NIL_P(cached)) return cached;
+  cached = normalize_name(builder, self, name);
+  if (RHASH_SIZE(builder->normalized_names_cache) < NORMALIZED_NAMES_CACHE_MAX) {
+    rb_hash_aset(builder->normalized_names_cache, name, cached);
+  }
+  return cached;
+#else
+  return normalize_name(builder, self, name);
+#endif
+}
+
+/* pure function not exposed to ruby with an intermediate bounded cache.
+   NOTE(review): entries are keyed on rb_hash(tags) alone, so two tag sets whose hashes collide
+   would share one cached result - confirm this is acceptable (see XXX below) */
+static VALUE
+normalized_tags_cached(struct datagram_builder *builder, VALUE self, VALUE tags)
+{
+#ifdef NORMALIZED_TAGS_CACHE_ENABLED
+  st_index_t key;
+  st_data_t val;
+  VALUE cached;
+  // More involved hashing as we need to hash on the content of the container too
+  // XXX: revisit
+  key = (st_index_t)(FIX2LONG(rb_hash(tags)));
+  if (st_lookup(builder->normalized_tags_cache, key, &val)) return (VALUE)val;
+  cached = rb_funcall(self, idNormalizeTags, 1, tags);
+  if (builder->normalized_tags_cache->num_entries < NORMALIZED_TAGS_CACHE_MAX) {
+    st_insert(builder->normalized_tags_cache, key, (st_data_t)cached);
+  }
+  return cached;
+#else
+  return rb_funcall(self, idNormalizeTags, 1, tags);
+#endif
+}
+
+// Appends len bytes to the datagram buffer. Returns false, leaving the buffer untouched, when
+// they do not fit in the DATAGRAM_SIZE_MAX bytes available.
+inline static bool
+append_bytes(struct datagram_builder *builder, const char *bytes, long len)
+{
+  if (len < 0 || builder->len + len > DATAGRAM_SIZE_MAX) return false;
+  memcpy(builder->datagram + builder->len, bytes, (size_t)len);
+  builder->len += (int)len;
+  return true;
+}
+
+// Appends the tags of the normalized_tags Array separated by commas. The comma after the last
+// tag is left out when trim_trailing_comma is set. Elements go through rb_obj_as_string so a
+// non-String element is converted rather than read as a String. Returns false on overflow.
+inline static bool append_normalized_tags(struct datagram_builder *builder, VALUE normalized_tags, int trim_trailing_comma)
+{
+  VALUE tag;
+  long i;
+  for (i = 0; i < RARRAY_LEN(normalized_tags); ++i) {
+    tag = rb_obj_as_string(RARRAY_AREF(normalized_tags, i));
+    if (!append_bytes(builder, RSTRING_PTR(tag), RSTRING_LEN(tag))) return false;
+    if (!trim_trailing_comma || i < RARRAY_LEN(normalized_tags) - 1) {
+      if (!append_bytes(builder, ",", 1)) return false;
+    }
+  }
+  return true;
+}
+
+// Builds "<prefix><name>:<value>|<type>[|@<sample_rate>][|#<tags>]" in the builder's buffer and
+// returns it as a new String. A piece that does not fit in DATAGRAM_SIZE_MAX bytes ends the
+// datagram right before that piece (the result is truncated, no error is raised).
+static VALUE
+generate_generic_datagram(VALUE self, VALUE name, VALUE value, const char *type, VALUE sample_rate, VALUE tags) {
+  VALUE normalized_name, str_value, str_sample_rate;
+  char sr_buf[SAMPLE_RATE_SIZE_MAX];
+  bool empty_tags = true;
+  int sr_len = 0;
+  get_datagram_builder_struct(self);
+
+  builder->len = builder->prefix_len;
+
+  if (NIL_P(normalized_name = normalize_name_fast_path(self, name))) {
+    normalized_name = normalized_names_cached(builder, self, name);
+  }
+  if (!append_bytes(builder, RSTRING_PTR(normalized_name), RSTRING_LEN(normalized_name))) goto finalize_datagram;
+
+  if (!append_bytes(builder, ":", 1)) goto finalize_datagram;
+  str_value = rb_obj_as_string(value);
+  if (!append_bytes(builder, RSTRING_PTR(str_value), RSTRING_LEN(str_value))) goto finalize_datagram;
+
+  if (!append_bytes(builder, "|", 1)) goto finalize_datagram;
+  if (!append_bytes(builder, type, (long)strlen(type))) goto finalize_datagram;
+
+  // Compare as a double: same outcome as an integer truncation for in-range values, no RangeError
+  if (RTEST(sample_rate) && NUM2DBL(sample_rate) < 1) {
+    if (!append_bytes(builder, "|@", 2)) goto finalize_datagram;
+    if (RB_FIXNUM_P(sample_rate) || RB_FLOAT_TYPE_P(sample_rate)) {
+      if (RB_FIXNUM_P(sample_rate)) {
+        sr_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%d", FIX2INT(sample_rate));
+      } else {
+        sr_len = snprintf(sr_buf, SAMPLE_RATE_SIZE_MAX, "%g", RFLOAT_VALUE(sample_rate));
+      }
+      // snprintf reports the untruncated length; never copy more than it actually wrote
+      if (sr_len >= SAMPLE_RATE_SIZE_MAX) sr_len = SAMPLE_RATE_SIZE_MAX - 1;
+      if (!append_bytes(builder, sr_buf, sr_len)) goto finalize_datagram;
+    } else {
+      str_sample_rate = rb_obj_as_string(sample_rate);
+      if (!append_bytes(builder, RSTRING_PTR(str_sample_rate), RSTRING_LEN(str_sample_rate))) goto finalize_datagram;
+    }
+  }
+
+  if ((RB_TYPE_P(tags, T_HASH) && !RHASH_EMPTY_P(tags)) || (RB_TYPE_P(tags, T_ARRAY) && RARRAY_LEN(tags) != 0)) {
+    empty_tags = false;
+  }
+  if (builder->empty_default_tags && empty_tags) goto finalize_datagram;
+  if (!append_bytes(builder, "|#", 2)) goto finalize_datagram;
+  if (!empty_tags) {
+    // call tags go first; their trailing comma is kept only when default tags follow
+    if (!append_normalized_tags(builder, normalized_tags_cached(builder, self, tags), builder->empty_default_tags)) goto finalize_datagram;
+  }
+  if (!builder->empty_default_tags) {
+    append_normalized_tags(builder, builder->default_tags, 1);
+  }
+
+finalize_datagram:
+  return rb_str_new(builder->datagram, builder->len);
+}
+
+// One method per StatsD metric type (c, g, ms, s, h, d, kv); each passes its own type token to
+// generate_generic_datagram.
+static VALUE metric_c(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "c", sample_rate, tags);
+}
+
+static VALUE metric_g(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "g", sample_rate, tags);
+}
+
+static VALUE metric_ms(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "ms", sample_rate, tags);
+}
+
+static VALUE metric_s(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "s", sample_rate, tags);
+}
+
+static VALUE metric_h(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "h", sample_rate, tags);
+}
+
+static VALUE metric_d(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "d", sample_rate, tags);
+}
+
+static VALUE metric_kv(VALUE self, VALUE name, VALUE value, VALUE sample_rate, VALUE tags)
+{
+  return generate_generic_datagram(self, name, value, "kv", sample_rate, tags);
+}
+
+// Extension entry point: defines StatsD::Instrument::CDatagramBuilder with the C implementations
+// and prepends it to StatsD::Instrument::DatagramBuilder.
+void Init_statsd(void)
+{
+  VALUE mStatsd, mInstrument, cDatagramBuilder, mCDatagramBuilder;
+
+  mStatsd = rb_define_module("StatsD");
+  mInstrument = rb_define_module_under(mStatsd, "Instrument");
+  cDatagramBuilder = rb_define_class_under(mInstrument, "DatagramBuilder", rb_cObject);
+
+  rb_define_alloc_func(cDatagramBuilder, datagram_builder_alloc);
+
+  mCDatagramBuilder = rb_define_module_under(mInstrument, "CDatagramBuilder");
+
+  idTr = rb_intern("tr");
+  idNormalizeTags = rb_intern("normalize_tags");
+  idDefaultTags = rb_intern("@default_tags");
+  idPrefix = rb_intern("@prefix");
+
+  rb_define_method(mCDatagramBuilder, "initialize", initialize, -1);
+  rb_define_method(mCDatagramBuilder, "c", metric_c, 4);
+  rb_define_method(mCDatagramBuilder, "g", metric_g, 4);
+  rb_define_method(mCDatagramBuilder, "ms", metric_ms, 4);
+  rb_define_method(mCDatagramBuilder, "s", metric_s, 4);
+  rb_define_method(mCDatagramBuilder, "h", metric_h, 4);
+  rb_define_method(mCDatagramBuilder, "d", metric_d, 4);
+  rb_define_method(mCDatagramBuilder, "kv", metric_kv, 4);
+
+  rb_prepend_module(cDatagramBuilder, mCDatagramBuilder);
+}
diff --git a/lib/statsd/instrument.rb b/lib/statsd/instrument.rb
index 20d00436..7001669b 100644
--- a/lib/statsd/instrument.rb
+++ b/lib/statsd/instrument.rb
@@ -359,6 +359,7 @@ def singleton_client
 require 'statsd/instrument/helpers'
 require 'statsd/instrument/assertions'
 require 'statsd/instrument/expectation'
+require 'statsd/instrument/ext/statsd'
 require 'statsd/instrument/matchers' if defined?(::RSpec)
 require 'statsd/instrument/railtie' if defined?(::Rails::Railtie)
 require 'statsd/instrument/strict' if ENV['STATSD_STRICT_MODE']
diff --git a/statsd-instrument.gemspec b/statsd-instrument.gemspec
index e6c51900..05a950a7 100644
--- a/statsd-instrument.gemspec
+++ b/statsd-instrument.gemspec
@@ -18,6 +18,7 @@ Gem::Specification.new do |spec|
   spec.files         = `git ls-files`.split($/)
   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.extensions    = ['ext/statsd/extconf.rb']
   spec.require_paths = ["lib"]
 
   spec.add_development_dependency 'rake'
@@ -27,4 +28,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency 'yard'
   spec.add_development_dependency 'rubocop'
   spec.add_development_dependency 'benchmark-ips'
+  spec.add_development_dependency 'rake-compiler'
 end
diff --git a/test/datagram_builder_test.rb b/test/datagram_builder_test.rb
index fe3ec3db..67053b97 100644
--- a/test/datagram_builder_test.rb
+++ b/test/datagram_builder_test.rb
@@ -12,6 +12,7 @@ def test_normalize_name
     assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo|o')
     assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo@o')
     assert_equal 'fo_o', @datagram_builder.send(:normalize_name, 'fo:o')
+    assert_equal 'foo_', @datagram_builder.send(:normalize_name, 'foo:')
   end
 
   def test_normalize_unsupported_tag_names
@@ -115,4 +116,34 @@ def test_default_tags
     datagram = datagram_builder.c('bar', 1, nil, nil)
     assert_equal 'bar:1|c|#foo', datagram
   end
+
+  def test_builder_buffer_overflow
+    # prefix + name overflow
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new(prefix: 'a' * 2047)
+    datagram = datagram_builder.c('b' * 2048, 1, nil, nil)
+    assert_equal(('a' * 2047) + "." + ('b' * 2048), datagram)
+    # name overflows
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new
+    datagram = datagram_builder.c('a' * 4096, 1, nil, nil)
+    assert_equal 'a' * 4096, datagram
+    # value overflows
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new
+    datagram = datagram_builder.c('a' * 4093, 100, nil, nil)
+    assert_equal(('a' * 4093) + ":", datagram)
+    # type overflows
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new
+    datagram = datagram_builder.c('a' * 4093, 1, nil, nil)
+    assert_equal(('a' * 4093) + ":1|", datagram)
+    # sample rate overflows
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new
+    datagram = datagram_builder.c('a' * 4088, 1, 0.5, nil)
+    assert_equal(('a' * 4088) + ":1|c|@", datagram)
+    # tag overflows: Integer#times with a block returns the Integer, so build a real Array of tags
+    tags = Array.new(1000) { |i| "tag:#{i}" }
+    datagram_builder = StatsD::Instrument::DatagramBuilder.new
+    datagram = datagram_builder.c('a' * 2048, 1, 0.5, tags)
+    # 2059 of the 4096 bytes are used up to "|#"; whole "tag:N," chunks then fit up to tag:267
+    expected_tags = (0..267).map { |i| "tag:#{i}," }.join
+    assert_equal(('a' * 2048) + ":1|c|@0.5|#" + expected_tags, datagram)
+  end
 end
diff --git a/test/test_helper.rb b/test/test_helper.rb
index 96a676f3..353b6ea8 100644
--- a/test/test_helper.rb
+++ b/test/test_helper.rb
@@ -17,3 +17,4 @@ def self.strict_mode_enabled?
 end
 
 StatsD.logger = Logger.new(File::NULL)
+GC.stress = true if ENV['GC_STRESS']