diff --git a/src/libtl/libtweetlength.c b/src/libtl/libtweetlength.c index 83b22bd9d..722bcf88a 100644 --- a/src/libtl/libtweetlength.c +++ b/src/libtl/libtweetlength.c @@ -27,6 +27,7 @@ typedef struct { gsize start_character_index; gsize length_in_bytes; gsize length_in_characters; + gsize length_in_weighted_characters; } Token; #ifdef LIBTL_DEBUG @@ -180,7 +181,8 @@ emplace_token (GArray *array, const char *token_start, gsize token_length, gsize start_character_index, - gsize length_in_characters) + gsize length_in_characters, + gsize length_in_weighted_characters) { Token *t; @@ -192,6 +194,7 @@ emplace_token (GArray *array, t->length_in_bytes = token_length; t->start_character_index = start_character_index; t->length_in_characters = length_in_characters; + t->length_in_weighted_characters = length_in_weighted_characters; } static inline void @@ -211,11 +214,13 @@ emplace_entity_for_tokens (GArray *array, e->start = tokens[start_token_index].start; e->length_in_bytes = 0; e->length_in_characters = 0; + e->length_in_weighted_characters = 0; e->start_character_index = tokens[start_token_index].start_character_index; for (i = start_token_index; i <= end_token_index; i ++) { e->length_in_bytes += tokens[i].length_in_bytes; e->length_in_characters += tokens[i].length_in_characters; + e->length_in_weighted_characters += tokens[i].length_in_weighted_characters; } } @@ -339,6 +344,25 @@ entity_length_in_characters (const TlEntity *e) } } +static gsize +weighted_length_for_character (gunichar ch) +{ + // Based on https://developer.twitter.com/en/docs/developer-utilities/twitter-text + // then the following ranges count as "1", everything else is "2": + // * 0 - 4351 (Latin through to Georgian) + // * 8192 - 8205 (Unicode spaces) + // * 8208 - 8223 (Unicode hyphens and smart quotes) + // * 8242 - 8247 (Prime marks) + if ((ch >= 0 && ch <= 4351) || + (ch >= 8192 && ch <= 8205) || + (ch >= 8208 && ch <= 8223) || + (ch >= 8242 && ch <= 8247)) { + return 1; + } else { 
return 2; + } +} + /* * tokenize: * @@ -357,19 +381,21 @@ tokenize (const char *input, gunichar cur_char = g_utf8_get_char (p); gsize cur_length = 0; gsize length_in_chars = 0; + gsize length_in_weighted_chars = 0; guint last_token_type = 0; /* If this char already splits, it's a one-char token */ if (char_splits (cur_char)) { const char *old_p = p; p = g_utf8_next_char (p); - emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1); + emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1, weighted_length_for_character (cur_char)); cur_character_index ++; continue; } last_token_type = token_type_from_char (cur_char); do { + length_in_weighted_chars += weighted_length_for_character (cur_char); const char *old_p = p; p = g_utf8_next_char (p); cur_char = g_utf8_get_char (p); @@ -382,7 +408,7 @@ tokenize (const char *input, } while (!char_splits (cur_char) && p - input < (long)length_in_bytes); - emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars); + emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars, length_in_weighted_chars); cur_character_index += length_in_chars; } @@ -399,6 +425,8 @@ parse_link_tail (GArray *entities, guint i = *current_position; const Token *t; + g_debug ("--------"); + gsize paren_level = 0; int first_paren_index = -1; for (;;) { @@ -413,6 +441,7 @@ parse_link_tail (GArray *entities, if (first_paren_index == -1) { first_paren_index = i; + g_debug ("First paren index: %d", (int)first_paren_index); } paren_level ++; if (paren_level == 3) { @@ -421,7 +450,9 @@ parse_link_tail (GArray *entities, } else if (tokens[i].type == TOK_CLOSE_PAREN) { if (first_paren_index == -1) { first_paren_index = i; + g_debug ("First paren index: %d", (int)first_paren_index); } + g_debug ("Close paren"); paren_level --; } @@ -433,6 +464,8 @@ parse_link_tail (GArray *entities, } } + g_debug ("After i: %u", i); + g_debug ("paren level: %d", (int)paren_level); if (paren_level != 0) { 
g_assert (first_paren_index != -1); i = first_paren_index - 1; // Before that paren @@ -510,6 +543,7 @@ parse_link (GArray *entities, guint tld_index = i; guint tld_iter = i; gboolean tld_found = FALSE; + g_debug ("Looking for TLD starting from %u of %ld", i, n_tokens); while (tld_iter < n_tokens - 1) { const Token *t = &tokens[tld_iter]; @@ -534,10 +568,12 @@ parse_link (GArray *entities, token_is_tld (&tokens[tld_iter + 1], has_protocol)) { tld_index = tld_iter; tld_found = TRUE; + g_debug ("TLD found at %u", tld_iter); } tld_iter ++; } + g_debug ("tld_index: %u", tld_index); if (tld_index >= n_tokens - 1 || !tld_found || @@ -561,6 +597,7 @@ parse_link (GArray *entities, } } + g_debug ("After reading a port: %u", i); // To continue a link, the next token must be a slash or a question mark // If it isn't, we stop here. @@ -587,6 +624,7 @@ parse_link (GArray *entities, } } + g_debug ("end_token = i = %u", i); end_token = i; g_assert (end_token < n_tokens); @@ -883,6 +921,85 @@ tl_count_characters_n (const char *input, return length; } +static inline gsize +entity_length_in_weighted_characters (const TlEntity *e) +{ + switch (e->type) { + case TL_ENT_LINK: + return LINK_LENGTH; + + default: + return e->length_in_weighted_characters; + } +} + +static gsize +count_entities_in_weighted_characters (GArray *entities) +{ + guint i; + gsize sum = 0; + + for (i = 0; i < entities->len; i ++) { + const TlEntity *e = &g_array_index (entities, TlEntity, i); + + sum += entity_length_in_weighted_characters (e); + } + + return sum; +} + +/* + * tl_count_weighted_characters: + * input: (nullable): NUL-terminated tweet text + * + * Returns: The length of @input, in Twitter's weighted characters. 
+ */ +gsize +tl_count_weighted_characters (const char *input) +{ + if (input == NULL || input[0] == '\0') { + return 0; + } + + return tl_count_weighted_characters_n (input, strlen (input)); +} + +/* + * tl_count_weighted_characters_n: + * input: (nullable): Text to measure + * length_in_bytes: Length of @input, in bytes. + * + * Returns: The length of @input, in Twitter's weighted characters. + */ +gsize +tl_count_weighted_characters_n (const char *input, + gsize length_in_bytes) +{ + GArray *tokens; + const Token *token_array; + gsize n_tokens; + GArray *entities; + gsize length; + + if (input == NULL || input[0] == '\0') { + return 0; + } + + // From here on, input/length_in_bytes are trusted to be OK + tokens = tokenize (input, length_in_bytes); + + n_tokens = tokens->len; + token_array = (const Token *)g_array_free (tokens, FALSE); + + entities = parse (token_array, n_tokens, FALSE, NULL); + + length = count_entities_in_weighted_characters (entities); + g_array_free (entities, TRUE); + g_free ((char *)token_array); + + return length; +} + /** * tl_extract_entities: * @input: The input text to extract entities from @@ -934,6 +1051,16 @@ tl_extract_entities_internal (const char *input, guint result_index = 0; tokens = tokenize (input, length_in_bytes); + +#ifdef LIBTL_DEBUG + g_debug ("############ %s: %.*s", __FUNCTION__, (guint)length_in_bytes, input); + for (guint i = 0; i < tokens->len; i ++) { + const Token *t = &g_array_index (tokens, Token, i); + g_debug ("Token %u: Type: %d, Length: %u, Text:%.*s, start char: %u, chars: %u", i, t->type, (guint)t->length_in_bytes, + (int)t->length_in_bytes, t->start, (guint)t->start_character_index, (guint)t->length_in_characters); + } +#endif + n_tokens = tokens->len; token_array = (const Token *)g_array_free (tokens, FALSE); entities = parse (token_array, n_tokens, extract_text_entities, &n_relevant_entities); @@ -941,6 +1068,14 @@ tl_extract_entities_internal (const char *input, *out_text_length = count_entities_in_characters (entities); 
g_free ((char *)token_array); +#ifdef LIBTL_DEBUG + for (guint i = 0; i < entities->len; i ++) { + const TlEntity *e = &g_array_index (entities, TlEntity, i); + g_debug ("TlEntity %u: Text: '%.*s', Type: %u, Bytes: %u, Length: %u, start character: %u", i, (int)e->length_in_bytes, e->start, + e->type, (guint)e->length_in_bytes, (guint)entity_length_in_characters (e), (guint)e->start_character_index); + } +#endif + // Only pass mentions, hashtags and links out result_entities = g_malloc (sizeof (TlEntity) * n_relevant_entities); for (guint i = 0; i < entities->len; i ++) { diff --git a/src/libtl/libtweetlength.h b/src/libtl/libtweetlength.h index 8aef0f177..bc7957ae0 100644 --- a/src/libtl/libtweetlength.h +++ b/src/libtl/libtweetlength.h @@ -27,6 +27,7 @@ struct _TlEntity { gsize start_character_index; gsize length_in_characters; + gsize length_in_weighted_characters; }; typedef struct _TlEntity TlEntity; @@ -41,6 +42,9 @@ typedef enum { gsize tl_count_characters (const char *input); gsize tl_count_characters_n (const char *input, gsize length_in_bytes); +gsize tl_count_weighted_characters (const char *input); +gsize tl_count_weighted_characters_n (const char *input, + gsize length_in_bytes); TlEntity * tl_extract_entities (const char *input, gsize *out_n_entities, gsize *out_text_length); diff --git a/src/window/ComposeTweetWindow.vala b/src/window/ComposeTweetWindow.vala index 4dcb78b99..d4908de7a 100644 --- a/src/window/ComposeTweetWindow.vala +++ b/src/window/ComposeTweetWindow.vala @@ -210,7 +210,7 @@ class ComposeTweetWindow : Gtk.ApplicationWindow { tweet_text.buffer.get_bounds (out start, out end); string text = tweet_text.buffer.get_text (start, end, true); - int length = (int)Tl.count_characters (text); + int length = (int)Tl.count_weighted_characters (text); length_label.label = (Cb.Tweet.MAX_LENGTH - length).to_string (); if (length > 0 && length <= Cb.Tweet.MAX_LENGTH || diff --git a/vapi/libtl.vapi b/vapi/libtl.vapi index 13ab74768..64db7355c 100644 
--- a/vapi/libtl.vapi +++ b/vapi/libtl.vapi @@ -15,12 +15,16 @@ namespace Tl { size_t start_character_index; size_t length_in_characters; size_t length_in_bytes; + size_t length_in_weighted_characters; } [CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")] size_t count_characters (string input); + [CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")] + size_t count_weighted_characters (string input); + [CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h", array_length_pos = 1)] Entity[]? extract_entities (string input, out size_t text_length);