Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 138 additions & 3 deletions src/libtl/libtweetlength.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef struct {
gsize start_character_index;
gsize length_in_bytes;
gsize length_in_characters;
gsize length_in_weighted_characters;
} Token;

#ifdef LIBTL_DEBUG
Expand Down Expand Up @@ -180,7 +181,8 @@ emplace_token (GArray *array,
const char *token_start,
gsize token_length,
gsize start_character_index,
gsize length_in_characters)
gsize length_in_characters,
gsize length_in_weighted_characters)
{
Token *t;

Expand All @@ -192,6 +194,7 @@ emplace_token (GArray *array,
t->length_in_bytes = token_length;
t->start_character_index = start_character_index;
t->length_in_characters = length_in_characters;
t->length_in_weighted_characters = length_in_weighted_characters;
}

static inline void
Expand All @@ -211,11 +214,13 @@ emplace_entity_for_tokens (GArray *array,
e->start = tokens[start_token_index].start;
e->length_in_bytes = 0;
e->length_in_characters = 0;
e->length_in_weighted_characters = 0;
e->start_character_index = tokens[start_token_index].start_character_index;

for (i = start_token_index; i <= end_token_index; i ++) {
e->length_in_bytes += tokens[i].length_in_bytes;
e->length_in_characters += tokens[i].length_in_characters;
e->length_in_weighted_characters += tokens[i].length_in_weighted_characters;
}
}

Expand Down Expand Up @@ -339,6 +344,25 @@ entity_length_in_characters (const TlEntity *e)
}
}

/*
 * weighted_length_for_character:
 * @ch: a Unicode code point
 *
 * Returns: the Twitter "weighted length" of @ch: 1 for code points in
 * the light-weight ranges below, 2 for everything else.
 *
 * Based on https://developer.twitter.com/en/docs/developer-utilities/twitter-text
 * the following ranges count as "1", everything else is "2":
 *   *    0 - 4351 (Latin through to Georgian)
 *   * 8192 - 8205 (Unicode spaces)
 *   * 8208 - 8223 (Unicode hyphens and smart quotes)
 *   * 8242 - 8247 (Prime marks)
 */
static gsize
weighted_length_for_character (gunichar ch)
{
  // gunichar is unsigned, so no lower bound check is needed for the
  // first range. The last range previously read 8424..8247, which is
  // empty and silently gave prime marks (U+2032..U+2037) a weight of 2.
  if (ch <= 4351 ||
      (ch >= 8192 && ch <= 8205) ||
      (ch >= 8208 && ch <= 8223) ||
      (ch >= 8242 && ch <= 8247)) {
    return 1;
  } else {
    return 2;
  }
}

/*
* tokenize:
*
Expand All @@ -357,19 +381,21 @@ tokenize (const char *input,
gunichar cur_char = g_utf8_get_char (p);
gsize cur_length = 0;
gsize length_in_chars = 0;
gsize length_in_weighted_chars = 0;
guint last_token_type = 0;

/* If this char already splits, it's a one-char token */
if (char_splits (cur_char)) {
const char *old_p = p;
p = g_utf8_next_char (p);
emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1);
emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1, weighted_length_for_character (cur_char));
cur_character_index ++;
continue;
}

last_token_type = token_type_from_char (cur_char);
do {
length_in_weighted_chars += weighted_length_for_character (cur_char);
const char *old_p = p;
p = g_utf8_next_char (p);
cur_char = g_utf8_get_char (p);
Expand All @@ -382,7 +408,7 @@ tokenize (const char *input,
} while (!char_splits (cur_char) &&
p - input < (long)length_in_bytes);

emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars);
emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars, length_in_weighted_chars);

cur_character_index += length_in_chars;
}
Expand All @@ -399,6 +425,8 @@ parse_link_tail (GArray *entities,
guint i = *current_position;
const Token *t;

g_debug ("--------");

gsize paren_level = 0;
int first_paren_index = -1;
for (;;) {
Expand All @@ -413,6 +441,7 @@ parse_link_tail (GArray *entities,

if (first_paren_index == -1) {
first_paren_index = i;
g_debug ("First paren index: %d", (int)first_paren_index);
}
paren_level ++;
if (paren_level == 3) {
Expand All @@ -421,7 +450,9 @@ parse_link_tail (GArray *entities,
} else if (tokens[i].type == TOK_CLOSE_PAREN) {
if (first_paren_index == -1) {
first_paren_index = i;
g_debug ("First paren index: %d", (int)first_paren_index);
}
g_debug ("Close paren");
paren_level --;
}

Expand All @@ -433,6 +464,8 @@ parse_link_tail (GArray *entities,
}
}

g_debug ("After i: %u", i);
g_debug ("paren level: %d", (int)paren_level);
if (paren_level != 0) {
g_assert (first_paren_index != -1);
i = first_paren_index - 1; // Before that paren
Expand Down Expand Up @@ -510,6 +543,7 @@ parse_link (GArray *entities,
guint tld_index = i;
guint tld_iter = i;
gboolean tld_found = FALSE;
g_debug ("Looking for TLD starting from %u of %ld", i, n_tokens);
while (tld_iter < n_tokens - 1) {
const Token *t = &tokens[tld_iter];

Expand All @@ -534,10 +568,12 @@ parse_link (GArray *entities,
token_is_tld (&tokens[tld_iter + 1], has_protocol)) {
tld_index = tld_iter;
tld_found = TRUE;
g_debug ("TLD found at %u", tld_iter);
}

tld_iter ++;
}
g_debug ("tld_index: %u", tld_index);

if (tld_index >= n_tokens - 1 ||
!tld_found ||
Expand All @@ -561,6 +597,7 @@ parse_link (GArray *entities,
}
}

g_debug ("After reading a port: %u", i);

// To continue a link, the next token must be a slash or a question mark
// If it isn't, we stop here.
Expand All @@ -587,6 +624,7 @@ parse_link (GArray *entities,
}
}

g_debug ("end_token = i = %u", i);
end_token = i;
g_assert (end_token < n_tokens);

Expand Down Expand Up @@ -883,6 +921,85 @@ tl_count_characters_n (const char *input,
return length;
}

/*
 * entity_length_in_weighted_characters:
 * @e: the entity to measure
 *
 * Returns: the weighted-character length of @e. Links always count as
 * the fixed t.co length, independent of their actual text.
 */
static inline gsize
entity_length_in_weighted_characters (const TlEntity *e)
{
  if (e->type == TL_ENT_LINK) {
    return LINK_LENGTH;
  }

  return e->length_in_weighted_characters;
}

/*
 * count_entities_in_weighted_characters:
 * @entities: array of #TlEntity
 *
 * Returns: the sum of the weighted-character lengths of all entities.
 */
static gsize
count_entities_in_weighted_characters (GArray *entities)
{
  gsize total = 0;

  for (guint k = 0; k < entities->len; k ++) {
    total += entity_length_in_weighted_characters (&g_array_index (entities, TlEntity, k));
  }

  return total;
}

/*
 * tl_count_weighted_characters:
 * input: (nullable): NUL-terminated tweet text
 *
 * Returns: The length of @input, in Twitter's weighted characters.
 */
gsize
tl_count_weighted_characters (const char *input)
{
  // NULL and the empty string both measure zero.
  if (input != NULL && input[0] != '\0') {
    return tl_count_weighted_characters_n (input, strlen (input));
  }

  return 0;
}

/*
 * tl_count_weighted_characters_n:
 * input: (nullable): Text to measure
 * length_in_bytes: Length of @input, in bytes.
 *
 * Returns: The length of @input, in Twitter's weighted characters.
 */
gsize
tl_count_weighted_characters_n (const char *input,
                                gsize       length_in_bytes)
{
  GArray *tokens;
  const Token *token_array;
  gsize n_tokens;
  GArray *entities;
  gsize length;

  if (input == NULL || input[0] == '\0') {
    return 0;
  }

  // From here on, input/length_in_bytes are trusted to be OK
  tokens = tokenize (input, length_in_bytes);

  n_tokens = tokens->len;
  // Free the GArray wrapper but keep the element data; we now own
  // token_array and must g_free it ourselves below.
  token_array = (const Token *)g_array_free (tokens, FALSE);

  entities = parse (token_array, n_tokens, FALSE, NULL);

  length = count_entities_in_weighted_characters (entities);
  g_array_free (entities, TRUE);
  g_free ((char *)token_array);

  return length;
}

/**
* tl_extract_entities:
* @input: The input text to extract entities from
Expand Down Expand Up @@ -934,13 +1051,31 @@ tl_extract_entities_internal (const char *input,
guint result_index = 0;

tokens = tokenize (input, length_in_bytes);

#ifdef LIBTL_DEBUG
g_debug ("############ %s: %.*s", __FUNCTION__, (guint)length_in_bytes, input);
for (guint i = 0; i < tokens->len; i ++) {
const Token *t = &g_array_index (tokens, Token, i);
g_debug ("Token %u: Type: %d, Length: %u, Text:%.*s, start char: %u, chars: %u", i, t->type, (guint)t->length_in_bytes,
(int)t->length_in_bytes, t->start, (guint)t->start_character_index, (guint)t->length_in_characters);
}
#endif

n_tokens = tokens->len;
token_array = (const Token *)g_array_free (tokens, FALSE);
entities = parse (token_array, n_tokens, extract_text_entities, &n_relevant_entities);

*out_text_length = count_entities_in_characters (entities);
g_free ((char *)token_array);

#ifdef LIBTL_DEBUG
for (guint i = 0; i < entities->len; i ++) {
const TlEntity *e = &g_array_index (entities, TlEntity, i);
g_debug ("TlEntity %u: Text: '%.*s', Type: %u, Bytes: %u, Length: %u, start character: %u", i, (int)e->length_in_bytes, e->start,
e->type, (guint)e->length_in_bytes, (guint)entity_length_in_characters (e), (guint)e->start_character_index);
}
#endif

// Only pass mentions, hashtags and links out
result_entities = g_malloc (sizeof (TlEntity) * n_relevant_entities);
for (guint i = 0; i < entities->len; i ++) {
Expand Down
4 changes: 4 additions & 0 deletions src/libtl/libtweetlength.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct _TlEntity {

gsize start_character_index;
gsize length_in_characters;
gsize length_in_weighted_characters;
};
typedef struct _TlEntity TlEntity;

Expand All @@ -41,6 +42,9 @@ typedef enum {
gsize tl_count_characters (const char *input);
gsize tl_count_characters_n (const char *input,
gsize length_in_bytes);
gsize tl_count_weighted_characters (const char *input);
gsize tl_count_weighted_characters_n (const char *input,
gsize length_in_bytes);
TlEntity * tl_extract_entities (const char *input,
gsize *out_n_entities,
gsize *out_text_length);
Expand Down
2 changes: 1 addition & 1 deletion src/window/ComposeTweetWindow.vala
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class ComposeTweetWindow : Gtk.ApplicationWindow {
tweet_text.buffer.get_bounds (out start, out end);
string text = tweet_text.buffer.get_text (start, end, true);

int length = (int)Tl.count_characters (text);
int length = (int)Tl.count_weighted_characters (text);
length_label.label = (Cb.Tweet.MAX_LENGTH - length).to_string ();

if (length > 0 && length <= Cb.Tweet.MAX_LENGTH ||
Expand Down
4 changes: 4 additions & 0 deletions vapi/libtl.vapi
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@ namespace Tl {
size_t start_character_index;
size_t length_in_characters;
size_t length_in_bytes;
size_t length_in_weighted_characters;
}


[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")]
size_t count_characters (string input);

[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")]
size_t count_weighted_characters (string input);

[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h",
array_length_pos = 1)]
Entity[]? extract_entities (string input, out size_t text_length);
Expand Down