Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 138 additions & 3 deletions src/libtl/libtweetlength.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ typedef struct {
gsize start_character_index;
gsize length_in_bytes;
gsize length_in_characters;
gsize length_in_weighted_characters;
} Token;

#ifdef LIBTL_DEBUG
Expand Down Expand Up @@ -180,7 +181,8 @@ emplace_token (GArray *array,
const char *token_start,
gsize token_length,
gsize start_character_index,
gsize length_in_characters)
gsize length_in_characters,
gsize length_in_weighted_characters)
{
Token *t;

Expand All @@ -192,6 +194,7 @@ emplace_token (GArray *array,
t->length_in_bytes = token_length;
t->start_character_index = start_character_index;
t->length_in_characters = length_in_characters;
t->length_in_weighted_characters = length_in_weighted_characters;
}

static inline void
Expand All @@ -211,11 +214,13 @@ emplace_entity_for_tokens (GArray *array,
e->start = tokens[start_token_index].start;
e->length_in_bytes = 0;
e->length_in_characters = 0;
e->length_in_weighted_characters = 0;
e->start_character_index = tokens[start_token_index].start_character_index;

for (i = start_token_index; i <= end_token_index; i ++) {
e->length_in_bytes += tokens[i].length_in_bytes;
e->length_in_characters += tokens[i].length_in_characters;
e->length_in_weighted_characters += tokens[i].length_in_weighted_characters;
}
}

Expand Down Expand Up @@ -339,6 +344,25 @@ entity_length_in_characters (const TlEntity *e)
}
}

/*
 * weighted_length_for_character:
 * @ch: a Unicode code point
 *
 * Returns: the Twitter "weighted length" of @ch: 1 for code points in
 * the light-weight ranges below, 2 for everything else.
 *
 * Based on https://developer.twitter.com/en/docs/developer-utilities/twitter-text
 * the following ranges count as "1", everything else is "2":
 *   *    0 - 4351 (Latin through to Georgian)
 *   * 8192 - 8205 (Unicode spaces)
 *   * 8208 - 8223 (Unicode hyphens and smart quotes)
 *   * 8242 - 8247 (Prime marks)
 */
static gsize
weighted_length_for_character (gunichar ch)
{
  // gunichar is unsigned, so no lower bound check is needed for the
  // first range. The last range previously read 8424..8247, which is
  // empty and silently gave prime marks (U+2032..U+2037) a weight of 2.
  if (ch <= 4351 ||
      (ch >= 8192 && ch <= 8205) ||
      (ch >= 8208 && ch <= 8223) ||
      (ch >= 8242 && ch <= 8247)) {
    return 1;
  } else {
    return 2;
  }
}

/*
* tokenize:
*
Expand All @@ -357,19 +381,21 @@ tokenize (const char *input,
gunichar cur_char = g_utf8_get_char (p);
gsize cur_length = 0;
gsize length_in_chars = 0;
gsize length_in_weighted_chars = 0;
guint last_token_type = 0;

/* If this char already splits, it's a one-char token */
if (char_splits (cur_char)) {
const char *old_p = p;
p = g_utf8_next_char (p);
emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1);
emplace_token (tokens, cur_start, p - old_p, cur_character_index, 1, weighted_length_for_character (cur_char));
cur_character_index ++;
continue;
}

last_token_type = token_type_from_char (cur_char);
do {
length_in_weighted_chars += weighted_length_for_character (cur_char);
const char *old_p = p;
p = g_utf8_next_char (p);
cur_char = g_utf8_get_char (p);
Expand All @@ -382,7 +408,7 @@ tokenize (const char *input,
} while (!char_splits (cur_char) &&
p - input < (long)length_in_bytes);

emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars);
emplace_token (tokens, cur_start, cur_length, cur_character_index, length_in_chars, length_in_weighted_chars);

cur_character_index += length_in_chars;
}
Expand All @@ -399,6 +425,8 @@ parse_link_tail (GArray *entities,
guint i = *current_position;
const Token *t;

g_debug ("--------");

gsize paren_level = 0;
int first_paren_index = -1;
for (;;) {
Expand All @@ -413,6 +441,7 @@ parse_link_tail (GArray *entities,

if (first_paren_index == -1) {
first_paren_index = i;
g_debug ("First paren index: %d", (int)first_paren_index);
}
paren_level ++;
if (paren_level == 3) {
Expand All @@ -421,7 +450,9 @@ parse_link_tail (GArray *entities,
} else if (tokens[i].type == TOK_CLOSE_PAREN) {
if (first_paren_index == -1) {
first_paren_index = i;
g_debug ("First paren index: %d", (int)first_paren_index);
}
g_debug ("Close paren");
paren_level --;
}

Expand All @@ -433,6 +464,8 @@ parse_link_tail (GArray *entities,
}
}

g_debug ("After i: %u", i);
g_debug ("paren level: %d", (int)paren_level);
if (paren_level != 0) {
g_assert (first_paren_index != -1);
i = first_paren_index - 1; // Before that paren
Expand Down Expand Up @@ -510,6 +543,7 @@ parse_link (GArray *entities,
guint tld_index = i;
guint tld_iter = i;
gboolean tld_found = FALSE;
g_debug ("Looking for TLD starting from %u of %ld", i, n_tokens);
while (tld_iter < n_tokens - 1) {
const Token *t = &tokens[tld_iter];

Expand All @@ -534,10 +568,12 @@ parse_link (GArray *entities,
token_is_tld (&tokens[tld_iter + 1], has_protocol)) {
tld_index = tld_iter;
tld_found = TRUE;
g_debug ("TLD found at %u", tld_iter);
}

tld_iter ++;
}
g_debug ("tld_index: %u", tld_index);

if (tld_index >= n_tokens - 1 ||
!tld_found ||
Expand All @@ -561,6 +597,7 @@ parse_link (GArray *entities,
}
}

g_debug ("After reading a port: %u", i);

// To continue a link, the next token must be a slash or a question mark
// If it isn't, we stop here.
Expand All @@ -587,6 +624,7 @@ parse_link (GArray *entities,
}
}

g_debug ("end_token = i = %u", i);
end_token = i;
g_assert (end_token < n_tokens);

Expand Down Expand Up @@ -883,6 +921,85 @@ tl_count_characters_n (const char *input,
return length;
}

/*
 * entity_length_in_weighted_characters:
 * @e: the entity to measure
 *
 * Returns: the weighted-character length of @e. Links always count as
 * the fixed t.co length, independent of their actual text.
 */
static inline gsize
entity_length_in_weighted_characters (const TlEntity *e)
{
  if (e->type == TL_ENT_LINK) {
    return LINK_LENGTH;
  }

  return e->length_in_weighted_characters;
}

/*
 * count_entities_in_weighted_characters:
 * @entities: array of #TlEntity
 *
 * Returns: the sum of the weighted-character lengths of all entities.
 */
static gsize
count_entities_in_weighted_characters (GArray *entities)
{
  gsize total = 0;

  for (guint k = 0; k < entities->len; k ++) {
    total += entity_length_in_weighted_characters (&g_array_index (entities, TlEntity, k));
  }

  return total;
}

/*
 * tl_count_weighted_characters:
 * input: (nullable): NUL-terminated tweet text
 *
 * Returns: The length of @input, in Twitter's weighted characters.
 */
gsize
tl_count_weighted_characters (const char *input)
{
  // NULL and the empty string both measure zero.
  if (input != NULL && input[0] != '\0') {
    return tl_count_weighted_characters_n (input, strlen (input));
  }

  return 0;
}

/*
 * tl_count_weighted_characters_n:
 * input: (nullable): Text to measure
 * length_in_bytes: Length of @input, in bytes.
 *
 * Returns: The length of @input, in Twitter's weighted characters.
 */
gsize
tl_count_weighted_characters_n (const char *input,
                                gsize       length_in_bytes)
{
  GArray *tokens;
  const Token *token_array;
  gsize n_tokens;
  GArray *entities;
  gsize length;

  if (input == NULL || input[0] == '\0') {
    return 0;
  }

  // From here on, input/length_in_bytes are trusted to be OK
  tokens = tokenize (input, length_in_bytes);

  n_tokens = tokens->len;
  // Free the GArray wrapper but keep the element data; we now own
  // token_array and must g_free it ourselves below.
  token_array = (const Token *)g_array_free (tokens, FALSE);

  entities = parse (token_array, n_tokens, FALSE, NULL);

  length = count_entities_in_weighted_characters (entities);
  g_array_free (entities, TRUE);
  g_free ((char *)token_array);

  return length;
}

/**
* tl_extract_entities:
* @input: The input text to extract entities from
Expand Down Expand Up @@ -934,13 +1051,31 @@ tl_extract_entities_internal (const char *input,
guint result_index = 0;

tokens = tokenize (input, length_in_bytes);

#ifdef LIBTL_DEBUG
g_debug ("############ %s: %.*s", __FUNCTION__, (guint)length_in_bytes, input);
for (guint i = 0; i < tokens->len; i ++) {
const Token *t = &g_array_index (tokens, Token, i);
g_debug ("Token %u: Type: %d, Length: %u, Text:%.*s, start char: %u, chars: %u", i, t->type, (guint)t->length_in_bytes,
(int)t->length_in_bytes, t->start, (guint)t->start_character_index, (guint)t->length_in_characters);
}
#endif

n_tokens = tokens->len;
token_array = (const Token *)g_array_free (tokens, FALSE);
entities = parse (token_array, n_tokens, extract_text_entities, &n_relevant_entities);

*out_text_length = count_entities_in_characters (entities);
g_free ((char *)token_array);

#ifdef LIBTL_DEBUG
for (guint i = 0; i < entities->len; i ++) {
const TlEntity *e = &g_array_index (entities, TlEntity, i);
g_debug ("TlEntity %u: Text: '%.*s', Type: %u, Bytes: %u, Length: %u, start character: %u", i, (int)e->length_in_bytes, e->start,
e->type, (guint)e->length_in_bytes, (guint)entity_length_in_characters (e), (guint)e->start_character_index);
}
#endif

// Only pass mentions, hashtags and links out
result_entities = g_malloc (sizeof (TlEntity) * n_relevant_entities);
for (guint i = 0; i < entities->len; i ++) {
Expand Down
4 changes: 4 additions & 0 deletions src/libtl/libtweetlength.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct _TlEntity {

gsize start_character_index;
gsize length_in_characters;
gsize length_in_weighted_characters;
};
typedef struct _TlEntity TlEntity;

Expand All @@ -41,6 +42,9 @@ typedef enum {
gsize tl_count_characters (const char *input);
gsize tl_count_characters_n (const char *input,
gsize length_in_bytes);
gsize tl_count_weighted_characters (const char *input);
gsize tl_count_weighted_characters_n (const char *input,
gsize length_in_bytes);
TlEntity * tl_extract_entities (const char *input,
gsize *out_n_entities,
gsize *out_text_length);
Expand Down
2 changes: 1 addition & 1 deletion src/window/ComposeTweetWindow.vala
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class ComposeTweetWindow : Gtk.ApplicationWindow {
tweet_text.buffer.get_bounds (out start, out end);
string text = tweet_text.buffer.get_text (start, end, true);

int length = (int)Tl.count_characters (text);
int length = (int)Tl.count_weighted_characters (text);
length_label.label = (Cb.Tweet.MAX_LENGTH - length).to_string ();

if (length > 0 && length <= Cb.Tweet.MAX_LENGTH ||
Expand Down
4 changes: 4 additions & 0 deletions vapi/libtl.vapi
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@ namespace Tl {
size_t start_character_index;
size_t length_in_characters;
size_t length_in_bytes;
size_t length_in_weighted_characters;
}


[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")]
size_t count_characters (string input);

[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h")]
size_t count_weighted_characters (string input);

[CCode (cprefix = "tl_", lower_case_cprefix = "tl_", cheader_filename = "libtl/libtweetlength.h",
array_length_pos = 1)]
Entity[]? extract_entities (string input, out size_t text_length);
Expand Down