-
Notifications
You must be signed in to change notification settings - Fork 1
Optimized text for full unicode and some escape sequences (#129169) #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: pr_017_before
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -13,16 +13,14 @@ | |||||
| import com.fasterxml.jackson.core.JsonParser; | ||||||
| import com.fasterxml.jackson.core.JsonToken; | ||||||
|
|
||||||
| import org.elasticsearch.common.Strings; | ||||||
| import org.elasticsearch.core.CheckedConsumer; | ||||||
| import org.elasticsearch.test.ESTestCase; | ||||||
| import org.elasticsearch.xcontent.XContentBuilder; | ||||||
| import org.elasticsearch.xcontent.XContentString; | ||||||
| import org.elasticsearch.xcontent.json.JsonXContent; | ||||||
| import org.hamcrest.Matchers; | ||||||
|
|
||||||
| import java.io.IOException; | ||||||
| import java.nio.charset.StandardCharsets; | ||||||
| import java.util.Locale; | ||||||
|
|
||||||
| public class ESUTF8StreamJsonParserTests extends ESTestCase { | ||||||
|
|
||||||
|
|
@@ -45,11 +43,13 @@ public void testGetValueAsText() throws IOException { | |||||
| assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); | ||||||
| assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); | ||||||
|
|
||||||
| var textRef = parser.getValueAsText().bytes(); | ||||||
| assertThat(textRef, Matchers.notNullValue()); | ||||||
| assertThat(textRef.offset(), Matchers.equalTo(9)); | ||||||
| assertThat(textRef.offset() + textRef.length(), Matchers.equalTo(12)); | ||||||
| assertTextRef(textRef, "bar"); | ||||||
| var text = parser.getValueAsText(); | ||||||
| assertThat(text, Matchers.notNullValue()); | ||||||
|
|
||||||
| var bytes = text.bytes(); | ||||||
| assertThat(bytes.offset(), Matchers.equalTo(9)); | ||||||
| assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(12)); | ||||||
| assertTextRef(bytes, "bar"); | ||||||
|
|
||||||
| assertThat(parser.getValueAsString(), Matchers.equalTo("bar")); | ||||||
| assertThat(parser.getValueAsText(), Matchers.nullValue()); | ||||||
|
|
@@ -62,17 +62,36 @@ public void testGetValueAsText() throws IOException { | |||||
| assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); | ||||||
| assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); | ||||||
|
|
||||||
| var text = parser.getValueAsText(); | ||||||
| assertThat(text, Matchers.notNullValue()); | ||||||
| assertTextRef(text.bytes(), "bar\"baz\""); | ||||||
| }); | ||||||
|
|
||||||
| testParseJson("{\"foo\": \"b\\u00e5r\"}", parser -> { | ||||||
| assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); | ||||||
| assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); | ||||||
| assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); | ||||||
|
|
||||||
| assertThat(parser.getValueAsText(), Matchers.nullValue()); | ||||||
| assertThat(parser.getValueAsString(), Matchers.equalTo("bar\"baz\"")); | ||||||
| assertThat(parser.getValueAsString(), Matchers.equalTo("bår")); | ||||||
| }); | ||||||
|
|
||||||
| testParseJson("{\"foo\": \"bår\"}", parser -> { | ||||||
| assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.START_OBJECT)); | ||||||
| assertThat(parser.nextFieldName(), Matchers.equalTo("foo")); | ||||||
| assertThat(parser.nextValue(), Matchers.equalTo(JsonToken.VALUE_STRING)); | ||||||
|
|
||||||
| assertThat(parser.getValueAsText(), Matchers.nullValue()); | ||||||
| var text = parser.getValueAsText(); | ||||||
| assertThat(text, Matchers.notNullValue()); | ||||||
|
|
||||||
| var bytes = text.bytes(); | ||||||
| assertThat(bytes.offset(), Matchers.equalTo(9)); | ||||||
| assertThat(bytes.offset() + bytes.length(), Matchers.equalTo(13)); | ||||||
| assertTextRef(bytes, "bår"); | ||||||
|
|
||||||
| assertThat(parser.getValueAsString(), Matchers.equalTo("bår")); | ||||||
|
|
||||||
| assertThat(parser.nextToken(), Matchers.equalTo(JsonToken.END_OBJECT)); | ||||||
| }); | ||||||
|
|
||||||
| testParseJson("{\"foo\": [\"lorem\", \"ipsum\", \"dolor\"]}", parser -> { | ||||||
|
|
@@ -112,43 +131,97 @@ public void testGetValueAsText() throws IOException { | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
| private boolean validForTextRef(String value) { | ||||||
| for (char c : value.toCharArray()) { | ||||||
| if (c == '"') { | ||||||
| return false; | ||||||
| private record TestInput(String input, String result, boolean supportsOptimized) {} | ||||||
|
|
||||||
| private static final TestInput[] ESCAPE_SEQUENCES = { | ||||||
| new TestInput("\\b", "\b", false), | ||||||
| new TestInput("\\t", "\t", false), | ||||||
| new TestInput("\\n", "\n", false), | ||||||
| new TestInput("\\f", "\f", false), | ||||||
| new TestInput("\\r", "\r", false), | ||||||
| new TestInput("\\\"", "\"", true), | ||||||
| new TestInput("\\/", "/", true), | ||||||
| new TestInput("\\\\", "\\", true) }; | ||||||
|
|
||||||
| private int randomCodepoint(boolean includeAscii) { | ||||||
| while (true) { | ||||||
| char val = Character.toChars(randomInt(0xFFFF))[0]; | ||||||
|
||||||
| char val = Character.toChars(randomInt(0xFFFF))[0]; | |
| char val = (char) randomInt(0xFFFF); |
Copilot
AI
Jan 31, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The magic number 9 in the switch statement is unclear. This should be documented or replaced with a named constant that explains why 10 possible values (0-9) are used, where case 0 handles escape sequences, case 1 handles unicode escapes, and default handles unicode characters. The probability distribution (1/10 for escapes, 1/10 for unicode escapes, 8/10 for unicode chars) appears intentional but is not obvious.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using an ArrayList as an instance field that's cleared and reused could lead to memory retention if large strings with many backslashes are parsed. Consider using a smaller initial capacity or switching to a primitive int array with manual size tracking to reduce allocation overhead and memory footprint for the common case of few or no backslashes.