Skip to content

Commit 888f9f4

Browse files
authored
Merge pull request #72 from r-uehara0219/feature/varint-length-field
Use varint length field for last_path encoding to support longer GCP object names
2 parents 5713617 + df160ff commit 888f9f4

File tree

3 files changed

+56
-13
lines changed

3 files changed

+56
-13
lines changed

src/main/java/org/embulk/input/gcs/GcsFileInput.java

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,28 +91,50 @@ static FileList listFiles(final PluginTask task) {
9191

9292
// Builds the Base64 continuation token for a given last_path.
// Layout on the added (+) side of this diff:
//   nextToken = base64( 0x0a + varint(UTF-8 byte length of path) + UTF-8 bytes of path )
// The removed (-) lines wrote the length as one raw byte and rejected paths of 128 bytes or more.
// Throws ConfigException when the UTF-8 form of path is 1025 bytes or longer.
9393
static String base64Encode(final String path) {
94-
byte[] encoding;
9594
byte[] utf8 = path.getBytes(StandardCharsets.UTF_8);
9695
LOG.debug("path string: {} ,path length:{} \" + ", path, utf8.length);
9796

9897
int utf8EncodeLength = utf8.length;
99-
if (utf8EncodeLength >= 128) {
100-
throw new ConfigException(String.format("last_path '%s' is too long to encode. Please try to reduce its length", path));
98+
// GCP object names can be up to 1024 bytes in length (measured in UTF-8 bytes, not chars).
99+
// The same 1024-byte limit is checked on task.getLastPath() in GcsFileInputPlugin.transaction.
100+
if (utf8EncodeLength >= 1025) {
101+
throw new ConfigException(String.format("last_path '%s' is too long to encode. Maximum allowed is 1024 bytes", path));
101102
}
102103

103-
encoding = new byte[utf8.length + 2];
104+
byte[] lengthVarint;
105+
byte[] encoding;
106+
lengthVarint = encodeVarint(utf8EncodeLength);
107+
encoding = new byte[1 + lengthVarint.length + utf8.length];
104108
encoding[0] = 0x0a;
105109

106-
// for example: 60 -> '<'
107-
char temp = (char) utf8EncodeLength;
108-
encoding[1] = (byte) temp;
109-
System.arraycopy(utf8, 0, encoding, 2, utf8.length);
110+
System.arraycopy(lengthVarint, 0, encoding, 1, lengthVarint.length);
111+
System.arraycopy(utf8, 0, encoding, 1 + lengthVarint.length, utf8.length);
110112

111113
final String s = Base64.getEncoder().encodeToString(encoding);
112114
LOG.debug("last_path(base64 encoded): {}", s);
113115
return s;
114116
}
115117

118+
// see: https://protobuf.dev/programming-guides/encoding/#varints
119+
static byte[] encodeVarint(int value) {
120+
// utf8EncodeLength.length is up to 65535, so 2 bytes are enough for buffer
121+
byte[] buffer = new byte[2];
122+
int pos = 0;
123+
while (true) {
124+
int bits = value & 0x7F;
125+
value >>>= 7;
126+
if (value != 0) {
127+
buffer[pos++] = (byte) (bits | 0x80);
128+
} else {
129+
buffer[pos++] = (byte) bits;
130+
break;
131+
}
132+
}
133+
byte[] result = new byte[pos];
134+
System.arraycopy(buffer, 0, result, 0, pos);
135+
return result;
136+
}
137+
116138
private static void printBucketInfo(final Storage client, final String bucket) {
117139
// get Bucket
118140
Storage.BucketGetOption fields = Storage.BucketGetOption.fields(

src/main/java/org/embulk/input/gcs/GcsFileInputPlugin.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.embulk.input.gcs;
1818

1919
import java.io.IOException;
20+
import java.nio.charset.StandardCharsets;
2021
import java.util.List;
2122
import java.util.Optional;
2223
import org.embulk.config.ConfigDiff;
@@ -64,10 +65,10 @@ public ConfigDiff transaction(final ConfigSource config, final FileInputPlugin.C
6465
}
6566
}
6667

67-
// @see https://cloud.google.com/storage/docs/bucket-naming
68+
// @see https://cloud.google.com/storage/docs/objects#naming
6869
if (task.getLastPath().isPresent()) {
69-
if (task.getLastPath().get().length() >= 128) {
70-
throw new ConfigException("last_path length is allowed up to 127 characters");
70+
if (task.getLastPath().get().getBytes(StandardCharsets.UTF_8).length >= 1025) {
71+
throw new ConfigException("last_path is too long, which can contain a maximum of 1024 bytes encoded in UTF-8.");
7172
}
7273
}
7374

src/test/java/org/embulk/input/gcs/TestGcsFileInputPlugin.java

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import static org.embulk.input.gcs.GcsFileInputPlugin.CONFIG_MAPPER;
2020
import static org.embulk.input.gcs.GcsFileInputPlugin.CONFIG_MAPPER_FACTORY;
21+
import static org.junit.Assert.assertArrayEquals;
2122
import static org.junit.Assert.assertEquals;
2223
import static org.junit.Assert.assertTrue;
2324
import static org.junit.Assume.assumeNotNull;
@@ -394,15 +395,34 @@ public void testBase64() {
394395
assertEquals("CgJjMg==", GcsFileInput.base64Encode("c2"));
395396
assertEquals("Cgh0ZXN0LmNzdg==", GcsFileInput.base64Encode("test.csv"));
396397
assertEquals("ChZnY3MtdGVzdC9zYW1wbGVfMDEuY3N2", GcsFileInput.base64Encode("gcs-test/sample_01.csv"));
397-
String params = "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc127";
398-
String expected = "Cn9jY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjMTI3";
398+
String params = "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc200";
399+
String expected = "CsgBY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2NjY2MyMDA=";
399400
assertEquals(expected, GcsFileInput.base64Encode(params));
400401

401402
params = "テストダミー/テス123/テストダミー/テストダミ.csv";
402403
expected = "CkPjg4bjgrnjg4jjg4Djg5/jg7wv44OG44K5MTIzL+ODhuOCueODiOODgOODn+ODvC/jg4bjgrnjg4jjg4Djg58uY3N2";
403404
assertEquals(expected, GcsFileInput.base64Encode(params));
404405
}
405406

407+
@Test
408+
public void testEncodeVarint() {
409+
byte[] expected1 = new byte[]{0x01};
410+
byte[] result1 = GcsFileInput.encodeVarint(1);
411+
assertArrayEquals("encodeVarint(1) should return {0x01}", expected1, result1);
412+
413+
byte[] expected127 = new byte[]{0x7F};
414+
byte[] result127 = GcsFileInput.encodeVarint(127);
415+
assertArrayEquals("encodeVarint(127) should return {0x7F}", expected127, result127);
416+
417+
byte[] expected128 = new byte[]{(byte) 0x80, 0x01};
418+
byte[] result128 = GcsFileInput.encodeVarint(128);
419+
assertArrayEquals("encodeVarint(128) should return {0x80, 0x01}", expected128, result128);
420+
421+
byte[] expected1024 = new byte[]{(byte) 0x80, 0x08};
422+
byte[] result1024 = GcsFileInput.encodeVarint(1024);
423+
assertArrayEquals("encodeVarint(1024) should return {0x80, 0x08}", expected1024, result1024);
424+
}
425+
406426
private ConfigSource config() {
407427
ConfigSource config = CONFIG_MAPPER_FACTORY.newConfigSource()
408428
.set("bucket", GCP_BUCKET)

0 commit comments

Comments
 (0)