Skip to content

Use short string in Variant when possible #13284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ static byte arrayHeader(boolean isLarge, int offsetSize) {
return (byte) ((isLarge ? 0b10000 : 0) | (offsetSize - 1) << 2 | 0b11);
}

static byte shortStringHeader(int length) {
return (byte) ((length << 2) | BASIC_TYPE_SHORT_STRING);
}

static BasicType basicType(int header) {
int basicType = header & BASIC_TYPE_MASK;
switch (basicType) {
Expand Down
104 changes: 33 additions & 71 deletions api/src/test/java/org/apache/iceberg/variants/TestSerializedArray.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import java.util.Random;
import org.apache.iceberg.util.RandomUtil;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

public class TestSerializedArray {
private static final VariantMetadata EMPTY_METADATA = SerializedMetadata.EMPTY_V1_METADATA;
Expand Down Expand Up @@ -75,16 +77,11 @@ public void testStringArray() {

assertThat(array.type()).isEqualTo(PhysicalType.ARRAY);
assertThat(array.numElements()).isEqualTo(5);
assertThat(array.get(0).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(0).asPrimitive().get()).isEqualTo("a");
assertThat(array.get(1).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(1).asPrimitive().get()).isEqualTo("b");
assertThat(array.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(2).asPrimitive().get()).isEqualTo("c");
assertThat(array.get(3).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(3).asPrimitive().get()).isEqualTo("d");
assertThat(array.get(4).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(4).asPrimitive().get()).isEqualTo("e");
VariantTestUtil.assertVariantString(array.get(0), "a");
VariantTestUtil.assertVariantString(array.get(1), "b");
VariantTestUtil.assertVariantString(array.get(2), "c");
VariantTestUtil.assertVariantString(array.get(3), "d");
VariantTestUtil.assertVariantString(array.get(4), "e");

assertThatThrownBy(() -> array.get(5))
.isInstanceOf(ArrayIndexOutOfBoundsException.class)
Expand All @@ -98,18 +95,12 @@ public void testStringDifferentLengths() {

assertThat(array.type()).isEqualTo(PhysicalType.ARRAY);
assertThat(array.numElements()).isEqualTo(6);
assertThat(array.get(0).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(0).asPrimitive().get()).isEqualTo("a");
assertThat(array.get(1).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(1).asPrimitive().get()).isEqualTo("b");
assertThat(array.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(2).asPrimitive().get()).isEqualTo("c");
assertThat(array.get(3).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(3).asPrimitive().get()).isEqualTo("iceberg");
assertThat(array.get(4).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(4).asPrimitive().get()).isEqualTo("d");
assertThat(array.get(5).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(5).asPrimitive().get()).isEqualTo("e");
VariantTestUtil.assertVariantString(array.get(0), "a");
VariantTestUtil.assertVariantString(array.get(1), "b");
VariantTestUtil.assertVariantString(array.get(2), "c");
VariantTestUtil.assertVariantString(array.get(3), "iceberg");
VariantTestUtil.assertVariantString(array.get(4), "d");
VariantTestUtil.assertVariantString(array.get(5), "e");

assertThatThrownBy(() -> array.get(6))
.isInstanceOf(ArrayIndexOutOfBoundsException.class)
Expand All @@ -131,14 +122,11 @@ public void testArrayOfMixedTypes() {
assertThat(array.get(0).asPrimitive().get()).isEqualTo(17396);
assertThat(array.get(1).type()).isEqualTo(PhysicalType.INT8);
assertThat(array.get(1).asPrimitive().get()).isEqualTo((byte) 34);
assertThat(array.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(2).asPrimitive().get()).isEqualTo("iceberg");
VariantTestUtil.assertVariantString(array.get(2), "iceberg");
assertThat(array.get(3).type()).isEqualTo(PhysicalType.NULL);
assertThat(array.get(3).asPrimitive().get()).isEqualTo(null);
assertThat(array.get(4).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(4).asPrimitive().get()).isEqualTo("e");
assertThat(array.get(5).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(5).asPrimitive().get()).isEqualTo("b");
VariantTestUtil.assertVariantString(array.get(4), "e");
VariantTestUtil.assertVariantString(array.get(5), "b");
assertThat(array.get(6).type()).isEqualTo(PhysicalType.BOOLEAN_FALSE);
assertThat(array.get(6).asPrimitive().get()).isEqualTo(false);
assertThat(array.get(8).type()).isEqualTo(PhysicalType.BOOLEAN_TRUE);
Expand All @@ -153,62 +141,36 @@ public void testArrayOfMixedTypes() {
assertThat(array.get(7).type()).isEqualTo(PhysicalType.ARRAY);
SerializedArray actualNested = (SerializedArray) array.get(7);
assertThat(actualNested.numElements()).isEqualTo(3);
assertThat(actualNested.get(0).type()).isEqualTo(PhysicalType.STRING);
assertThat(actualNested.get(0).asPrimitive().get()).isEqualTo("a");
assertThat(actualNested.get(1).type()).isEqualTo(PhysicalType.STRING);
assertThat(actualNested.get(1).asPrimitive().get()).isEqualTo("c");
assertThat(actualNested.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(actualNested.get(2).asPrimitive().get()).isEqualTo("d");
VariantTestUtil.assertVariantString(actualNested.get(0), "a");
VariantTestUtil.assertVariantString(actualNested.get(1), "c");
VariantTestUtil.assertVariantString(actualNested.get(2), "d");

assertThatThrownBy(() -> actualNested.get(3))
.isInstanceOf(ArrayIndexOutOfBoundsException.class)
.hasMessage("Index 3 out of bounds for length 3");
}

@Test
public void testTwoByteOffsets() {
// a string larger than 255 bytes to push the value offset size above 1 byte
String randomString = RandomUtil.generateString(300, random);
@ParameterizedTest
@ValueSource(
ints = {
300, // a big string larger than 255 bytes to push the value offset size above 1 byte to
// test TwoByteOffsets
70_000 // a really-big string larger than 65535 bytes to push the value offset size above 1
// byte to test ThreeByteOffsets
})
public void testMultiByteOffsets(int multiByteOffset) {
String randomString = RandomUtil.generateString(multiByteOffset, random);
SerializedPrimitive bigString = VariantTestUtil.createString(randomString);

ByteBuffer buffer = VariantTestUtil.createArray(bigString, A, B, C);
SerializedArray array = SerializedArray.from(EMPTY_METADATA, buffer, buffer.get(0));

assertThat(array.type()).isEqualTo(PhysicalType.ARRAY);
assertThat(array.numElements()).isEqualTo(4);
assertThat(array.get(0).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(0).asPrimitive().get()).isEqualTo(randomString);
assertThat(array.get(1).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(1).asPrimitive().get()).isEqualTo("a");
assertThat(array.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(2).asPrimitive().get()).isEqualTo("b");
assertThat(array.get(3).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(3).asPrimitive().get()).isEqualTo("c");

assertThatThrownBy(() -> array.get(4))
.isInstanceOf(ArrayIndexOutOfBoundsException.class)
.hasMessage("Index 4 out of bounds for length 4");
}

@Test
public void testThreeByteOffsets() {
// a string larger than 65535 bytes to push the value offset size above 1 byte
String randomString = RandomUtil.generateString(70_000, random);
SerializedPrimitive reallyBigString = VariantTestUtil.createString(randomString);

ByteBuffer buffer = VariantTestUtil.createArray(reallyBigString, A, B, C);
SerializedArray array = SerializedArray.from(EMPTY_METADATA, buffer, buffer.get(0));

assertThat(array.type()).isEqualTo(PhysicalType.ARRAY);
assertThat(array.numElements()).isEqualTo(4);
assertThat(array.get(0).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(0).asPrimitive().get()).isEqualTo(randomString);
assertThat(array.get(1).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(1).asPrimitive().get()).isEqualTo("a");
assertThat(array.get(2).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(2).asPrimitive().get()).isEqualTo("b");
assertThat(array.get(3).type()).isEqualTo(PhysicalType.STRING);
assertThat(array.get(3).asPrimitive().get()).isEqualTo("c");
VariantTestUtil.assertVariantString(array.get(0), randomString);
VariantTestUtil.assertVariantString(array.get(1), "a");
VariantTestUtil.assertVariantString(array.get(2), "b");
VariantTestUtil.assertVariantString(array.get(3), "c");

assertThatThrownBy(() -> array.get(4))
.isInstanceOf(ArrayIndexOutOfBoundsException.class)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,70 +182,59 @@ public void testMixedValueTypes() {
assertThat(actualInner.get("f").asPrimitive().get()).isEqualTo((byte) 3);
}

@Test
public void testTwoByteOffsets() {
// a string larger than 255 bytes to push the value offset size above 1 byte
String randomString = RandomUtil.generateString(300, random);
@ParameterizedTest
@ValueSource(
ints = {
300, // a big string larger than 255 bytes to push the value offset size above 1 byte to
// test TwoByteOffsets
70_000 // a really-big string larger than 65535 bytes to push the value offset size above 1
// byte to test ThreeByteOffsets
})
public void testMultiByteOffsets(int multiByteOffset) {
String randomString = RandomUtil.generateString(multiByteOffset, random);
SerializedPrimitive bigString = VariantTestUtil.createString(randomString);

// note that order doesn't matter. fields are sorted by name
Map<String, VariantValue> data = ImmutableMap.of("big", bigString, "a", I1, "b", I2, "c", I3);
ByteBuffer meta = VariantTestUtil.createMetadata(data.keySet(), true /* sort names */);
ByteBuffer value = VariantTestUtil.createObject(meta, data);

VariantMetadata metadata = VariantMetadata.from(meta);
SerializedObject object = SerializedObject.from(metadata, value, value.get(0));

assertThat(object.type()).isEqualTo(PhysicalType.OBJECT);
assertThat(object.numFields()).isEqualTo(4);

assertThat(object.get("a").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("a").asPrimitive().get()).isEqualTo((byte) 1);
assertThat(object.get("b").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("b").asPrimitive().get()).isEqualTo((byte) 2);
assertThat(object.get("c").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("c").asPrimitive().get()).isEqualTo((byte) 3);
assertThat(object.get("big").type()).isEqualTo(PhysicalType.STRING);
assertThat(object.get("big").asPrimitive().get()).isEqualTo(randomString);
}

@Test
public void testThreeByteOffsets() {
// a string larger than 65535 bytes to push the value offset size above 1 byte
String randomString = RandomUtil.generateString(70_000, random);
SerializedPrimitive reallyBigString = VariantTestUtil.createString(randomString);

// note that order doesn't matter. fields are sorted by name
Map<String, VariantValue> data =
ImmutableMap.of("really-big", reallyBigString, "a", I1, "b", I2, "c", I3);
ImmutableMap.of(
"small",
VariantTestUtil.createShortString("iceberg"),
"big",
bigString,
"a",
I1,
"b",
I2,
"c",
I3);
ByteBuffer meta = VariantTestUtil.createMetadata(data.keySet(), true /* sort names */);
ByteBuffer value = VariantTestUtil.createObject(meta, data);

VariantMetadata metadata = VariantMetadata.from(meta);
SerializedObject object = SerializedObject.from(metadata, value, value.get(0));

assertThat(object.type()).isEqualTo(PhysicalType.OBJECT);
assertThat(object.numFields()).isEqualTo(4);
assertThat(object.numFields()).isEqualTo(5);

assertThat(object.get("a").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("a").asPrimitive().get()).isEqualTo((byte) 1);
assertThat(object.get("b").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("b").asPrimitive().get()).isEqualTo((byte) 2);
assertThat(object.get("c").type()).isEqualTo(PhysicalType.INT8);
assertThat(object.get("c").asPrimitive().get()).isEqualTo((byte) 3);
assertThat(object.get("really-big").type()).isEqualTo(PhysicalType.STRING);
assertThat(object.get("really-big").asPrimitive().get()).isEqualTo(randomString);
VariantTestUtil.assertVariantString(object.get("big"), randomString);
VariantTestUtil.assertVariantString(object.get("small"), "iceberg");
}

@ParameterizedTest
@ValueSource(booleans = {true, false})
@SuppressWarnings({"unchecked", "rawtypes"})
public void testLargeObject(boolean sortFieldNames) {
Map<String, SerializedPrimitive> fields = Maps.newHashMap();
Map<String, VariantPrimitive<?>> fields = Maps.newHashMap();
for (int i = 0; i < 10_000; i += 1) {
fields.put(
RandomUtil.generateString(10, random),
VariantTestUtil.createString(RandomUtil.generateString(10, random)));
VariantTestUtil.createShortString(RandomUtil.generateString(10, random)));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the change here? The test should work with the previous method correct? I don't think it's broken using the larger header and since this test is explicitly trying to to make a large object shouldn't we want that?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, because assertVariant is assuming that a string this small should be a short string

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably should clean up the testUtil then to have createString use the correct layout depending on the input string length. Better to not leave in a method that isn't correct.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure whether I understood this completely. I think newly added testLargeObjectUsingShortStringWithBigHeader covers this too.

}

ByteBuffer meta = VariantTestUtil.createMetadata(fields.keySet(), sortFieldNames);
Expand All @@ -257,10 +246,9 @@ public void testLargeObject(boolean sortFieldNames) {
assertThat(object.type()).isEqualTo(PhysicalType.OBJECT);
assertThat(object.numFields()).isEqualTo(10_000);

for (Map.Entry<String, SerializedPrimitive> entry : fields.entrySet()) {
for (Map.Entry<String, VariantPrimitive<?>> entry : fields.entrySet()) {
VariantValue fieldValue = object.get(entry.getKey());
assertThat(fieldValue.type()).isEqualTo(PhysicalType.STRING);
assertThat(fieldValue.asPrimitive().get()).isEqualTo(entry.getValue().get());
VariantTestUtil.assertVariantString(fieldValue, entry.getValue().get().toString());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ public void testString() {

assertThat(value.type()).isEqualTo(PhysicalType.STRING);
assertThat(value.get()).isEqualTo("iceberg");
assertThat(value.sizeInBytes()).isEqualTo(12);
}

@Test
Expand All @@ -451,6 +452,7 @@ public void testShortString() {

assertThat(value.type()).isEqualTo(PhysicalType.STRING);
assertThat(value.get()).isEqualTo("iceberg");
assertThat(value.sizeInBytes()).isEqualTo(8);
}

@Test
Expand Down
23 changes: 23 additions & 0 deletions api/src/test/java/org/apache/iceberg/variants/VariantTestUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,20 @@ public static void assertEqual(VariantValue expected, VariantValue actual) {
}
}

public static void assertVariantString(VariantValue actual, String expected) {
int maxShortStringLength = 64;
int expectedLength = expected.length();
assertThat(actual.type()).isEqualTo(PhysicalType.STRING);
assertThat(actual.asPrimitive().get()).isEqualTo(expected);
if (expectedLength < maxShortStringLength) {
assertThat(actual.getClass()).isEqualTo(SerializedShortString.class);
assertThat(actual.asPrimitive().sizeInBytes()).isEqualTo(1 + expectedLength);
} else {
assertThat(actual.getClass()).isEqualTo(SerializedPrimitive.class);
assertThat(actual.asPrimitive().sizeInBytes()).isEqualTo(5 + expectedLength);
}
}

private static byte primitiveHeader(int primitiveType) {
return (byte) (primitiveType << 2);
}
Expand Down Expand Up @@ -107,6 +121,15 @@ static SerializedPrimitive createString(String string) {
return SerializedPrimitive.from(buffer, buffer.get(0));
}

/** Creates a short string primitive of max 63 chars to use only 1 header */
static SerializedShortString createShortString(String string) {
byte[] utf8 = string.getBytes(StandardCharsets.UTF_8);
ByteBuffer buffer = ByteBuffer.allocate(1 + utf8.length).order(ByteOrder.LITTLE_ENDIAN);
buffer.put(0, VariantUtil.shortStringHeader(utf8.length));
writeBufferAbsolute(buffer, 1, ByteBuffer.wrap(utf8));
return SerializedShortString.from(buffer, buffer.get(0));
}

public static ByteBuffer variantBuffer(Map<String, VariantValue> data) {
ByteBuffer meta = VariantTestUtil.createMetadata(data.keySet(), true /* sort names */);
ByteBuffer value = VariantTestUtil.createObject(meta, data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class PrimitiveWrapper<T> implements VariantPrimitive<T> {
private static final byte TIMESTAMPNTZ_NANOS_HEADER =
VariantUtil.primitiveHeader(Primitives.TYPE_TIMESTAMPNTZ_NANOS);
private static final byte UUID_HEADER = VariantUtil.primitiveHeader(Primitives.TYPE_UUID);
private static final int MAX_SHORT_STRING_LENGTH = 64;

private final PhysicalType type;
private final T value;
Expand Down Expand Up @@ -114,7 +115,9 @@ public int sizeInBytes() {
if (null == buffer) {
this.buffer = ByteBuffer.wrap(((String) value).getBytes(StandardCharsets.UTF_8));
}

if (buffer.remaining() < MAX_SHORT_STRING_LENGTH) {
return 1 + buffer.remaining(); // 1 header + value length
}
return 5 + buffer.remaining(); // 1 header + 4 length + value length
case UUID:
return 1 + 16; // 1 header + 16 length
Expand Down Expand Up @@ -208,15 +211,19 @@ public int writeTo(ByteBuffer outBuffer, int offset) {
VariantUtil.writeBufferAbsolute(outBuffer, offset + 5, binary);
return 5 + binary.remaining();
case STRING:
// TODO: use short string when possible
if (null == buffer) {
this.buffer = ByteBuffer.wrap(((String) value).getBytes(StandardCharsets.UTF_8));
}

outBuffer.put(offset, STRING_HEADER);
outBuffer.putInt(offset + 1, buffer.remaining());
VariantUtil.writeBufferAbsolute(outBuffer, offset + 5, buffer);
return 5 + buffer.remaining();
if (buffer.remaining() < MAX_SHORT_STRING_LENGTH) {
outBuffer.put(offset, VariantUtil.shortStringHeader(buffer.remaining()));
VariantUtil.writeBufferAbsolute(outBuffer, offset + 1, buffer);
return 1 + buffer.remaining();
} else {
outBuffer.put(offset, STRING_HEADER);
outBuffer.putInt(offset + 1, buffer.remaining());
VariantUtil.writeBufferAbsolute(outBuffer, offset + 5, buffer);
return 5 + buffer.remaining();
}
case TIME:
outBuffer.put(offset, TIME_HEADER);
outBuffer.putLong(offset + 1, (Long) value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Random;
import org.apache.iceberg.util.RandomUtil;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.FieldSource;

Expand Down Expand Up @@ -57,7 +59,10 @@ public class TestPrimitiveWrapper {
Variants.of(new BigDecimal("9876543210.123456789")), // decimal16
Variants.of(new BigDecimal("-9876543210.123456789")), // decimal16
Variants.of(ByteBuffer.wrap(new byte[] {0x0a, 0x0b, 0x0c, 0x0d})),
Variants.of("iceberg"),
Variants.of(
"icebergicebergicebergicebergicebergicebergicebergicebergiceberg"), // short string of
// 63 (9*7) chars
Variants.of(RandomUtil.generateString(64, new Random(1))), // string of 64 chars
};

@ParameterizedTest
Expand Down
Loading