namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Options controlling how <see cref="ShredSchemaInferer"/> infers a shredding schema.
    /// </summary>
    public sealed class ShredOptions
    {
        private double _minFieldFrequency = 0.5;
        private double _minTypeConsistency = 0.8;

        /// <summary>
        /// Maximum nesting depth for shredded objects and arrays.
        /// 0 means only top-level fields are shredded.
        /// Default is 3.
        /// </summary>
        public int MaxDepth { get; set; } = 3;

        /// <summary>
        /// Minimum fraction of values (0.0–1.0) in which a field must appear
        /// to be considered for shredding. Fields appearing less frequently
        /// than this threshold are left in the binary residual.
        /// Default is 0.5 (50%).
        /// </summary>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// The assigned value is NaN or lies outside the range 0.0–1.0.
        /// </exception>
        public double MinFieldFrequency
        {
            get => _minFieldFrequency;
            set => _minFieldFrequency = ValidateFraction(value, nameof(MinFieldFrequency));
        }

        /// <summary>
        /// Minimum fraction of non-null values (0.0–1.0) for a field that must
        /// share the same type for the field to be shredded as a typed column.
        /// If the type consistency is below this threshold, the field gets a
        /// <see cref="ShredType.None"/> schema (binary-only).
        /// Default is 0.8 (80%).
        /// </summary>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// The assigned value is NaN or lies outside the range 0.0–1.0.
        /// </exception>
        public double MinTypeConsistency
        {
            get => _minTypeConsistency;
            set => _minTypeConsistency = ValidateFraction(value, nameof(MinTypeConsistency));
        }

        /// <summary>
        /// Default options. A fresh instance is returned on every access, so
        /// mutating the result never affects other callers.
        /// </summary>
        public static ShredOptions Default => new ShredOptions();

        /// <summary>
        /// Returns <paramref name="value"/> when it is a valid fraction, otherwise throws.
        /// </summary>
        private static double ValidateFraction(double value, string propertyName)
        {
            // Written as a negated range test so NaN (for which every comparison
            // is false) is rejected together with out-of-range numbers.
            if (!(value >= 0.0 && value <= 1.0))
            {
                throw new System.ArgumentOutOfRangeException(
                    propertyName,
                    value,
                    "Value must be a fraction between 0.0 and 1.0 inclusive.");
            }
            return value;
        }
    }
}
namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Outcome of shredding one variant value, expressed as a (value, typed_value) pair.
    /// <para>
    /// The four combinations follow the Parquet variant shredding encoding matrix:
    /// </para>
    /// <list type="bullet">
    /// <item><description>Both null → missing (only valid for object sub-fields)</description></item>
    /// <item><description>value non-null, typed_value null → unshredded (value in binary)</description></item>
    /// <item><description>value null, typed_value non-null → fully shredded into typed column</description></item>
    /// <item><description>Both non-null → partially shredded object (typed_value has shredded fields, value has residual)</description></item>
    /// </list>
    /// </summary>
    public sealed class ShredResult
    {
        /// <summary>Shared instance for a missing field (both members null).</summary>
        public static readonly ShredResult Missing = new ShredResult(null, null);

        /// <summary>Creates a shred result from its residual bytes and typed value.</summary>
        /// <param name="value">Residual variant bytes, or null.</param>
        /// <param name="typedValue">Typed value, or null.</param>
        public ShredResult(byte[] value, object typedValue)
        {
            TypedValue = typedValue;
            Value = value;
        }

        /// <summary>
        /// Residual variant value bytes. They reference column-level metadata and are
        /// NOT self-contained. Non-null when the value (or part of it) could not be
        /// shredded into the typed column; for a partially shredded object this holds
        /// only the unshredded fields.
        /// </summary>
        public byte[] Value { get; }

        /// <summary>
        /// The value extracted according to the schema. Its runtime type depends on
        /// the schema's <see cref="ShredType"/>:
        /// <list type="bullet">
        /// <item><description>Primitives: the corresponding CLR type (bool, int, long, double, string, etc.)</description></item>
        /// <item><description>Object: <see cref="ShredObjectResult"/></description></item>
        /// <item><description>Array: <see cref="ShredArrayResult"/></description></item>
        /// </list>
        /// Null when the value does not match the schema type (falls back to binary).
        /// </summary>
        public object TypedValue { get; }

        /// <summary>
        /// True when neither <see cref="Value"/> nor <see cref="TypedValue"/> is set,
        /// meaning the field is missing (only valid for object sub-fields).
        /// </summary>
        public bool IsMissing => !(Value != null || TypedValue != null);
    }

    /// <summary>
    /// The typed_value payload of a shredded object: one <see cref="ShredResult"/>
    /// per field declared in the object's schema.
    /// </summary>
    public sealed class ShredObjectResult
    {
        /// <summary>Creates a shredded object result wrapping the given field map.</summary>
        public ShredObjectResult(IReadOnlyDictionary<string, ShredResult> fields) => Fields = fields;

        /// <summary>
        /// Shredded fields keyed by field name; each entry is that field's
        /// (value, typed_value) pair.
        /// </summary>
        public IReadOnlyDictionary<string, ShredResult> Fields { get; }
    }

    /// <summary>
    /// The typed_value payload of a shredded array: one <see cref="ShredResult"/>
    /// per element of the source array.
    /// </summary>
    public sealed class ShredArrayResult
    {
        /// <summary>Creates a shredded array result wrapping the given element list.</summary>
        public ShredArrayResult(IReadOnlyList<ShredResult> elements) => Elements = elements;

        /// <summary>
        /// Shredded elements; each entry is that element's (value, typed_value) pair.
        /// Array elements are never missing — null elements are encoded as variant
        /// null in the value column.
        /// </summary>
        public IReadOnlyList<ShredResult> Elements { get; }
    }
}
namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Describes the shredding schema for a variant column — which fields
    /// to extract into typed Parquet columns and at what types.
    /// </summary>
    public sealed class ShredSchema
    {
        /// <summary>
        /// The type of the typed_value column. For primitives, this is the
        /// expected scalar type. For objects, use <see cref="ShredType.Object"/>
        /// and populate <see cref="ObjectFields"/>. For arrays, use <see cref="ShredType.Array"/>
        /// and populate <see cref="ArrayElement"/>.
        /// <see cref="ShredType.None"/> means no typed_value — everything goes to binary value.
        /// </summary>
        public ShredType TypedValueType { get; }

        /// <summary>
        /// For <see cref="ShredType.Object"/>: the shredding schemas for each named sub-field.
        /// Null for non-object types.
        /// </summary>
        public IReadOnlyDictionary<string, ShredSchema> ObjectFields { get; }

        /// <summary>
        /// For <see cref="ShredType.Array"/>: the shredding schema applied to each element.
        /// Null for non-array types.
        /// </summary>
        public ShredSchema ArrayElement { get; }

        private ShredSchema(ShredType typedValueType, IReadOnlyDictionary<string, ShredSchema> objectFields, ShredSchema arrayElement)
        {
            TypedValueType = typedValueType;
            ObjectFields = objectFields;
            ArrayElement = arrayElement;
        }

        /// <summary>Creates a schema that does no shredding (all values go to binary).</summary>
        public static ShredSchema Unshredded() => new ShredSchema(ShredType.None, null, null);

        /// <summary>
        /// Creates a schema that shreds values into a typed primitive column.
        /// Values not matching this type fall back to the binary value column.
        /// </summary>
        /// <exception cref="ArgumentException">
        /// <paramref name="type"/> is None, Object or Array, which have dedicated factories.
        /// </exception>
        public static ShredSchema Primitive(ShredType type)
        {
            if (type == ShredType.None || type == ShredType.Object || type == ShredType.Array)
            {
                throw new ArgumentException($"Use the appropriate factory method for {type}.", nameof(type));
            }
            return new ShredSchema(type, null, null);
        }

        /// <summary>
        /// Creates a schema that shreds object values by extracting named fields
        /// into typed sub-columns. The dictionary is copied, so later changes to
        /// <paramref name="fields"/> do not affect the schema.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="fields"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// A field maps to a null schema. Use <see cref="Unshredded"/> for a field
        /// that should stay binary-only.
        /// </exception>
        public static ShredSchema ForObject(IDictionary<string, ShredSchema> fields)
        {
            if (fields == null) throw new ArgumentNullException(nameof(fields));

            Dictionary<string, ShredSchema> copy = new Dictionary<string, ShredSchema>(fields.Count);
            foreach (KeyValuePair<string, ShredSchema> entry in fields)
            {
                // Reject null schemas here; otherwise the failure surfaces only later,
                // when a reader dereferences the field schema.
                if (entry.Value == null)
                {
                    throw new ArgumentException(
                        $"Field '{entry.Key}' has a null schema; use ShredSchema.Unshredded() for binary-only fields.",
                        nameof(fields));
                }
                // Add (not the indexer) keeps the copy-constructor behaviour of throwing
                // if the source's comparer allowed keys that collide under the default one.
                copy.Add(entry.Key, entry.Value);
            }
            return new ShredSchema(ShredType.Object, copy, null);
        }

        /// <summary>
        /// Creates a schema that shreds array values by applying the element
        /// schema to each element.
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="elementSchema"/> is null.</exception>
        public static ShredSchema ForArray(ShredSchema elementSchema)
        {
            if (elementSchema == null) throw new ArgumentNullException(nameof(elementSchema));
            return new ShredSchema(ShredType.Array, null, elementSchema);
        }

        /// <summary>
        /// Maps a <see cref="VariantPrimitiveType"/> to the corresponding <see cref="ShredType"/>.
        /// Primitive types without a shredded counterpart map to <see cref="ShredType.None"/>.
        /// </summary>
        public static ShredType ShredTypeFromPrimitive(VariantPrimitiveType primitiveType)
        {
            switch (primitiveType)
            {
                case VariantPrimitiveType.BooleanTrue:
                case VariantPrimitiveType.BooleanFalse:
                    return ShredType.Boolean;
                case VariantPrimitiveType.Int8: return ShredType.Int8;
                case VariantPrimitiveType.Int16: return ShredType.Int16;
                case VariantPrimitiveType.Int32: return ShredType.Int32;
                case VariantPrimitiveType.Int64: return ShredType.Int64;
                case VariantPrimitiveType.Float: return ShredType.Float;
                case VariantPrimitiveType.Double: return ShredType.Double;
                case VariantPrimitiveType.Decimal4: return ShredType.Decimal4;
                case VariantPrimitiveType.Decimal8: return ShredType.Decimal8;
                case VariantPrimitiveType.Decimal16: return ShredType.Decimal16;
                case VariantPrimitiveType.Date: return ShredType.Date;
                case VariantPrimitiveType.Timestamp: return ShredType.Timestamp;
                case VariantPrimitiveType.TimestampNtz: return ShredType.TimestampNtz;
                case VariantPrimitiveType.TimeNtz: return ShredType.TimeNtz;
                case VariantPrimitiveType.TimestampTzNanos: return ShredType.TimestampTzNanos;
                case VariantPrimitiveType.TimestampNtzNanos: return ShredType.TimestampNtzNanos;
                case VariantPrimitiveType.String: return ShredType.String;
                case VariantPrimitiveType.Binary: return ShredType.Binary;
                case VariantPrimitiveType.Uuid: return ShredType.Uuid;
                default: return ShredType.None;
            }
        }

        /// <summary>
        /// Derives a <see cref="ShredSchema"/> from the Arrow type of a typed_value column.
        /// </summary>
        /// <param name="typedValueType">
        /// The typed_value Arrow type, or null for a fully unshredded column.
        /// </param>
        /// <returns>A <see cref="ShredSchema"/> describing the shredding.</returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="typedValueType"/> is not a valid shredded type
        /// per the Parquet variant shredding spec (for example, an unsigned integer or
        /// a fixed-size binary that isn't UUID).
        /// </exception>
        public static ShredSchema FromArrowType(IArrowType typedValueType)
        {
            if (typedValueType == null) return Unshredded();
            return MapArrowType(typedValueType);
        }

        /// <summary>
        /// Translates one Arrow type into a schema, recursing through list and struct
        /// types. Case order matters: see the decimal/UUID note below.
        /// </summary>
        private static ShredSchema MapArrowType(IArrowType type)
        {
            switch (type)
            {
                case BooleanType _: return Primitive(ShredType.Boolean);
                case Int8Type _: return Primitive(ShredType.Int8);
                case Int16Type _: return Primitive(ShredType.Int16);
                case Int32Type _: return Primitive(ShredType.Int32);
                case Int64Type _: return Primitive(ShredType.Int64);
                case FloatType _: return Primitive(ShredType.Float);
                case DoubleType _: return Primitive(ShredType.Double);
                case StringType _: return Primitive(ShredType.String);
                case BinaryType _: return Primitive(ShredType.Binary);
                case LargeBinaryType _: return Primitive(ShredType.Binary);
                case LargeStringType _: return Primitive(ShredType.String);
                case Date32Type _: return Primitive(ShredType.Date);

                case Time64Type t when t.Unit == TimeUnit.Microsecond:
                    return Primitive(ShredType.TimeNtz);

                case TimestampType ts when ts.Unit == TimeUnit.Microsecond && ts.IsTimeZoneAware:
                    return Primitive(ShredType.Timestamp);
                case TimestampType ts when ts.Unit == TimeUnit.Microsecond && !ts.IsTimeZoneAware:
                    return Primitive(ShredType.TimestampNtz);
                case TimestampType ts when ts.Unit == TimeUnit.Nanosecond && ts.IsTimeZoneAware:
                    return Primitive(ShredType.TimestampTzNanos);
                case TimestampType ts when ts.Unit == TimeUnit.Nanosecond && !ts.IsTimeZoneAware:
                    return Primitive(ShredType.TimestampNtzNanos);

                // The Parquet variant spec allows any Arrow decimal representation
                // whose precision fits in one of the variant's decimal widths
                // (≤9 digits → 4-byte unscaled, ≤18 → 8-byte, ≤38 → 16-byte).
                // Decimal128Type extends FixedSizeBinaryType with byte_width=16, so
                // we MUST match the decimal cases before the UUID fallback below,
                // and dispatch by precision inside the cases rather than via `when`
                // guards that can fall through into the FSB(16) branch.
                case Decimal32Type d32: return MapDecimalByPrecision(d32.Precision, type);
                case Decimal64Type d64: return MapDecimalByPrecision(d64.Precision, type);
                case Decimal128Type d128: return MapDecimalByPrecision(d128.Precision, type);

                case ExtensionType ext when ext.Name == "arrow.uuid":
                    return Primitive(ShredType.Uuid);

                // When the Arrow IPC reader has no UUID extension registered, the
                // column comes through as its storage type (16-byte fixed binary).
                // Per the Parquet variant shredding spec, fixed_size_binary(16) is
                // the only valid fixed-size binary type and represents UUID.
                case FixedSizeBinaryType fsb when fsb.ByteWidth == 16:
                    return Primitive(ShredType.Uuid);

                case ListType list:
                    return MapArrayType(list);

                case StructType structType:
                    return MapObjectType(structType);

                default:
                    throw new ArgumentException(
                        $"Unsupported shredded value type: {type}",
                        nameof(type));
            }
        }

        /// <summary>Picks the decimal shred width from the declared precision.</summary>
        private static ShredSchema MapDecimalByPrecision(int precision, IArrowType type)
        {
            if (precision <= 9) return Primitive(ShredType.Decimal4);
            if (precision <= 18) return Primitive(ShredType.Decimal8);
            if (precision <= 38) return Primitive(ShredType.Decimal16);
            throw new ArgumentException(
                $"Unsupported decimal precision {precision} (max 38): {type}",
                nameof(type));
        }

        /// <summary>Builds an array schema from a list whose elements are element groups.</summary>
        private static ShredSchema MapArrayType(ListType list)
        {
            if (!(list.ValueDataType is StructType elementStruct) || !IsElementGroupStruct(elementStruct))
            {
                throw new ArgumentException(
                    "Shredded array element must be a struct with 'value' and/or 'typed_value' fields.",
                    nameof(list));
            }
            return ForArray(ParseElementGroup(elementStruct));
        }

        /// <summary>Builds an object schema from a struct whose fields are all element groups.</summary>
        private static ShredSchema MapObjectType(StructType structType)
        {
            Dictionary<string, ShredSchema> fields = new Dictionary<string, ShredSchema>(structType.Fields.Count);
            foreach (Field field in structType.Fields)
            {
                if (!(field.DataType is StructType elementGroup) || !IsElementGroupStruct(elementGroup))
                {
                    throw new ArgumentException(
                        $"Shredded object field '{field.Name}' must be a struct with 'value' and/or 'typed_value' fields.",
                        nameof(structType));
                }
                fields[field.Name] = ParseElementGroup(elementGroup);
            }
            return ForObject(fields);
        }

        /// <summary>
        /// Tests whether a struct type is a valid shredded "element group":
        /// a struct with at least one of <c>value</c> (binary) or <c>typed_value</c>,
        /// and no other fields.
        /// </summary>
        private static bool IsElementGroupStruct(StructType st)
        {
            int valueIdx = st.GetFieldIndex("value");
            int typedIdx = st.GetFieldIndex("typed_value");

            if (valueIdx < 0 && typedIdx < 0)
            {
                return false;
            }

            if (valueIdx >= 0)
            {
                IArrowType valueFieldType = st.Fields[valueIdx].DataType;
                if (!(valueFieldType is BinaryType ||
                      valueFieldType is LargeBinaryType ||
                      valueFieldType is BinaryViewType))
                {
                    return false;
                }
            }

            // Reject structs with unexpected extra fields.
            foreach (Field f in st.Fields)
            {
                if (f.Name != "value" && f.Name != "typed_value")
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Derives the schema of one element group: unshredded when it has no
        /// <c>typed_value</c> field, otherwise whatever that field's type maps to.
        /// </summary>
        private static ShredSchema ParseElementGroup(StructType elementStruct)
        {
            int typedIdx = elementStruct.GetFieldIndex("typed_value");
            if (typedIdx < 0)
            {
                return Unshredded();
            }
            return MapArrowType(elementStruct.Fields[typedIdx].DataType);
        }
    }
}
namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Examines a batch of <see cref="VariantValue"/>s and derives a
    /// <see cref="ShredSchema"/> suited to shredding them.
    /// </summary>
    public sealed class ShredSchemaInferer
    {
        /// <summary>
        /// Infers a shredding schema by analyzing the given values.
        /// </summary>
        /// <param name="values">The variant values to analyze.</param>
        /// <param name="options">Depth, frequency and type-consistency thresholds; defaults are used when null.</param>
        /// <returns>The inferred <see cref="ShredSchema"/>; unshredded when <paramref name="values"/> is empty.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="values"/> is null.</exception>
        public ShredSchema Infer(IEnumerable<VariantValue> values, ShredOptions options = null)
        {
            if (values == null) throw new ArgumentNullException(nameof(values));
            if (options == null) options = ShredOptions.Default;

            TypeStats root = new TypeStats();
            int seen = 0;
            foreach (VariantValue item in values)
            {
                CollectStats(item, root, 0, options.MaxDepth);
                seen++;
            }

            return seen == 0
                ? ShredSchema.Unshredded()
                : BuildSchema(root, seen, options, 0);
        }

        /// <summary>
        /// Records the shred type of <paramref name="value"/> and, while within the
        /// depth limit, descends into object fields and array elements.
        /// </summary>
        private void CollectStats(VariantValue value, TypeStats stats, int depth, int maxDepth)
        {
            ShredType kind = VariantShredder.GetShredType(value);
            int previous;
            if (!stats.TypeCounts.TryGetValue(kind, out previous))
            {
                previous = 0;
            }
            stats.TypeCounts[kind] = previous + 1;

            // Past the depth limit only the type tally above is kept.
            if (depth > maxDepth)
            {
                return;
            }

            switch (kind)
            {
                case ShredType.Object when value.IsObject:
                    if (stats.ObjectFieldStats == null)
                    {
                        stats.ObjectFieldStats = new Dictionary<string, TypeStats>();
                    }
                    foreach (KeyValuePair<string, VariantValue> member in value.AsObject())
                    {
                        TypeStats memberStats;
                        if (!stats.ObjectFieldStats.TryGetValue(member.Key, out memberStats))
                        {
                            memberStats = new TypeStats();
                            stats.ObjectFieldStats[member.Key] = memberStats;
                        }
                        CollectStats(member.Value, memberStats, depth + 1, maxDepth);
                    }
                    break;

                case ShredType.Array when value.IsArray:
                    if (stats.ArrayElementStats == null)
                    {
                        stats.ArrayElementStats = new TypeStats();
                    }
                    foreach (VariantValue item in value.AsArray())
                    {
                        CollectStats(item, stats.ArrayElementStats, depth + 1, maxDepth);
                    }
                    break;
            }
        }

        /// <summary>
        /// Chooses the most frequent non-None type and, if it is consistent enough,
        /// produces the matching schema; otherwise the result is unshredded.
        /// </summary>
        private ShredSchema BuildSchema(TypeStats stats, int totalCount, ShredOptions options, int depth)
        {
            ShredType winner = ShredType.None;
            int winnerCount = 0;
            int typedTotal = 0;

            foreach (KeyValuePair<ShredType, int> tally in stats.TypeCounts)
            {
                // ShredType.None observations are excluded from the consistency ratio.
                if (tally.Key == ShredType.None)
                {
                    continue;
                }
                typedTotal += tally.Value;
                if (tally.Value > winnerCount)
                {
                    winner = tally.Key;
                    winnerCount = tally.Value;
                }
            }

            if (typedTotal == 0)
            {
                return ShredSchema.Unshredded();
            }

            double consistency = (double)winnerCount / typedTotal;
            if (consistency < options.MinTypeConsistency)
            {
                return ShredSchema.Unshredded();
            }

            switch (winner)
            {
                case ShredType.Object:
                    // No sub-stats (e.g. depth limit reached) means nothing to shred further.
                    return stats.ObjectFieldStats != null
                        ? BuildObjectSchema(stats, totalCount, winnerCount, options, depth)
                        : ShredSchema.Unshredded();

                case ShredType.Array:
                    return stats.ArrayElementStats != null
                        ? BuildArraySchema(stats, winnerCount, options, depth)
                        : ShredSchema.Unshredded();

                default:
                    return ShredSchema.Primitive(winner);
            }
        }

        /// <summary>
        /// Builds an object schema from the fields that appear in at least
        /// <see cref="ShredOptions.MinFieldFrequency"/> of the observed objects.
        /// </summary>
        private ShredSchema BuildObjectSchema(TypeStats stats, int totalCount, int objectCount, ShredOptions options, int depth)
        {
            Dictionary<string, ShredSchema> kept = new Dictionary<string, ShredSchema>();

            foreach (KeyValuePair<string, TypeStats> candidate in stats.ObjectFieldStats)
            {
                // Every observation of the field counts as one appearance, whatever its type.
                int appearances = SumCounts(candidate.Value);

                double frequency = (double)appearances / objectCount;
                if (frequency < options.MinFieldFrequency)
                {
                    continue;
                }

                kept[candidate.Key] = BuildSchema(candidate.Value, appearances, options, depth + 1);
            }

            return kept.Count == 0
                ? ShredSchema.Unshredded()
                : ShredSchema.ForObject(kept);
        }

        /// <summary>
        /// Builds an array schema from the pooled element statistics; unshredded
        /// when no elements were observed.
        /// </summary>
        private ShredSchema BuildArraySchema(TypeStats stats, int arrayCount, ShredOptions options, int depth)
        {
            int elementTotal = SumCounts(stats.ArrayElementStats);
            if (elementTotal == 0)
            {
                return ShredSchema.Unshredded();
            }

            return ShredSchema.ForArray(
                BuildSchema(stats.ArrayElementStats, elementTotal, options, depth + 1));
        }

        /// <summary>Adds up every per-type observation count in <paramref name="stats"/>.</summary>
        private static int SumCounts(TypeStats stats)
        {
            int sum = 0;
            foreach (KeyValuePair<ShredType, int> tally in stats.TypeCounts)
            {
                sum += tally.Value;
            }
            return sum;
        }

        /// <summary>Per-position tallies gathered while walking the sample values.</summary>
        private sealed class TypeStats
        {
            /// <summary>Number of observations per shred type at this position.</summary>
            public Dictionary<ShredType, int> TypeCounts { get; } = new Dictionary<ShredType, int>();

            /// <summary>Statistics per object field name; null until an object is descended into.</summary>
            public Dictionary<string, TypeStats> ObjectFieldStats { get; set; }

            /// <summary>Pooled statistics over array elements; null until an array is descended into.</summary>
            public TypeStats ArrayElementStats { get; set; }
        }
    }
}
namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Identifies the type a shredded typed_value column expects. Each member
    /// names the logical Parquet type a variant value is shredded into.
    /// </summary>
    public enum ShredType : byte
    {
        /// <summary>No typed_value column — all values go to the binary value column.</summary>
        None = 0,

        /// <summary>Boolean (Parquet BOOLEAN).</summary>
        Boolean = 1,

        /// <summary>8-bit signed integer (Parquet INT32 with INT_8 annotation).</summary>
        Int8 = 2,

        /// <summary>16-bit signed integer (Parquet INT32 with INT_16 annotation).</summary>
        Int16 = 3,

        /// <summary>32-bit signed integer (Parquet INT32).</summary>
        Int32 = 4,

        /// <summary>64-bit signed integer (Parquet INT64).</summary>
        Int64 = 5,

        /// <summary>32-bit float (Parquet FLOAT).</summary>
        Float = 6,

        /// <summary>64-bit double (Parquet DOUBLE).</summary>
        Double = 7,

        /// <summary>Decimal with 4-byte unscaled value.</summary>
        Decimal4 = 8,

        /// <summary>Decimal with 8-byte unscaled value.</summary>
        Decimal8 = 9,

        /// <summary>Decimal with 16-byte unscaled value.</summary>
        Decimal16 = 10,

        /// <summary>Date as days since epoch (Parquet DATE).</summary>
        Date = 11,

        /// <summary>Timestamp with UTC microseconds (Parquet TIMESTAMP with isAdjustedToUTC=true, MICROS).</summary>
        Timestamp = 12,

        /// <summary>Timestamp without timezone, microseconds (Parquet TIMESTAMP with isAdjustedToUTC=false, MICROS).</summary>
        TimestampNtz = 13,

        /// <summary>Time without timezone, microseconds (Parquet TIME with MICROS).</summary>
        TimeNtz = 14,

        /// <summary>Timestamp with UTC nanoseconds.</summary>
        TimestampTzNanos = 15,

        /// <summary>Timestamp without timezone, nanoseconds.</summary>
        TimestampNtzNanos = 16,

        /// <summary>UTF-8 string (Parquet BINARY with STRING logical type).</summary>
        String = 17,

        /// <summary>Binary data (Parquet BINARY).</summary>
        Binary = 18,

        /// <summary>UUID (Parquet FIXED_LEN_BYTE_ARRAY(16) with UUID logical type).</summary>
        Uuid = 19,

        /// <summary>Shredded as an object group with named sub-fields.</summary>
        Object = 20,

        /// <summary>Shredded as an array (Parquet LIST).</summary>
        Array = 21,
    }
}
namespace Apache.Arrow.Operations.Shredding
{
    /// <summary>
    /// Reader for one row of a shredded-array slot. The backing storage is a
    /// list of element groups, each a {value, typed_value} struct.
    /// </summary>
    public ref struct ShreddedArray
    {
        private readonly ShredSchema _schema;
        private readonly ReadOnlySpan<byte> _metadata;
        // typed_value list whose elements are {value, typed_value} structs. May be null.
        private readonly ListArray _list;
        // Array-level residual binary column (used for unshredded arrays). May be null.
        // NOTE(review): readers below cast this to BinaryArray — confirm callers never
        // pass LargeBinary/BinaryView storage here.
        private readonly IArrowArray _residual;
        private readonly int _row;

        internal ShreddedArray(
            ShredSchema schema,
            ReadOnlySpan<byte> metadata,
            ListArray list,
            IArrowArray residual,
            int row)
        {
            _schema = schema;
            _metadata = metadata;
            _list = list;
            _residual = residual;
            _row = row;
        }

        /// <summary>
        /// True when the typed list has a non-null entry at this row, i.e. the array
        /// is stored element-by-element in the shredded column.
        /// </summary>
        public bool IsTypedList => !(_list == null || _list.IsNull(_row));

        /// <summary>
        /// Number of shredded elements at this row. Only valid when
        /// <see cref="IsTypedList"/> is true.
        /// </summary>
        /// <exception cref="InvalidOperationException">The array is stored as a residual.</exception>
        public int ElementCount
        {
            get
            {
                if (IsTypedList)
                {
                    GetRowBounds(out int first, out int end);
                    return end - first;
                }

                throw new InvalidOperationException(
                    "Array at this row is stored as a residual (not a typed list). " +
                    "Use TryGetResidualReader and iterate via VariantArrayReader.");
            }
        }

        /// <summary>
        /// Returns a reader for the element at <paramref name="index"/>. Only valid
        /// when <see cref="IsTypedList"/> is true.
        /// </summary>
        /// <exception cref="InvalidOperationException">The array is stored as a residual.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> is outside the row's element range.</exception>
        public ShreddedVariant GetElement(int index)
        {
            if (!IsTypedList)
            {
                throw new InvalidOperationException(
                    "Array at this row is stored as a residual (not a typed list).");
            }

            GetRowBounds(out int first, out int end);

            // A single unsigned comparison rejects negative and too-large indices alike.
            if ((uint)index >= (uint)(end - first))
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }

            StructArray groups = (StructArray)_list.Values;
            return ShreddingHelpers.BuildSlot(_schema.ArrayElement, _metadata, groups, first + index);
        }

        /// <summary>
        /// When the array is stored as a residual at this row (not shredded), yields a
        /// <see cref="VariantReader"/> over the residual bytes and returns true.
        /// Callers can then inspect the array via <c>VariantArrayReader</c>.
        /// </summary>
        public bool TryGetResidualReader(out VariantReader reader)
        {
            if (_residual != null && !_residual.IsNull(_row))
            {
                BinaryArray binary = (BinaryArray)_residual;
                reader = new VariantReader(_metadata, binary.GetBytes(_row, out _));
                return true;
            }

            reader = default;
            return false;
        }

        /// <summary>
        /// Materializes the array into a <see cref="VariantValue"/>. A populated typed
        /// list is rebuilt element by element; otherwise the residual binary is decoded
        /// (the array was stored unshredded for this row). When neither is populated
        /// the slot encodes a variant null.
        /// </summary>
        public VariantValue ToVariantValue()
        {
            if (IsTypedList)
            {
                GetRowBounds(out int first, out int end);

                StructArray groups = (StructArray)_list.Values;
                List<VariantValue> items = new List<VariantValue>(end - first);
                for (int pos = first; pos < end; pos++)
                {
                    // Arrays cannot contain "missing": a both-null element slot is a
                    // variant null, which ShreddedVariant.ToVariantValue already returns.
                    items.Add(
                        ShreddingHelpers.BuildSlot(_schema.ArrayElement, _metadata, groups, pos).ToVariantValue());
                }
                return VariantValue.FromArray(items);
            }

            // No typed list here: decode the residual if there is one, else variant null.
            return TryGetResidualReader(out VariantReader residual)
                ? residual.ToVariantValue()
                : VariantValue.Null;
        }

        /// <summary>Reads this row's start and end positions from the list offsets.</summary>
        private void GetRowBounds(out int first, out int end)
        {
            first = _list.ValueOffsets[_row];
            end = _list.ValueOffsets[_row + 1];
        }
    }
}
+ // May be null if this row's typed_value column is null (i.e., the whole slot is in residual). + private readonly StructArray _fields; + // The residual value at this level (a binary column holding unshredded fields). May be null. + private readonly IArrowArray _residual; + private readonly int _index; + + internal ShreddedObject( + ShredSchema schema, + ReadOnlySpan metadata, + StructArray typedValueStruct, + IArrowArray residualValue, + int index) + { + _schema = schema; + _metadata = metadata; + _fields = typedValueStruct; + _residual = residualValue; + _index = index; + } + + /// The names of the shredded fields, in schema order. + public IEnumerable FieldNames => _schema.ObjectFields.Keys; + + /// + /// Gets the shredded reader for a named field. The field must exist in the schema. + /// + /// If is not a shredded field. + public ShreddedVariant GetField(string name) + { + if (!TryGetField(name, out ShreddedVariant field)) + { + throw new KeyNotFoundException($"Field '{name}' is not in the shredded object schema."); + } + return field; + } + + /// + /// Tries to get a reader for a shredded sub-field by name. Returns false if + /// isn't a shredded field (it may still exist in the + /// residual — use to inspect). + /// + public bool TryGetField(string name, out ShreddedVariant field) + { + if (!_schema.ObjectFields.TryGetValue(name, out ShredSchema fieldSchema)) + { + field = default; + return false; + } + if (_fields == null || _fields.IsNull(_index)) + { + // typed_value is null at this row — the field is effectively missing + // from the typed column. Return a slot with no typed/residual set. 
+ field = new ShreddedVariant(fieldSchema, _metadata, null, null, _index); + return true; + } + StructType fieldsStructType = (StructType)_fields.Data.DataType; + int fieldIdx = fieldsStructType.GetFieldIndex(name); + StructArray elementGroup = (StructArray)_fields.Fields[fieldIdx]; + field = ShreddingHelpers.BuildSlot(fieldSchema, _metadata, elementGroup, _index); + return true; + } + + /// + /// If the object's residual binary is populated at this row, returns a + /// over it. The residual holds whatever fields + /// were not shredded (or, for a non-object row, the whole value). + /// + public bool TryGetResidualReader(out VariantReader reader) + { + if (_residual == null || _residual.IsNull(_index)) + { + reader = default; + return false; + } + ReadOnlySpan bytes = ((BinaryArray)_residual).GetBytes(_index, out _); + reader = new VariantReader(_metadata, bytes); + return true; + } + + /// + /// Materializes the whole shredded object into a , + /// merging typed-column fields with residual unshredded fields. When the + /// typed_value column is null at this row, the residual is returned + /// as-is (it may be any variant type, not just an object). + /// + public VariantValue ToVariantValue() + { + bool typedPopulated = _fields != null && !_fields.IsNull(_index); + bool residualPopulated = _residual != null && !_residual.IsNull(_index); + + if (!typedPopulated && !residualPopulated) + { + return VariantValue.Null; + } + + // No shredded fields at this row — whatever is in the residual IS the value. + if (!typedPopulated) + { + BinaryArray binary = (BinaryArray)_residual; + ReadOnlySpan bytes = binary.GetBytes(_index, out _); + return new VariantReader(_metadata, bytes).ToVariantValue(); + } + + Dictionary fields = new Dictionary(); + + // Shredded fields (from typed_value). 
+ StructType fieldsStructType = (StructType)_fields.Data.DataType; + foreach (KeyValuePair entry in _schema.ObjectFields) + { + int fieldIdx = fieldsStructType.GetFieldIndex(entry.Key); + StructArray elementGroup = (StructArray)_fields.Fields[fieldIdx]; + ShreddedVariant slot = ShreddingHelpers.BuildSlot(entry.Value, _metadata, elementGroup, _index); + if (!slot.IsMissing) + { + fields[entry.Key] = slot.ToVariantValue(); + } + } + + // Partially shredded object — merge residual unshredded fields. + if (residualPopulated) + { + BinaryArray residualBinary = (BinaryArray)_residual; + ReadOnlySpan residualBytes = residualBinary.GetBytes(_index, out _); + VariantReader residualReader = new VariantReader(_metadata, residualBytes); + if (!residualReader.IsObject) + { + throw new InvalidOperationException( + "Residual value for a partially shredded object must itself be a variant object."); + } + VariantValue residual = residualReader.ToVariantValue(); + foreach (KeyValuePair kv in residual.AsObject()) + { + fields[kv.Key] = kv.Value; + } + } + + return VariantValue.FromObject(fields); + } + } +} diff --git a/src/Apache.Arrow.Operations/Shredding/ShreddedVariant.cs b/src/Apache.Arrow.Operations/Shredding/ShreddedVariant.cs new file mode 100644 index 00000000..5ef35759 --- /dev/null +++ b/src/Apache.Arrow.Operations/Shredding/ShreddedVariant.cs @@ -0,0 +1,358 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Data.SqlTypes;
+using Apache.Arrow.Arrays;
+using Apache.Arrow.Scalars.Variant;
+
+namespace Apache.Arrow.Operations.Shredding
+{
+    /// <summary>
+    /// Zero-copy reader for a single row of a (possibly shredded) variant column.
+    /// Composes with the <see cref="ShredSchema"/> for this position to expose the
+    /// typed columns and residual bytes side-by-side, or to materialize the logical
+    /// value on demand.
+    /// <para>
+    /// A <see cref="ShreddedVariant"/> does not own any Arrow buffers; it is only
+    /// valid while the underlying Arrow arrays are alive.
+    /// </para>
+    /// </summary>
+    public ref struct ShreddedVariant
+    {
+        private readonly ShredSchema _schema;
+        private readonly ReadOnlySpan<byte> _metadata;
+        // _valueArray is the residual binary column at this level (may be null).
+        private readonly IArrowArray _valueArray;
+        // _typedValueArray is the typed column at this level (may be null if no shredding here).
+        private readonly IArrowArray _typedValueArray;
+        private readonly int _index;
+
+        internal ShreddedVariant(
+            ShredSchema schema,
+            ReadOnlySpan<byte> metadata,
+            IArrowArray valueArray,
+            IArrowArray typedValueArray,
+            int index)
+        {
+            _schema = schema ?? throw new ArgumentNullException(nameof(schema));
+            _metadata = metadata;
+            _valueArray = valueArray;
+            _typedValueArray = typedValueArray;
+            _index = index;
+        }
+
+        /// <summary>The schema describing how this slot is shredded.</summary>
+        public ShredSchema Schema => _schema;
+
+        /// <summary>The column-level variant metadata.</summary>
+        public ReadOnlySpan<byte> Metadata => _metadata;
+
+        /// <summary>True when the residual <c>value</c> column has a value at this index.</summary>
+        public bool HasResidual => _valueArray != null && !_valueArray.IsNull(_index);
+
+        /// <summary>True when the <c>typed_value</c> column has a value at this index.</summary>
+        public bool HasTypedValue => _typedValueArray != null && !_typedValueArray.IsNull(_index);
+
+        /// <summary>
+        /// True when neither the residual nor the typed column is populated at this index
+        /// — valid only for sub-fields of shredded objects.
+        /// </summary>
+        public bool IsMissing => !HasResidual && !HasTypedValue;
+
+        /// <summary>
+        /// Materializes this slot into a logical <see cref="VariantValue"/>, merging
+        /// typed-column values with residual bytes per the shredding spec.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">If a primitive slot has both <c>value</c> and <c>typed_value</c> populated (a missing slot yields <see cref="VariantValue.Null"/>, it does not throw).</exception>
+        public VariantValue ToVariantValue()
+        {
+            // Both-null at this slot means the logical value is variant null. (The
+            // "missing" encoding — omitting the field entirely from the output — is
+            // a choice made by the container: see ShreddedObject, which uses
+            // IsMissing to decide whether to drop a sub-field.)
+            if (IsMissing)
+            {
+                return VariantValue.Null;
+            }
+
+            switch (_schema.TypedValueType)
+            {
+                case ShredType.None:
+                    return ReadResidual();
+
+                case ShredType.Object:
+                    return GetObject().ToVariantValue();
+
+                case ShredType.Array:
+                    return GetArray().ToVariantValue();
+
+                default:
+                    // Primitive shredding. Per the Parquet variant shredding spec, a
+                    // primitive slot may have at most one of value / typed_value set.
+                    // If both are populated at the same row, the shredded data is
+                    // invalid and implementations should reject it.
+                    if (HasTypedValue)
+                    {
+                        if (HasResidual)
+                        {
+                            throw new InvalidOperationException(
+                                "Invalid shredded variant: primitive slot has both 'value' and 'typed_value' populated.");
+                        }
+                        return ReadTypedPrimitive();
+                    }
+                    return ReadResidual();
+            }
+        }
+
+        /// <summary>
+        /// If the residual column has a value at this index, returns a
+        /// <see cref="VariantReader"/> over its bytes.
+        /// </summary>
+        public bool TryGetResidualReader(out VariantReader reader)
+        {
+            if (HasResidual)
+            {
+                BinaryArray binary = (BinaryArray)_valueArray;
+                ReadOnlySpan<byte> bytes = binary.GetBytes(_index, out _);
+                reader = new VariantReader(_metadata, bytes);
+                return true;
+            }
+            reader = default;
+            return false;
+        }
+
+        /// <summary>
+        /// Reader for a shredded object at this slot. Valid only when the schema's
+        /// <see cref="ShredSchema.TypedValueType"/> is <see cref="ShredType.Object"/>.
+        /// </summary>
+        public ShreddedObject GetObject()
+        {
+            if (_schema.TypedValueType != ShredType.Object)
+            {
+                throw new InvalidOperationException(
+                    $"Slot is not shredded as an object (schema type {_schema.TypedValueType}).");
+            }
+            return new ShreddedObject(_schema, _metadata, _typedValueArray as StructArray, _valueArray, _index);
+        }
+
+        /// <summary>
+        /// Reader for a shredded array at this slot. Valid only when the schema's
+        /// <see cref="ShredSchema.TypedValueType"/> is <see cref="ShredType.Array"/>.
+        /// </summary>
+        public ShreddedArray GetArray()
+        {
+            if (_schema.TypedValueType != ShredType.Array)
+            {
+                throw new InvalidOperationException(
+                    $"Slot is not shredded as an array (schema type {_schema.TypedValueType}).");
+            }
+            return new ShreddedArray(_schema, _metadata, _typedValueArray as ListArray, _valueArray, _index);
+        }
+
+        private VariantValue ReadResidual()
+        {
+            if (!HasResidual)
+            {
+                throw new InvalidOperationException("No residual value to read.");
+            }
+            BinaryArray binary = (BinaryArray)_valueArray;
+            ReadOnlySpan<byte> bytes = binary.GetBytes(_index, out _);
+            return new VariantReader(_metadata, bytes).ToVariantValue();
+        }
+
+        // ---------------------------------------------------------------
+        // Typed-column accessors — zero-copy access to the shredded value
+        // without materializing a VariantValue.
+        //
+        // Each getter requires:
+        //   (a) the slot's schema to match the requested type, and
+        //   (b) the typed_value column to be populated at this index.
+        // Otherwise it throws. Callers that want automatic residual fallback
+        // should use ToVariantValue instead.
+        // ---------------------------------------------------------------
+
+        /// <summary>Reads the shredded boolean value at this slot.</summary>
+        public bool GetBoolean() => ((BooleanArray)RequireTyped(ShredType.Boolean)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 8-bit signed integer at this slot.</summary>
+        public sbyte GetInt8() => ((Int8Array)RequireTyped(ShredType.Int8)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 16-bit signed integer at this slot.</summary>
+        public short GetInt16() => ((Int16Array)RequireTyped(ShredType.Int16)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 32-bit signed integer at this slot.</summary>
+        public int GetInt32() => ((Int32Array)RequireTyped(ShredType.Int32)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 64-bit signed integer at this slot.</summary>
+        public long GetInt64() => ((Int64Array)RequireTyped(ShredType.Int64)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 32-bit float at this slot.</summary>
+        public float GetFloat() => ((FloatArray)RequireTyped(ShredType.Float)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded 64-bit double at this slot.</summary>
+        public double GetDouble() => ((DoubleArray)RequireTyped(ShredType.Double)).GetValue(_index).Value;
+
+        /// <summary>
+        /// Reads the shredded decimal value at this slot. Works for Decimal4, Decimal8,
+        /// and Decimal16 shred types, regardless of whether the Arrow column is backed
+        /// by Decimal32Array, Decimal64Array, or Decimal128Array.
+        /// </summary>
+        public decimal GetDecimal()
+        {
+            RequireDecimalSchema();
+            if (!HasTypedValue) ThrowNoTyped();
+            IArrowArray arr = UnwrapExtension(_typedValueArray);
+            switch (arr)
+            {
+                case Decimal32Array d32: return d32.GetValue(_index).Value;
+                case Decimal64Array d64: return d64.GetValue(_index).Value;
+                case Decimal128Array d128: return d128.GetValue(_index).Value;
+                default:
+                    throw new InvalidOperationException(
+                        $"Shredded decimal column is backed by {arr.GetType().Name}, which is not a supported decimal array type.");
+            }
+        }
+
+        /// <summary>
+        /// Reads the shredded decimal value at this slot as a <see cref="SqlDecimal"/>.
+        /// Accepts the same Decimal4/Decimal8/Decimal16 shred types and the same
+        /// Decimal32Array/Decimal64Array/Decimal128Array backings as <see cref="GetDecimal"/>.
+        /// </summary>
+        public SqlDecimal GetSqlDecimal()
+        {
+            RequireDecimalSchema();
+            if (!HasTypedValue) ThrowNoTyped();
+            IArrowArray arr = UnwrapExtension(_typedValueArray);
+            switch (arr)
+            {
+                case Decimal32Array d32: return d32.GetValue(_index).Value;
+                case Decimal64Array d64: return d64.GetValue(_index).Value;
+                case Decimal128Array d128: return d128.GetSqlDecimal(_index).Value;
+                default:
+                    throw new InvalidOperationException(
+                        $"Shredded decimal column is backed by {arr.GetType().Name}, which is not a supported decimal array type.");
+            }
+        }
+
+        /// <summary>Reads the shredded date (days since epoch) at this slot.</summary>
+        public int GetDateDays() => ((Date32Array)RequireTyped(ShredType.Date)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded timestamp (microseconds since epoch, UTC) at this slot.</summary>
+        public long GetTimestampMicros() => ((TimestampArray)RequireTyped(ShredType.Timestamp)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded timestamp-without-tz (microseconds since epoch) at this slot.</summary>
+        public long GetTimestampNtzMicros() => ((TimestampArray)RequireTyped(ShredType.TimestampNtz)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded time-without-tz (microseconds since midnight) at this slot.</summary>
+        public long GetTimeNtzMicros() => ((Time64Array)RequireTyped(ShredType.TimeNtz)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded timestamp-with-tz (nanoseconds since epoch) at this slot.</summary>
+        public long GetTimestampTzNanos() => ((TimestampArray)RequireTyped(ShredType.TimestampTzNanos)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded timestamp-without-tz (nanoseconds since epoch) at this slot.</summary>
+        public long GetTimestampNtzNanos() => ((TimestampArray)RequireTyped(ShredType.TimestampNtzNanos)).GetValue(_index).Value;
+
+        /// <summary>Reads the shredded string value at this slot.</summary>
+        public string GetString() => ((StringArray)RequireTyped(ShredType.String)).GetString(_index);
+
+        /// <summary>Reads the shredded binary value at this slot as a byte span.</summary>
+        public ReadOnlySpan<byte> GetBinaryBytes() => ((BinaryArray)RequireTyped(ShredType.Binary)).GetBytes(_index);
+
+        /// <summary>Reads the shredded UUID at this slot.</summary>
+        public Guid GetUuid()
+        {
+            FixedSizeBinaryArray arr = (FixedSizeBinaryArray)RequireTyped(ShredType.Uuid);
+            ReadOnlySpan<byte> raw = arr.GetBytes(_index);
+#if NET8_0_OR_GREATER
+            return new Guid(raw, bigEndian: true);
+#else
+            byte[] bytes = new byte[16];
+            bytes[0] = raw[3]; bytes[1] = raw[2]; bytes[2] = raw[1]; bytes[3] = raw[0];
+            bytes[4] = raw[5]; bytes[5] = raw[4];
+            bytes[6] = raw[7]; bytes[7] = raw[6];
+            raw.Slice(8, 8).CopyTo(bytes.AsSpan(8));
+            return new Guid(bytes);
+#endif
+        }
+
+        /// <summary>Reads the shredded UUID at this slot as raw big-endian (RFC 4122) bytes.</summary>
+        public ReadOnlySpan<byte> GetUuidBytes()
+            => ((FixedSizeBinaryArray)RequireTyped(ShredType.Uuid)).GetBytes(_index);
+
+        // ---------------------------------------------------------------
+        // Primitive dispatch for internal materialization. Delegates to the
+        // typed getters so the two paths stay in sync.
+        // ---------------------------------------------------------------
+
+        private VariantValue ReadTypedPrimitive()
+        {
+            switch (_schema.TypedValueType)
+            {
+                case ShredType.Boolean: return VariantValue.FromBoolean(GetBoolean());
+                case ShredType.Int8: return VariantValue.FromInt8(GetInt8());
+                case ShredType.Int16: return VariantValue.FromInt16(GetInt16());
+                case ShredType.Int32: return VariantValue.FromInt32(GetInt32());
+                case ShredType.Int64: return VariantValue.FromInt64(GetInt64());
+                case ShredType.Float: return VariantValue.FromFloat(GetFloat());
+                case ShredType.Double: return VariantValue.FromDouble(GetDouble());
+                case ShredType.Decimal4: return VariantValue.FromDecimal4(GetDecimal());
+                case ShredType.Decimal8: return VariantValue.FromDecimal8(GetDecimal());
+                case ShredType.Decimal16: return VariantValue.FromDecimal16(GetSqlDecimal());
+                case ShredType.Date: return VariantValue.FromDate(GetDateDays());
+                case ShredType.Timestamp: return VariantValue.FromTimestamp(GetTimestampMicros());
+                case ShredType.TimestampNtz: return VariantValue.FromTimestampNtz(GetTimestampNtzMicros());
+                case ShredType.TimeNtz: return VariantValue.FromTimeNtz(GetTimeNtzMicros());
+                case ShredType.TimestampTzNanos: return VariantValue.FromTimestampTzNanos(GetTimestampTzNanos());
+                case ShredType.TimestampNtzNanos: return VariantValue.FromTimestampNtzNanos(GetTimestampNtzNanos());
+                case ShredType.String: return VariantValue.FromString(GetString());
+                case ShredType.Binary: return VariantValue.FromBinary(GetBinaryBytes().ToArray());
+                case ShredType.Uuid: return VariantValue.FromUuid(GetUuid());
+                default:
+                    throw new InvalidOperationException(
+                        $"Unexpected primitive shred type {_schema.TypedValueType}.");
+            }
+        }
+
+        private IArrowArray RequireTyped(ShredType expected)
+        {
+            if (_schema.TypedValueType != expected)
+            {
+                throw new InvalidOperationException(
+                    $"Slot schema is {_schema.TypedValueType}, not {expected}.");
+            }
+            if (!HasTypedValue) ThrowNoTyped();
+            return UnwrapExtension(_typedValueArray);
+        }
+
+        private void RequireDecimalSchema()
+        {
+            if (_schema.TypedValueType != ShredType.Decimal4 &&
+                _schema.TypedValueType != ShredType.Decimal8 &&
+                _schema.TypedValueType != ShredType.Decimal16)
+            {
+                throw new InvalidOperationException(
+                    $"Slot schema is {_schema.TypedValueType}, not a decimal type.");
+            }
+        }
+
+        private void ThrowNoTyped() =>
+            throw new InvalidOperationException(
+                "No typed_value at this index (check HasTypedValue first, or use ToVariantValue for residual fallback).");
+
+        private static IArrowArray UnwrapExtension(IArrowArray arr) =>
+            arr is ExtensionArray ext ? ext.Storage : arr;
+    }
+}
diff --git a/src/Apache.Arrow.Operations/Shredding/ShreddedVariantArrayBuilder.cs b/src/Apache.Arrow.Operations/Shredding/ShreddedVariantArrayBuilder.cs
new file mode 100644
index 00000000..44b6de59
--- /dev/null
+++ b/src/Apache.Arrow.Operations/Shredding/ShreddedVariantArrayBuilder.cs
@@ -0,0 +1,513 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using Apache.Arrow;
+using Apache.Arrow.Arrays;
+using Apache.Arrow.Memory;
+using Apache.Arrow.Scalars.Variant;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.Operations.Shredding
+{
+    /// <summary>
+    /// Assembles a shredded <see cref="VariantArray"/> from pre-shredded rows.
+    /// Produces an Arrow struct with shared <c>metadata</c>, residual <c>value</c>,
+    /// and the <c>typed_value</c> tree whose Arrow shape matches the <see cref="ShredSchema"/>.
+    /// </summary>
+    public static class ShreddedVariantArrayBuilder
+    {
+        /// <summary>
+        /// Builds a shredded <see cref="VariantArray"/> from the output of
+        /// the shredding pass (one <see cref="ShredResult"/> per row).
+        /// </summary>
+        /// <param name="schema">The shredding schema applied to each row.</param>
+        /// <param name="metadata">The column-level variant metadata (shared across rows).</param>
+        /// <param name="rows">Per-row shred results whose residual bytes reference <paramref name="metadata"/>.</param>
+        /// <param name="allocator">Arrow memory allocator, or default if null.</param>
+        public static VariantArray Build(
+            ShredSchema schema,
+            byte[] metadata,
+            IReadOnlyList<ShredResult> rows,
+            MemoryAllocator allocator = null)
+        {
+            if (schema == null) throw new ArgumentNullException(nameof(schema));
+            if (metadata == null) throw new ArgumentNullException(nameof(metadata));
+            if (rows == null) throw new ArgumentNullException(nameof(rows));
+
+            int rowCount = rows.Count;
+
+            // metadata column: emit the shared bytes once per row. (A dictionary-encoded
+            // or run-end-encoded representation would compress this; VariantArray's reader
+            // already handles those, but for simplicity we emit the plain binary form.)
+            BinaryArray.Builder metadataBuilder = new BinaryArray.Builder();
+            for (int i = 0; i < rowCount; i++)
+            {
+                metadataBuilder.Append((ReadOnlySpan<byte>)metadata);
+            }
+            BinaryArray metadataArr = metadataBuilder.Build(allocator);
+
+            // value column: residual bytes (or null).
+            BinaryArray valueArr = BuildBinaryColumn(rows, allocator);
+
+            // typed_value column (if the schema has one).
+            List<Field> fields = new List<Field>
+            {
+                new Field("metadata", BinaryType.Default, false),
+                new Field("value", BinaryType.Default, true),
+            };
+            List<IArrowArray> children = new List<IArrowArray> { metadataArr, valueArr };
+
+            if (schema.TypedValueType != ShredType.None)
+            {
+                List<object> typedValues = new List<object>(rowCount);
+                for (int i = 0; i < rowCount; i++) typedValues.Add(rows[i].TypedValue);
+                IArrowArray typedValueArr = BuildTypedValueArray(schema, typedValues, allocator);
+                fields.Add(new Field("typed_value", typedValueArr.Data.DataType, true));
+                children.Add(typedValueArr);
+            }
+
+            StructType structType = new StructType(fields);
+            StructArray structArr = new StructArray(
+                structType, rowCount, children, ArrowBuffer.Empty, nullCount: 0);
+            // The public VariantArray(IArrowArray) constructor infers the VariantType
+            // from the struct's shape (including detecting the shredded layout).
+            return new VariantArray(structArr);
+        }
+
+        // ---------------------------------------------------------------
+        // Recursive builders
+        // ---------------------------------------------------------------
+
+        private static BinaryArray BuildBinaryColumn(IReadOnlyList<ShredResult> rows, MemoryAllocator allocator)
+        {
+            BinaryArray.Builder b = new BinaryArray.Builder();
+            foreach (ShredResult r in rows)
+            {
+                if (r.Value == null) b.AppendNull();
+                else b.Append((ReadOnlySpan<byte>)r.Value);
+            }
+            return b.Build(allocator);
+        }
+
+        private static IArrowArray BuildTypedValueArray(
+            ShredSchema schema,
+            IList<object> typedValues,
+            MemoryAllocator allocator)
+        {
+            switch (schema.TypedValueType)
+            {
+                case ShredType.Object: return BuildObjectTyped(schema, typedValues, allocator);
+                case ShredType.Array: return BuildArrayTyped(schema, typedValues, allocator);
+                default: return BuildPrimitiveTyped(schema.TypedValueType, typedValues, allocator);
+            }
+        }
+
+        private static StructArray BuildObjectTyped(
+            ShredSchema schema,
+            IList<object> typedValues,
+            MemoryAllocator allocator)
+        {
+            int rowCount = typedValues.Count;
+            List<Field> fieldDefs = new List<Field>(schema.ObjectFields.Count);
+            List<IArrowArray> fieldArrays = new List<IArrowArray>(schema.ObjectFields.Count);
+
+            foreach (KeyValuePair<string, ShredSchema> entry in schema.ObjectFields)
+            {
+                List<ShredResult> fieldShreds = new List<ShredResult>(rowCount);
+                foreach (object tv in typedValues)
+                {
+                    if (tv is ShredObjectResult obj &&
+                        obj.Fields.TryGetValue(entry.Key, out ShredResult r))
+                    {
+                        fieldShreds.Add(r);
+                    }
+                    else
+                    {
+                        fieldShreds.Add(ShredResult.Missing);
+                    }
+                }
+                StructArray elementGroup = BuildElementGroupArray(entry.Value, fieldShreds, allocator);
+                fieldArrays.Add(elementGroup);
+                fieldDefs.Add(new Field(entry.Key, elementGroup.Data.DataType, true));
+            }
+
+            ArrowBuffer nullBitmap = BuildNullBitmap(typedValues, v => v != null, rowCount,
+                allocator, out int nullCount);
+            StructType structType = new StructType(fieldDefs);
+            return new StructArray(structType, rowCount, fieldArrays, nullBitmap, nullCount);
+        }
+
+        private static ListArray BuildArrayTyped(
+            ShredSchema schema,
+            IList<object> typedValues,
+            MemoryAllocator allocator)
+        {
+            int rowCount = typedValues.Count;
+            List<ShredResult> flatElements = new List<ShredResult>();
+            ArrowBuffer.Builder<int> offsets = new ArrowBuffer.Builder<int>();
+            offsets.Append(0);
+            ArrowBuffer.BitmapBuilder validity = new ArrowBuffer.BitmapBuilder();
+            int nullCount = 0;
+
+            foreach (object tv in typedValues)
+            {
+                if (tv is ShredArrayResult arr)
+                {
+                    foreach (ShredResult e in arr.Elements) flatElements.Add(e);
+                    validity.Append(true);
+                }
+                else
+                {
+                    validity.Append(false);
+                    nullCount++;
+                }
+                offsets.Append(flatElements.Count);
+            }
+
+            StructArray elementGroup = BuildElementGroupArray(schema.ArrayElement, flatElements, allocator);
+            Field elementField = new Field("element", elementGroup.Data.DataType, true);
+            ListType listType = new ListType(elementField);
+            ArrowBuffer nullBitmap = nullCount > 0 ? validity.Build(allocator) : ArrowBuffer.Empty;
+            return new ListArray(listType, rowCount, offsets.Build(allocator), elementGroup, nullBitmap, nullCount);
+        }
+
+        /// <summary>
+        /// Builds a <c>{value?, typed_value?}</c> element group. The <c>value</c>
+        /// sub-field is always emitted; <c>typed_value</c> is emitted only when the
+        /// schema's typed type is not <see cref="ShredType.None"/>.
+        /// </summary>
+        private static StructArray BuildElementGroupArray(
+            ShredSchema schema,
+            IReadOnlyList<ShredResult> results,
+            MemoryAllocator allocator)
+        {
+            int rowCount = results.Count;
+            List<Field> fieldDefs = new List<Field>(2);
+            List<IArrowArray> children = new List<IArrowArray>(2);
+
+            // value column.
+            BinaryArray valueArr = BuildBinaryColumn(results, allocator);
+            fieldDefs.Add(new Field("value", BinaryType.Default, true));
+            children.Add(valueArr);
+
+            // typed_value column (only when schema has one).
+            if (schema.TypedValueType != ShredType.None)
+            {
+                List<object> typedValues = new List<object>(rowCount);
+                foreach (ShredResult r in results) typedValues.Add(r.TypedValue);
+                IArrowArray typedArr = BuildTypedValueArray(schema, typedValues, allocator);
+                fieldDefs.Add(new Field("typed_value", typedArr.Data.DataType, true));
+                children.Add(typedArr);
+            }
+
+            // Outer slot validity: non-null iff the slot isn't "missing" (both columns null).
+            ArrowBuffer.BitmapBuilder validity = new ArrowBuffer.BitmapBuilder();
+            int nullCount = 0;
+            foreach (ShredResult r in results)
+            {
+                if (r.IsMissing) { validity.Append(false); nullCount++; }
+                else validity.Append(true);
+            }
+            ArrowBuffer nullBitmap = nullCount > 0 ? validity.Build(allocator) : ArrowBuffer.Empty;
+
+            StructType structType = new StructType(fieldDefs);
+            return new StructArray(structType, rowCount, children, nullBitmap, nullCount);
+        }
+
+        // ---------------------------------------------------------------
+        // Primitive typed-value builders
+        // ---------------------------------------------------------------
+
+        private static IArrowArray BuildPrimitiveTyped(
+            ShredType shredType,
+            IList<object> typedValues,
+            MemoryAllocator allocator)
+        {
+            switch (shredType)
+            {
+                case ShredType.Boolean:
+                    {
+                        BooleanArray.Builder b = new BooleanArray.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((bool)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Int8:
+                    {
+                        Int8Array.Builder b = new Int8Array.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((sbyte)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Int16:
+                    {
+                        Int16Array.Builder b = new Int16Array.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((short)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Int32:
+                    {
+                        Int32Array.Builder b = new Int32Array.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((int)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Int64:
+                    {
+                        Int64Array.Builder b = new Int64Array.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((long)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Float:
+                    {
+                        FloatArray.Builder b = new FloatArray.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((float)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Double:
+                    {
+                        DoubleArray.Builder b = new DoubleArray.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((double)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Decimal4:
+                case ShredType.Decimal8:
+                case ShredType.Decimal16:
+                    return BuildDecimalArray(shredType, typedValues, allocator);
+                case ShredType.Date:
+                    return BuildDate32(typedValues, allocator);
+                case ShredType.Timestamp:
+                    return BuildTimestamp(typedValues, TimeUnit.Microsecond, "UTC", allocator);
+                case ShredType.TimestampNtz:
+                    return BuildTimestamp(typedValues, TimeUnit.Microsecond, null, allocator);
+                case ShredType.TimestampTzNanos:
+                    return BuildTimestamp(typedValues, TimeUnit.Nanosecond, "UTC", allocator);
+                case ShredType.TimestampNtzNanos:
+                    return BuildTimestamp(typedValues, TimeUnit.Nanosecond, null, allocator);
+                case ShredType.TimeNtz:
+                    return BuildTime64(typedValues, allocator);
+                case ShredType.String:
+                    {
+                        StringArray.Builder b = new StringArray.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((string)v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Binary:
+                    {
+                        BinaryArray.Builder b = new BinaryArray.Builder();
+                        foreach (object v in typedValues)
+                        {
+                            if (v == null) b.AppendNull(); else b.Append((ReadOnlySpan<byte>)(byte[])v);
+                        }
+                        return b.Build(allocator);
+                    }
+                case ShredType.Uuid:
+                    return BuildUuidArray(typedValues, allocator);
+                default:
+                    throw new NotSupportedException($"Cannot build typed column for ShredType.{shredType}.");
+            }
+        }
+
+        private static TimestampArray BuildTimestamp(
+            IList<object> typedValues, TimeUnit unit, string timezone, MemoryAllocator allocator)
+        {
+            TimestampType type = new TimestampType(unit, timezone);
+            (ArrowBuffer values, ArrowBuffer nullBitmap, int nullCount) =
+                BuildLongBuffers(typedValues, allocator);
+            return new TimestampArray(type, values, nullBitmap, typedValues.Count, nullCount, offset: 0);
+        }
+
+        private static Date32Array BuildDate32(IList<object> typedValues, MemoryAllocator allocator)
+        {
+            (ArrowBuffer values, ArrowBuffer nullBitmap, int nullCount) =
+                BuildIntBuffers(typedValues, allocator);
+            return new Date32Array(values, nullBitmap, typedValues.Count, nullCount, offset: 0);
+        }
+
+        private static Time64Array BuildTime64(IList<object> typedValues, MemoryAllocator allocator)
+        {
+            Time64Type type = new Time64Type(TimeUnit.Microsecond);
+            (ArrowBuffer values, ArrowBuffer nullBitmap, int nullCount) =
+                BuildLongBuffers(typedValues, allocator);
+            return new Time64Array(type, values, nullBitmap, typedValues.Count, nullCount, offset: 0);
+        }
+
+        private static (ArrowBuffer values, ArrowBuffer nullBitmap, int nullCount) BuildLongBuffers(
+            IList<object> typedValues, MemoryAllocator allocator)
+        {
+            ArrowBuffer.Builder<long> values = new ArrowBuffer.Builder<long>(typedValues.Count);
+            ArrowBuffer.BitmapBuilder bitmap = new ArrowBuffer.BitmapBuilder(typedValues.Count);
+            int nullCount = 0;
+            foreach (object v in typedValues)
+            {
+                if (v == null)
+                {
+                    values.Append(0L);
+                    bitmap.Append(false);
+                    nullCount++;
+                }
+                else
+                {
+                    values.Append((long)v);
+                    bitmap.Append(true);
+                }
+            }
+            ArrowBuffer nullBitmap = nullCount > 0 ? bitmap.Build(allocator) : ArrowBuffer.Empty;
+            return (values.Build(allocator), nullBitmap, nullCount);
+        }
+
+        private static (ArrowBuffer values, ArrowBuffer nullBitmap, int nullCount) BuildIntBuffers(
+            IList<object> typedValues, MemoryAllocator allocator)
+        {
+            ArrowBuffer.Builder<int> values = new ArrowBuffer.Builder<int>(typedValues.Count);
+            ArrowBuffer.BitmapBuilder bitmap = new ArrowBuffer.BitmapBuilder(typedValues.Count);
+            int nullCount = 0;
+            foreach (object v in typedValues)
+            {
+                if (v == null)
+                {
+                    values.Append(0);
+                    bitmap.Append(false);
+                    nullCount++;
+                }
+                else
+                {
+                    values.Append((int)v);
+                    bitmap.Append(true);
+                }
+            }
+            ArrowBuffer nullBitmap = nullCount > 0 ? bitmap.Build(allocator) : ArrowBuffer.Empty;
+            return (values.Build(allocator), nullBitmap, nullCount);
+        }
+
+        private static Decimal128Array BuildDecimalArray(
+            ShredType shredType, IList<object> typedValues, MemoryAllocator allocator)
+        {
+            int precision = shredType == ShredType.Decimal4 ? 9
+                : shredType == ShredType.Decimal8 ? 18
+                : 38;
+            // Scale: pick the max scale seen across all rows. Arrow's builder will rescale
+            // individual values to match; if rows have heterogeneous scales the larger one
+            // accommodates all values exactly (assuming precision is not exceeded).
+            int scale = 0;
+            foreach (object v in typedValues)
+            {
+                if (v is decimal d)
+                {
+                    int s = (decimal.GetBits(d)[3] >> 16) & 0x7F;
+                    if (s > scale) scale = s;
+                }
+            }
+
+            Decimal128Array.Builder b = new Decimal128Array.Builder(new Decimal128Type(precision, scale));
+            foreach (object v in typedValues)
+            {
+                if (v == null) b.AppendNull(); else b.Append((decimal)v);
+            }
+            return b.Build(allocator);
+        }
+
+        /// <summary>
+        /// UUID is encoded as FixedSizeBinary(16) in big-endian (RFC 4122) byte order.
+        /// FixedSizeBinaryArray has no concrete public Builder, so we construct the
+        /// value buffer manually (16 bytes per row).
+        /// </summary>
+        private static FixedSizeBinaryArray BuildUuidArray(
+            IList<object> typedValues, MemoryAllocator allocator)
+        {
+            FixedSizeBinaryType type = new FixedSizeBinaryType(16);
+            int rowCount = typedValues.Count;
+            ArrowBuffer.Builder<byte> values = new ArrowBuffer.Builder<byte>(rowCount * 16);
+            ArrowBuffer.BitmapBuilder bitmap = new ArrowBuffer.BitmapBuilder(rowCount);
+            int nullCount = 0;
+            byte[] scratch = new byte[16];
+
+            foreach (object v in typedValues)
+            {
+                if (v == null)
+                {
+                    // Emit 16 zero bytes as a placeholder; the null bitmap marks it invalid.
+                    for (int i = 0; i < 16; i++) values.Append((byte)0);
+                    bitmap.Append(false);
+                    nullCount++;
+                    continue;
+                }
+
+                Guid g = (Guid)v;
+#if NET8_0_OR_GREATER
+                g.TryWriteBytes(scratch.AsSpan(), bigEndian: true, out _);
+#else
+                byte[] native = g.ToByteArray();
+                // Convert .NET mixed-endian to big-endian.
+                scratch[0] = native[3]; scratch[1] = native[2]; scratch[2] = native[1]; scratch[3] = native[0];
+                scratch[4] = native[5]; scratch[5] = native[4];
+                scratch[6] = native[7]; scratch[7] = native[6];
+                Buffer.BlockCopy(native, 8, scratch, 8, 8);
+#endif
+                for (int i = 0; i < 16; i++) values.Append(scratch[i]);
+                bitmap.Append(true);
+            }
+
+            ArrowBuffer nullBitmap = nullCount > 0 ? bitmap.Build(allocator) : ArrowBuffer.Empty;
+            ArrayData data = new ArrayData(
+                type, rowCount, nullCount, 0,
+                new[] { nullBitmap, values.Build(allocator) });
+            return new FixedSizeBinaryArray(data);
+        }
+
+        // ---------------------------------------------------------------
+        // Utility
+        // ---------------------------------------------------------------
+
+        private static ArrowBuffer BuildNullBitmap(
+            IList<object> items, Func<object, bool> isValid, int rowCount,
+            MemoryAllocator allocator, out int nullCount)
+        {
+            ArrowBuffer.BitmapBuilder bitmap = new ArrowBuffer.BitmapBuilder(rowCount);
+            int nulls = 0;
+            for (int i = 0; i < rowCount; i++)
+            {
+                bool valid = isValid(items[i]);
+                bitmap.Append(valid);
+                if (!valid) nulls++;
+            }
+            nullCount = nulls;
+            return nulls > 0 ? bitmap.Build(allocator) : ArrowBuffer.Empty;
+        }
+    }
+}
diff --git a/src/Apache.Arrow.Operations/Shredding/ShreddingHelpers.cs b/src/Apache.Arrow.Operations/Shredding/ShreddingHelpers.cs
new file mode 100644
index 00000000..f7be85e8
--- /dev/null
+++ b/src/Apache.Arrow.Operations/Shredding/ShreddingHelpers.cs
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.
You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using Apache.Arrow;
+using Apache.Arrow.Types;
+
+namespace Apache.Arrow.Operations.Shredding
+{
+    /// <summary>
+    /// Internal helpers shared by the shredded-variant reader trio.
+    /// </summary>
+    internal static class ShreddingHelpers
+    {
+        /// <summary>
+        /// Builds a <see cref="ShreddedVariant"/> slot for the given index of an element-group
+        /// struct (one with <c>value</c> and/or <c>typed_value</c> sub-fields). Either sub-field
+        /// may be absent from the struct; an absent sub-field is passed on as a null array.
+        /// </summary>
+        public static ShreddedVariant BuildSlot(
+            ShredSchema slotSchema,
+            ReadOnlySpan metadata, // NOTE(review): the span's element type argument looks stripped from this text (presumably <byte>) — confirm
+            StructArray elementGroup,
+            int index)
+        {
+            StructType elementGroupType = (StructType)elementGroup.Data.DataType;
+            int valueIdx = elementGroupType.GetFieldIndex("value"); // a negative index is treated below as "sub-field absent"
+            int typedIdx = elementGroupType.GetFieldIndex("typed_value"); // same convention as valueIdx
+
+            IArrowArray valueArr = valueIdx >= 0 ? elementGroup.Fields[valueIdx] : null;
+            IArrowArray typedArr = typedIdx >= 0 ? elementGroup.Fields[typedIdx] : null;
+
+            return new ShreddedVariant(slotSchema, metadata, valueArr, typedArr, index);
+        }
+    }
+}
diff --git a/src/Apache.Arrow.Operations/Shredding/VariantArrayShreddingExtensions.cs b/src/Apache.Arrow.Operations/Shredding/VariantArrayShreddingExtensions.cs
new file mode 100644
index 00000000..4e1fbfe1
--- /dev/null
+++ b/src/Apache.Arrow.Operations/Shredding/VariantArrayShreddingExtensions.cs
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using Apache.Arrow; +using Apache.Arrow.Scalars.Variant; + +namespace Apache.Arrow.Operations.Shredding +{ + /// + /// Shredding-aware extensions on . Provides both + /// transparent materialization () and a + /// reader-style API () that exposes typed + /// columns and residual bytes side-by-side. + /// + public static class VariantArrayShreddingExtensions + { + /// + /// Gets the for a variant array, derived from + /// its Arrow storage type. Returns + /// for unshredded columns. + /// + public static ShredSchema GetShredSchema(this VariantArray array) + { + if (array == null) throw new ArgumentNullException(nameof(array)); + return ShredSchema.FromArrowType(array.VariantType.TypedValueField?.DataType); + } + + /// + /// Gets a reader for the element at the given index. + /// Exposes typed-column access and residual bytes without materializing the + /// full logical variant. Works for both shredded and unshredded columns. + /// + /// If the element is null. 
+        public static ShreddedVariant GetShreddedVariant(this VariantArray array, int index)
+        {
+            if (array == null) throw new ArgumentNullException(nameof(array));
+            if (index < 0 || index >= array.Length)
+                throw new ArgumentOutOfRangeException(nameof(index));
+            if (array.IsNull(index)) // a reader cannot be built over a null slot; GetLogicalVariantValue handles that case without throwing
+                throw new InvalidOperationException("Cannot create a ShreddedVariant for a null element.");
+
+            ShredSchema schema = GetShredSchema(array);
+            ReadOnlySpan metadata = array.GetMetadataBytes(index);
+            IArrowArray valueArr = array.VariantType.HasValueColumn // layouts without a value column hand the reader a null residual array
+                ? GetValueArray(array)
+                : null;
+            IArrowArray typedValueArr = array.TypedValueArray;
+
+            return new ShreddedVariant(schema, metadata, valueArr, typedValueArr, index);
+        }
+
+        /// <summary>
+        /// Materializes the element at <paramref name="index"/> into a logical
+        /// <see cref="VariantValue"/>, transparently merging shredded columns and
+        /// residual bytes. Works for both shredded and unshredded columns.
+        /// </summary>
+        public static VariantValue GetLogicalVariantValue(this VariantArray array, int index)
+        {
+            if (array == null) throw new ArgumentNullException(nameof(array));
+            if (index < 0 || index >= array.Length)
+                throw new ArgumentOutOfRangeException(nameof(index));
+            if (array.IsNull(index)) // checked here first: GetShreddedVariant would throw InvalidOperationException for a null slot
+                return VariantValue.Null;
+
+            return GetShreddedVariant(array, index).ToVariantValue();
+        }
+
+        ///
+        /// Returns the underlying value sub-array of the VariantArray's struct storage.
+        /// This mirrors what uses internally.
+ /// + private static IArrowArray GetValueArray(VariantArray array) + { + StructArray storage = array.StorageArray; + var structType = (Apache.Arrow.Types.StructType)storage.Data.DataType; + int idx = structType.GetFieldIndex("value"); + return storage.Fields[idx]; + } + } +} diff --git a/src/Apache.Arrow.Operations/Shredding/VariantShredder.cs b/src/Apache.Arrow.Operations/Shredding/VariantShredder.cs new file mode 100644 index 00000000..0a825b3f --- /dev/null +++ b/src/Apache.Arrow.Operations/Shredding/VariantShredder.cs @@ -0,0 +1,312 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow.Scalars.Variant; + +namespace Apache.Arrow.Operations.Shredding +{ + /// + /// Decomposes s into shredded (value, typed_value) pairs + /// according to a . + /// + /// Per the Parquet variant shredding spec, the variant metadata dictionary is shared + /// across an entire column. The + /// batch entrypoint builds that shared metadata and emits per-row value bytes that + /// reference it — ready to drop into a Parquet value column. 
+ /// + /// + public static class VariantShredder + { + /// + /// Shreds a column of variant values into a shared metadata dictionary and + /// per-row s. The residual + /// bytes for each row reference the returned metadata. + /// + public static (byte[] Metadata, IReadOnlyList Rows) Shred( + IEnumerable values, + ShredSchema schema) + { + if (values == null) throw new ArgumentNullException(nameof(values)); + if (schema == null) throw new ArgumentNullException(nameof(schema)); + + // Materialize so we can make two passes (metadata collection, then shredding). + List rows = values as List ?? new List(values); + + // Pass 1: collect every field name into the shared metadata dictionary. + // A superset is fine per-spec; the Parquet value column just needs to resolve + // any field ID it references. + VariantMetadataBuilder metadata = new VariantMetadataBuilder(); + foreach (VariantValue row in rows) + { + CollectFieldNames(row, metadata); + } + byte[] metadataBytes = metadata.Build(out int[] idRemap); + + // Pass 2: shred each row against the finalized metadata. + ShredResult[] results = new ShredResult[rows.Count]; + for (int i = 0; i < rows.Count; i++) + { + results[i] = Shred(rows[i], schema, metadata, idRemap); + } + + return (metadataBytes, results); + } + + /// + /// Shreds a single variant value against a caller-managed metadata dictionary. + /// Use this when combining shredded columns with external metadata, or when + /// streaming rows one at a time. The caller is responsible for ensuring + /// already contains every field name the residual + /// may reference. 
+ /// + public static ShredResult Shred( + VariantValue value, + ShredSchema schema, + VariantMetadataBuilder metadata, + int[] idRemap) + { + if (schema == null) throw new ArgumentNullException(nameof(schema)); + if (metadata == null) throw new ArgumentNullException(nameof(metadata)); + if (idRemap == null) throw new ArgumentNullException(nameof(idRemap)); + + switch (schema.TypedValueType) + { + case ShredType.None: + return ShredAsUntyped(value, metadata, idRemap); + case ShredType.Object: + return ShredAsObject(value, schema, metadata, idRemap); + case ShredType.Array: + return ShredAsArray(value, schema, metadata, idRemap); + default: + return ShredAsPrimitive(value, schema, metadata, idRemap); + } + } + + private static ShredResult ShredAsUntyped(VariantValue value, VariantMetadataBuilder metadata, int[] idRemap) + { + return new ShredResult(EncodeValue(value, metadata, idRemap), null); + } + + private static ShredResult ShredAsPrimitive(VariantValue value, ShredSchema schema, VariantMetadataBuilder metadata, int[] idRemap) + { + ShredType actualType = GetShredType(value); + if (actualType == schema.TypedValueType) + { + return new ShredResult(null, ExtractTypedValue(value, schema.TypedValueType)); + } + return new ShredResult(EncodeValue(value, metadata, idRemap), null); + } + + private static ShredResult ShredAsObject(VariantValue value, ShredSchema schema, VariantMetadataBuilder metadata, int[] idRemap) + { + if (!value.IsObject) + { + return new ShredResult(EncodeValue(value, metadata, idRemap), null); + } + + IReadOnlyDictionary fields = value.AsObject(); + Dictionary shreddedFields = new Dictionary(schema.ObjectFields.Count); + List> residualFields = null; + + foreach (KeyValuePair schemaField in schema.ObjectFields) + { + if (fields.TryGetValue(schemaField.Key, out VariantValue fieldValue)) + { + shreddedFields[schemaField.Key] = Shred(fieldValue, schemaField.Value, metadata, idRemap); + } + else + { + shreddedFields[schemaField.Key] = 
ShredResult.Missing; + } + } + + foreach (KeyValuePair field in fields) + { + if (!schema.ObjectFields.ContainsKey(field.Key)) + { + if (residualFields == null) + { + residualFields = new List>(); + } + residualFields.Add(field); + } + } + + ShredObjectResult typedValue = new ShredObjectResult(shreddedFields); + if (residualFields != null) + { + return new ShredResult(EncodeResidualObject(residualFields, metadata, idRemap), typedValue); + } + return new ShredResult(null, typedValue); + } + + private static ShredResult ShredAsArray(VariantValue value, ShredSchema schema, VariantMetadataBuilder metadata, int[] idRemap) + { + if (!value.IsArray) + { + return new ShredResult(EncodeValue(value, metadata, idRemap), null); + } + + IReadOnlyList elements = value.AsArray(); + List shreddedElements = new List(elements.Count); + for (int i = 0; i < elements.Count; i++) + { + shreddedElements.Add(Shred(elements[i], schema.ArrayElement, metadata, idRemap)); + } + return new ShredResult(null, new ShredArrayResult(shreddedElements)); + } + + // --------------------------------------------------------------- + // Encoding helpers — write value bytes referencing the shared metadata + // --------------------------------------------------------------- + + private static byte[] EncodeValue(VariantValue value, VariantMetadataBuilder metadata, int[] idRemap) + { + using VariantValueWriter writer = new VariantValueWriter(metadata, idRemap); + WriteVariantValue(writer, value); + return writer.ToArray(); + } + + private static byte[] EncodeResidualObject(List> fields, VariantMetadataBuilder metadata, int[] idRemap) + { + using VariantValueWriter writer = new VariantValueWriter(metadata, idRemap); + writer.BeginObject(); + foreach (KeyValuePair field in fields) + { + writer.WriteFieldName(field.Key); + WriteVariantValue(writer, field.Value); + } + writer.EndObject(); + return writer.ToArray(); + } + + private static void WriteVariantValue(VariantValueWriter writer, VariantValue variant) + { 
+ if (variant.IsNull) { writer.WriteNull(); return; } + if (variant.IsBoolean) { writer.WriteBoolean(variant.AsBoolean()); return; } + if (variant.IsObject) + { + writer.BeginObject(); + foreach (KeyValuePair field in variant.AsObject()) + { + writer.WriteFieldName(field.Key); + WriteVariantValue(writer, field.Value); + } + writer.EndObject(); + return; + } + if (variant.IsArray) + { + writer.BeginArray(); + foreach (VariantValue element in variant.AsArray()) + { + WriteVariantValue(writer, element); + } + writer.EndArray(); + return; + } + + switch (variant.PrimitiveType) + { + case VariantPrimitiveType.Int8: writer.WriteInt8(variant.AsInt8()); break; + case VariantPrimitiveType.Int16: writer.WriteInt16(variant.AsInt16()); break; + case VariantPrimitiveType.Int32: writer.WriteInt32(variant.AsInt32()); break; + case VariantPrimitiveType.Int64: writer.WriteInt64(variant.AsInt64()); break; + case VariantPrimitiveType.Float: writer.WriteFloat(variant.AsFloat()); break; + case VariantPrimitiveType.Double: writer.WriteDouble(variant.AsDouble()); break; + case VariantPrimitiveType.Decimal4: writer.WriteDecimal4(variant.AsDecimal()); break; + case VariantPrimitiveType.Decimal8: writer.WriteDecimal8(variant.AsDecimal()); break; + case VariantPrimitiveType.Decimal16: writer.WriteDecimal16(variant.AsSqlDecimal()); break; + case VariantPrimitiveType.Date: writer.WriteDateDays(variant.AsDateDays()); break; + case VariantPrimitiveType.Timestamp: writer.WriteTimestampMicros(variant.AsTimestampMicros()); break; + case VariantPrimitiveType.TimestampNtz: writer.WriteTimestampNtzMicros(variant.AsTimestampNtzMicros()); break; + case VariantPrimitiveType.TimeNtz: writer.WriteTimeNtzMicros(variant.AsTimeNtzMicros()); break; + case VariantPrimitiveType.TimestampTzNanos: writer.WriteTimestampTzNanos(variant.AsTimestampTzNanos()); break; + case VariantPrimitiveType.TimestampNtzNanos: writer.WriteTimestampNtzNanos(variant.AsTimestampNtzNanos()); break; + case VariantPrimitiveType.String: 
writer.WriteString(variant.AsString()); break; + case VariantPrimitiveType.Binary: writer.WriteBinary(variant.AsBinary()); break; + case VariantPrimitiveType.Uuid: writer.WriteUuid(variant.AsUuid()); break; + default: throw new NotSupportedException($"Unsupported primitive type: {variant.PrimitiveType}"); + } + } + + private static void CollectFieldNames(VariantValue variant, VariantMetadataBuilder builder) + { + if (variant.IsObject) + { + foreach (KeyValuePair field in variant.AsObject()) + { + builder.Add(field.Key); + CollectFieldNames(field.Value, builder); + } + } + else if (variant.IsArray) + { + foreach (VariantValue element in variant.AsArray()) + { + CollectFieldNames(element, builder); + } + } + } + + // --------------------------------------------------------------- + // Type extraction + // --------------------------------------------------------------- + + /// + /// Extracts the native CLR value from a for + /// storage in a typed Parquet column. + /// + internal static object ExtractTypedValue(VariantValue value, ShredType shredType) + { + switch (shredType) + { + case ShredType.Boolean: return value.AsBoolean(); + case ShredType.Int8: return value.AsInt8(); + case ShredType.Int16: return value.AsInt16(); + case ShredType.Int32: return value.AsInt32(); + case ShredType.Int64: return value.AsInt64(); + case ShredType.Float: return value.AsFloat(); + case ShredType.Double: return value.AsDouble(); + case ShredType.Decimal4: + case ShredType.Decimal8: + case ShredType.Decimal16: return value.AsDecimal(); + case ShredType.Date: return value.AsDateDays(); + case ShredType.Timestamp: return value.AsTimestampMicros(); + case ShredType.TimestampNtz: return value.AsTimestampNtzMicros(); + case ShredType.TimeNtz: return value.AsTimeNtzMicros(); + case ShredType.TimestampTzNanos: return value.AsTimestampTzNanos(); + case ShredType.TimestampNtzNanos: return value.AsTimestampNtzNanos(); + case ShredType.String: return value.AsString(); + case ShredType.Binary: 
return value.AsBinary(); + case ShredType.Uuid: return value.AsUuid(); + default: + throw new InvalidOperationException($"Cannot extract typed value for ShredType.{shredType}."); + } + } + + /// + /// Determines the of a . + /// + internal static ShredType GetShredType(VariantValue value) + { + if (value.IsObject) return ShredType.Object; + if (value.IsArray) return ShredType.Array; + return ShredSchema.ShredTypeFromPrimitive(value.PrimitiveType); + } + } +} diff --git a/src/Apache.Arrow.Operations/Shredding/VariantUnshredder.cs b/src/Apache.Arrow.Operations/Shredding/VariantUnshredder.cs new file mode 100644 index 00000000..346e57d8 --- /dev/null +++ b/src/Apache.Arrow.Operations/Shredding/VariantUnshredder.cs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow.Scalars.Variant; + +namespace Apache.Arrow.Operations.Shredding +{ + /// + /// Reconstructs s from shredded (value, typed_value) pairs + /// according to a . The residual + /// bytes are interpreted against the column-level variant metadata supplied to + /// . + /// + public static class VariantUnshredder + { + /// + /// Reconstructs a variant value from a shredded result. 
+ /// + /// The shredded (value, typed_value) pair. + /// The shredding schema that was used to produce the result. + /// The column-level variant metadata bytes. + /// + /// The reconstructed , or null if the field is missing + /// (both value and typed_value are null). + /// + public static VariantValue? Reconstruct(ShredResult shredded, ShredSchema schema, ReadOnlySpan metadata) + { + if (shredded == null) throw new ArgumentNullException(nameof(shredded)); + if (schema == null) throw new ArgumentNullException(nameof(schema)); + + if (shredded.IsMissing) + { + return null; + } + + switch (schema.TypedValueType) + { + case ShredType.Object: + return ReconstructObject(shredded, schema, metadata); + case ShredType.Array: + return ReconstructArray(shredded, schema, metadata); + case ShredType.None: + return DecodeValue(metadata, shredded.Value); + default: + return ReconstructPrimitive(shredded, schema, metadata); + } + } + + private static VariantValue ReconstructPrimitive(ShredResult shredded, ShredSchema schema, ReadOnlySpan metadata) + { + if (shredded.TypedValue != null) + { + return CreateVariantFromTyped(shredded.TypedValue, schema.TypedValueType); + } + return DecodeValue(metadata, shredded.Value); + } + + private static VariantValue ReconstructObject(ShredResult shredded, ShredSchema schema, ReadOnlySpan metadata) + { + if (shredded.TypedValue == null) + { + // Source value wasn't an object — the whole thing is in the residual. + return DecodeValue(metadata, shredded.Value); + } + + ShredObjectResult objectResult = (ShredObjectResult)shredded.TypedValue; + Dictionary fields = new Dictionary(); + + foreach (KeyValuePair fieldEntry in objectResult.Fields) + { + if (!schema.ObjectFields.TryGetValue(fieldEntry.Key, out ShredSchema fieldSchema)) + { + throw new InvalidOperationException( + $"Shredded object contains field '{fieldEntry.Key}' not in schema."); + } + + VariantValue? 
fieldValue = Reconstruct(fieldEntry.Value, fieldSchema, metadata); + if (fieldValue.HasValue) + { + fields[fieldEntry.Key] = fieldValue.Value; + } + // If null (missing), the field is omitted from the result. + } + + if (shredded.Value != null) + { + VariantValue residual = DecodeValue(metadata, shredded.Value); + if (!residual.IsObject) + { + throw new InvalidOperationException( + "Residual value for a partially shredded object must be an object."); + } + foreach (KeyValuePair residualField in residual.AsObject()) + { + fields[residualField.Key] = residualField.Value; + } + } + + return VariantValue.FromObject(fields); + } + + private static VariantValue ReconstructArray(ShredResult shredded, ShredSchema schema, ReadOnlySpan metadata) + { + if (shredded.TypedValue == null) + { + return DecodeValue(metadata, shredded.Value); + } + + ShredArrayResult arrayResult = (ShredArrayResult)shredded.TypedValue; + List elements = new List(arrayResult.Elements.Count); + + for (int i = 0; i < arrayResult.Elements.Count; i++) + { + VariantValue? elementValue = Reconstruct(arrayResult.Elements[i], schema.ArrayElement, metadata); + if (!elementValue.HasValue) + { + throw new InvalidOperationException( + $"Array element at index {i} is missing, but array elements cannot be missing."); + } + elements.Add(elementValue.Value); + } + + return VariantValue.FromArray(elements); + } + + private static VariantValue DecodeValue(ReadOnlySpan metadata, byte[] value) + { + if (value == null) throw new ArgumentNullException(nameof(value)); + VariantReader reader = new VariantReader(metadata, value); + return reader.ToVariantValue(); + } + + /// + /// Creates a from a typed CLR value and its . 
+        ///
+        internal static VariantValue CreateVariantFromTyped(object typedValue, ShredType shredType)
+        {
+            switch (shredType) // every case casts typedValue to one exact CLR type; a differently typed (or null, for value types) object fails the cast
+            {
+                case ShredType.Boolean: return VariantValue.FromBoolean((bool)typedValue);
+                case ShredType.Int8: return VariantValue.FromInt8((sbyte)typedValue);
+                case ShredType.Int16: return VariantValue.FromInt16((short)typedValue);
+                case ShredType.Int32: return VariantValue.FromInt32((int)typedValue);
+                case ShredType.Int64: return VariantValue.FromInt64((long)typedValue);
+                case ShredType.Float: return VariantValue.FromFloat((float)typedValue);
+                case ShredType.Double: return VariantValue.FromDouble((double)typedValue);
+                case ShredType.Decimal4: return VariantValue.FromDecimal4((decimal)typedValue);
+                case ShredType.Decimal8: return VariantValue.FromDecimal8((decimal)typedValue);
+                case ShredType.Decimal16: return VariantValue.FromDecimal16((decimal)typedValue); // NOTE(review): accepts a boxed System.Decimal only; a boxed SqlDecimal would fail this cast — confirm callers never pass one
+                case ShredType.Date: return VariantValue.FromDate((int)typedValue);
+                case ShredType.Timestamp: return VariantValue.FromTimestamp((long)typedValue);
+                case ShredType.TimestampNtz: return VariantValue.FromTimestampNtz((long)typedValue);
+                case ShredType.TimeNtz: return VariantValue.FromTimeNtz((long)typedValue);
+                case ShredType.TimestampTzNanos: return VariantValue.FromTimestampTzNanos((long)typedValue);
+                case ShredType.TimestampNtzNanos: return VariantValue.FromTimestampNtzNanos((long)typedValue);
+                case ShredType.String: return VariantValue.FromString((string)typedValue);
+                case ShredType.Binary: return VariantValue.FromBinary((byte[])typedValue);
+                case ShredType.Uuid: return VariantValue.FromUuid((Guid)typedValue);
+                default: // reached for any ShredType without a case above (e.g. None, Object, Array)
+                    throw new InvalidOperationException($"Cannot create VariantValue for ShredType.{shredType}.");
+            }
+        }
+    }
+}
diff --git a/src/Apache.Arrow.Scalars/Variant/VariantMetadataBuilder.cs b/src/Apache.Arrow.Scalars/Variant/VariantMetadataBuilder.cs
index 9e06feaf..61f708a3 100644
--- a/src/Apache.Arrow.Scalars/Variant/VariantMetadataBuilder.cs
+++
b/src/Apache.Arrow.Scalars/Variant/VariantMetadataBuilder.cs @@ -57,6 +57,38 @@ public int Add(string name) /// public int Count => _names.Count; + /// + /// Recursively walks and adds every field name it + /// references to this builder. Use during the metadata-collection phase of + /// a two-pass encode — build the metadata first, finalize it via + /// , then pass the resulting remap to a + /// and call . + /// + public void CollectFieldNames(VariantReader source) + { + switch (source.BasicType) + { + case VariantBasicType.Object: + VariantObjectReader obj = new VariantObjectReader(source.Metadata, source.Value); + for (int i = 0; i < obj.FieldCount; i++) + { + Add(obj.GetFieldName(i)); + CollectFieldNames(obj.GetFieldValue(i)); + } + return; + + case VariantBasicType.Array: + VariantArrayReader arr = new VariantArrayReader(source.Metadata, source.Value); + for (int i = 0; i < arr.ElementCount; i++) + { + CollectFieldNames(arr.GetElement(i)); + } + return; + + // Primitive values and short strings have no field-name references. + } + } + /// /// Builds the binary metadata with the dictionary sorted by UTF-8 byte order. /// diff --git a/src/Apache.Arrow.Scalars/Variant/VariantValue.cs b/src/Apache.Arrow.Scalars/Variant/VariantValue.cs index d16b1043..8848ca70 100644 --- a/src/Apache.Arrow.Scalars/Variant/VariantValue.cs +++ b/src/Apache.Arrow.Scalars/Variant/VariantValue.cs @@ -113,6 +113,24 @@ public static VariantValue FromDecimal8(decimal value) => public static VariantValue FromDecimal16(decimal value) => new VariantValue(VariantPrimitiveType.Decimal16, (object)value); + /// + /// Creates a Decimal16 variant value from a , always + /// producing . Values exceeding + /// range are stored as . + /// Use this when the target type is known (e.g. materializing a Decimal16 + /// shredded column); use when you + /// want the smallest decimal type that fits the value. 
+        ///
+        public static VariantValue FromDecimal16(SqlDecimal value)
+        {
+            if (value.Data[3] != 0 || value.Scale > 28) // System.Decimal holds at most 96 mantissa bits and a scale of 28; past either limit SqlDecimal.Value throws OverflowException, so keep the SqlDecimal
+            {
+                SqlDecimal normalized = SqlDecimal.ConvertToPrecScale(value, 38, value.Scale);
+                return new VariantValue(VariantPrimitiveType.Decimal16, (object)normalized);
+            }
+            return new VariantValue(VariantPrimitiveType.Decimal16, (object)value.Value); // safe: mantissa fits 96 bits and scale <= 28
+        }
+
         ///
         /// Creates a decimal variant value, choosing the smallest decimal type
         /// that can represent the value (Decimal4, Decimal8, or Decimal16).
diff --git a/src/Apache.Arrow.Scalars/Variant/VariantValueWriter.cs b/src/Apache.Arrow.Scalars/Variant/VariantValueWriter.cs
index 0daa47de..781f4a4f 100644
--- a/src/Apache.Arrow.Scalars/Variant/VariantValueWriter.cs
+++ b/src/Apache.Arrow.Scalars/Variant/VariantValueWriter.cs
@@ -530,6 +530,92 @@ public void WriteTimestampNtzNanos(long nanos)
             buf.WriteInt64LE(nanos);
         }
 
+        // ---------------------------------------------------------------
+        // Transcode from a VariantReader
+        // ---------------------------------------------------------------
+
+        ///
+        /// Copies the variant value pointed to by into this
+        /// writer. Useful when copying between metadata dictionaries: field IDs in the
+        /// source are re-looked-up against this writer's
+        /// on the fly, via .
+        ///
+        ///
+        /// All field names referenced anywhere in must already
+        /// exist in the metadata builder used to construct this writer. Use
+        /// during
+        /// the metadata-collection phase of a two-pass encode to accumulate them.
+ /// + public void CopyValue(VariantReader source) + { + switch (source.BasicType) + { + case VariantBasicType.Primitive: + CopyPrimitive(source); + return; + + case VariantBasicType.ShortString: + WriteString(source.GetString()); + return; + + case VariantBasicType.Object: + VariantObjectReader obj = new VariantObjectReader(source.Metadata, source.Value); + BeginObject(); + for (int i = 0; i < obj.FieldCount; i++) + { + WriteFieldName(obj.GetFieldName(i)); + CopyValue(obj.GetFieldValue(i)); + } + EndObject(); + return; + + case VariantBasicType.Array: + VariantArrayReader arr = new VariantArrayReader(source.Metadata, source.Value); + BeginArray(); + for (int i = 0; i < arr.ElementCount; i++) + { + CopyValue(arr.GetElement(i)); + } + EndArray(); + return; + + default: + throw new NotSupportedException($"Unsupported basic type: {source.BasicType}"); + } + } + + private void CopyPrimitive(VariantReader source) + { + VariantPrimitiveType? pt = source.PrimitiveType; + switch (pt) + { + case VariantPrimitiveType.NullType: WriteNull(); return; + case VariantPrimitiveType.BooleanTrue: WriteBoolean(true); return; + case VariantPrimitiveType.BooleanFalse: WriteBoolean(false); return; + case VariantPrimitiveType.Int8: WriteInt8(source.GetInt8()); return; + case VariantPrimitiveType.Int16: WriteInt16(source.GetInt16()); return; + case VariantPrimitiveType.Int32: WriteInt32(source.GetInt32()); return; + case VariantPrimitiveType.Int64: WriteInt64(source.GetInt64()); return; + case VariantPrimitiveType.Float: WriteFloat(source.GetFloat()); return; + case VariantPrimitiveType.Double: WriteDouble(source.GetDouble()); return; + case VariantPrimitiveType.Decimal4: WriteDecimal4(source.GetDecimal4()); return; + case VariantPrimitiveType.Decimal8: WriteDecimal8(source.GetDecimal8()); return; + // Decimal16 may exceed System.Decimal's range, so route through SqlDecimal. 
+ case VariantPrimitiveType.Decimal16: WriteDecimal16(source.GetSqlDecimal()); return; + case VariantPrimitiveType.Date: WriteDateDays(source.GetDateDays()); return; + case VariantPrimitiveType.Timestamp: WriteTimestampMicros(source.GetTimestampMicros()); return; + case VariantPrimitiveType.TimestampNtz: WriteTimestampNtzMicros(source.GetTimestampNtzMicros()); return; + case VariantPrimitiveType.TimeNtz: WriteTimeNtzMicros(source.GetTimeNtzMicros()); return; + case VariantPrimitiveType.TimestampTzNanos: WriteTimestampTzNanos(source.GetTimestampTzNanos()); return; + case VariantPrimitiveType.TimestampNtzNanos: WriteTimestampNtzNanos(source.GetTimestampNtzNanos()); return; + case VariantPrimitiveType.String: WriteString(source.GetString()); return; + case VariantPrimitiveType.Binary: WriteBinary(source.GetBinary()); return; + case VariantPrimitiveType.Uuid: WriteUuid(source.GetUuid()); return; + default: + throw new NotSupportedException($"Unsupported primitive type: {pt}"); + } + } + // --------------------------------------------------------------- // Internal bookkeeping // --------------------------------------------------------------- diff --git a/src/Apache.Arrow/Arrays/VariantArray.cs b/src/Apache.Arrow/Arrays/VariantArray.cs index c31a382a..3bc9fd53 100644 --- a/src/Apache.Arrow/Arrays/VariantArray.cs +++ b/src/Apache.Arrow/Arrays/VariantArray.cs @@ -37,9 +37,15 @@ private VariantExtensionDefinition() { } public override bool TryCreateType(IArrowType storageType, string metadata, out ExtensionType type) { + // Accept the Parquet variant storage layouts: + // struct<metadata, value> (unshredded) + // struct<metadata, value, typed_value> (shredded with residual) + // struct<metadata, typed_value> (fully shredded, no value column) + // The metadata field is required. At least one of value/typed_value must be present. 
if (storageType is StructType structType && FindBinaryFieldIndex(structType, "metadata") >= 0 && - FindBinaryFieldIndex(structType, "value") >= 0) + (FindBinaryFieldIndex(structType, "value") >= 0 || + structType.GetFieldIndex("typed_value") >= 0)) { type = new VariantType(structType); return true; @@ -67,8 +73,15 @@ internal static int FindBinaryFieldIndex(StructType structType, string name) } /// - /// Extension type representing Parquet Variant values, stored as - /// struct&lt;metadata: binary, value: binary&gt;. + /// Extension type representing Parquet Variant values. The underlying storage is + /// a struct with a required metadata binary field and at least one of: + /// + /// value: binary — the variant value bytes (possibly residual when shredded). + /// typed_value: T — a typed column populated by variant shredding, where + /// T is an Arrow primitive, struct, or list per the Parquet variant shredding spec. + /// + /// Use IsShredded to check for shredded layouts. Decoding shredded + /// values requires Apache.Arrow.Operations.Shredding. /// public class VariantType : ExtensionType { @@ -79,14 +92,46 @@ public class VariantType : ExtensionType public override string Name => ExtensionName; public override string ExtensionMetadata => ""; + /// + /// True if the storage layout has a value binary field (unshredded or + /// partially shredded). False for fully shredded layouts that omit the column. + /// + public bool HasValueColumn { get; } + + /// + /// True if the storage layout has a typed_value field (shredded). + /// + public bool HasTypedValueColumn { get; } + + /// + /// True if the storage layout includes any shredding (has a typed_value + /// column, or lacks a value column indicating full shredding). + /// + public bool IsShredded => HasTypedValueColumn || !HasValueColumn; + + /// + /// The typed_value field when HasTypedValueColumn is true; otherwise null. 
+ /// + public Field TypedValueField { get; } + public VariantType() : base(new StructType(new[] { new Field("metadata", BinaryType.Default, false), new Field("value", BinaryType.Default, false), })) - { } + { + HasValueColumn = true; // the default storage declared above is metadata + value only + HasTypedValueColumn = false; + TypedValueField = null; + } - internal VariantType(StructType storageType) : base(storageType) { } + internal VariantType(StructType storageType) : base(storageType) + { + HasValueColumn = VariantExtensionDefinition.FindBinaryFieldIndex(storageType, "value") >= 0; + int typedIdx = storageType.GetFieldIndex("typed_value"); // found by name only; the field's type is not checked here + HasTypedValueColumn = typedIdx >= 0; + TypedValueField = typedIdx >= 0 ? storageType.Fields[typedIdx] : null; + } public override ExtensionArray CreateArray(IArrowArray storageArray) { @@ -107,15 +152,53 @@ public class VariantArray : ExtensionArray, IReadOnlyList public StructArray StorageArray => (StructArray)Storage; + /// + /// The variant type metadata describing which columns are present. + /// + public VariantType VariantType => (VariantType)ExtensionType; + + /// + /// True when the underlying column includes shredded data (has a + /// typed_value column, or lacks a value column). Reads + /// on shredded columns require Apache.Arrow.Operations.Shredding. + /// + public bool IsShredded => VariantType.IsShredded; + + /// + /// The typed_value child array when the column is shredded; otherwise null. 
+ /// + public IArrowArray TypedValueArray { get; } + public VariantArray(VariantType variantType, IArrowArray storage) : base(variantType, storage) { var structType = (StructType)variantType.StorageType; _metadataArray = DecodeBinaryArray(StorageArray.Fields[structType.GetFieldIndex("metadata")], out _metadataIndexes); - _valueArray = DecodeBinaryArray(StorageArray.Fields[structType.GetFieldIndex("value")], out _valueIndexes); + + if (variantType.HasValueColumn) // otherwise _valueArray stays null, which GetValueBytes and TryGetValueBytes test for + { + _valueArray = DecodeBinaryArray(StorageArray.Fields[structType.GetFieldIndex("value")], out _valueIndexes); + } + + if (variantType.HasTypedValueColumn) + { + TypedValueArray = StorageArray.Fields[structType.GetFieldIndex("typed_value")]; + } } - public VariantArray(IArrowArray storage) : this(VariantType.Default, storage) { } + public VariantArray(IArrowArray storage) : this(InferVariantType(storage), storage) { } + + private static VariantType InferVariantType(IArrowArray storage) + { + if (storage == null) throw new ArgumentNullException(nameof(storage)); + if (VariantExtensionDefinition.Instance.TryCreateType(storage.Data.DataType, null, out ExtensionType ext)) // layout is derived from the storage array's own data type + { + return (VariantType)ext; + } + throw new ArgumentException( + "Storage array does not match a variant layout (expected struct).", + nameof(storage)); + } /// /// Gets the metadata bytes for the element at the given index. @@ -129,18 +212,52 @@ public ReadOnlySpan GetMetadataBytes(int index) /// /// Gets the value bytes for the element at the given index. /// + /// If the column has no value field. public ReadOnlySpan GetValueBytes(int index) { + if (_valueArray == null) + { + throw new InvalidOperationException( + "This VariantArray has no 'value' column (fully shredded layout). 
" + "Use the shredding-aware readers in Apache.Arrow.Operations.Shredding."); + } int physicalIndex = _valueIndexes.GetPhysicalIndex(index); return _valueArray.GetBytes(physicalIndex, out bool isNull); } + /// + /// Returns true and sets value when the element at + /// index has value bytes, false otherwise. Shredded + /// elements whose residual is null will return false. + /// + public bool TryGetValueBytes(int index, out ReadOnlySpan value) + { + if (_valueArray == null) // no value column at all: report "no bytes" instead of throwing like GetValueBytes + { + value = default; + return false; + } + int physicalIndex = _valueIndexes.GetPhysicalIndex(index); + ReadOnlySpan bytes = _valueArray.GetBytes(physicalIndex, out bool isNull); + if (isNull) + { + value = default; + return false; + } + value = bytes; + return true; + } + /// /// Gets a zero-copy for the element at the given index. /// The reader is only valid while the underlying array buffers are alive. /// /// If is out of range. - /// If the element at is null. + /// + /// If the element at index is null, or the column is shredded + /// (a VariantReader over residual bytes alone does not represent the + /// logical variant — use Apache.Arrow.Operations.Shredding instead). + /// public VariantReader GetVariantReader(int index) { if (index < 0 || index >= Length) @@ -149,12 +266,20 @@ public VariantReader GetVariantReader(int index) if (IsNull(index)) throw new InvalidOperationException("Cannot create a VariantReader for a null element."); + if (IsShredded) + throw new InvalidOperationException( + "Cannot create a VariantReader for a shredded column. Use the shredding-aware " + + "readers in Apache.Arrow.Operations.Shredding."); + return new VariantReader(GetMetadataBytes(index), GetValueBytes(index)); } /// /// Gets a materialized for the element at the given index. + /// Shredded columns require Apache.Arrow.Operations.Shredding; call the + /// GetShreddedVariant / GetLogicalVariantValue extension methods instead. 
/// + /// If the column is shredded. 
public VariantValue GetVariantValue(int index) { if (index < 0 || index >= Length) @@ -163,6 +288,12 @@ public VariantValue GetVariantValue(int index) if (IsNull(index)) return VariantValue.Null; + if (IsShredded) // tested after the null check, so null elements still return VariantValue.Null on shredded columns + throw new InvalidOperationException( + "GetVariantValue is not supported on shredded VariantArrays. " + + "Reference Apache.Arrow.Operations.Shredding and use GetLogicalVariantValue " + + "(transparent materialization) or GetShreddedVariant (reader-style access)."); + var metadata = GetMetadataBytes(index); var value = GetValueBytes(index); var reader = new VariantReader(metadata, value); @@ -174,6 +305,13 @@ public VariantValue GetVariantValue(int index) public IEnumerator GetEnumerator() { + if (IsShredded) // NOTE(review): if this method is an iterator (uses yield), this throw is deferred until the first MoveNext — confirm + { + throw new InvalidOperationException( + "Enumeration is not supported on shredded VariantArrays. " + + "Reference Apache.Arrow.Operations.Shredding and iterate via GetLogicalVariantValue."); + } + IEnumerator metadataIdx = _metadataIndexes.EnumeratePhysicalIndices().GetEnumerator(); IEnumerator valueIdx = _valueIndexes.EnumeratePhysicalIndices().GetEnumerator(); for (int i = 0; metadataIdx.MoveNext() && valueIdx.MoveNext(); i++) diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShredRoundTripTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShredRoundTripTests.cs new file mode 100644 index 00000000..24728bdb --- /dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShredRoundTripTests.cs @@ -0,0 +1,404 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Operations.VariantJson; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + /// + /// Round-trip tests: VariantValue → Shred → Reconstruct → VariantValue, verifying equality. + /// + public class ShredRoundTripTests + { + private static VariantValue RoundTrip(VariantValue original, ShredSchema schema) + { + (byte[] metadata, IReadOnlyList rows) = VariantShredder.Shred(new[] { original }, schema); + VariantValue? 
result = VariantUnshredder.Reconstruct(rows[0], schema, metadata); + Assert.True(result.HasValue, "Round-trip should not produce a missing value."); + return result.Value; + } + + // --------------------------------------------------------------- + // Primitives through typed columns + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_Boolean_True() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Boolean); + Assert.Equal(VariantValue.True, RoundTrip(VariantValue.True, schema)); + } + + [Fact] + public void RoundTrip_Boolean_False() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Boolean); + Assert.Equal(VariantValue.False, RoundTrip(VariantValue.False, schema)); + } + + [Fact] + public void RoundTrip_Int8() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int8); + VariantValue v = VariantValue.FromInt8(-42); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Int16() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int16); + VariantValue v = VariantValue.FromInt16(short.MaxValue); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Int32() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + VariantValue v = VariantValue.FromInt32(42); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Int64() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int64); + VariantValue v = VariantValue.FromInt64(long.MaxValue); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Float() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Float); + VariantValue v = VariantValue.FromFloat(3.14f); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Double() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Double); + VariantValue v = VariantValue.FromDouble(Math.PI); + Assert.Equal(v, RoundTrip(v, schema)); + } + + 
[Fact] + public void RoundTrip_String() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.String); + VariantValue v = VariantValue.FromString("hello world"); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Binary() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Binary); + VariantValue v = VariantValue.FromBinary(new byte[] { 0, 1, 2, 255 }); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Uuid() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Uuid); + VariantValue v = VariantValue.FromUuid(new Guid("00112233-4455-6677-8899-aabbccddeeff")); // fixed GUID with 16 distinct bytes: reproducible, and any byte reordering changes the value + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Date() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Date); + VariantValue v = VariantValue.FromDate(19000); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Timestamp() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Timestamp); + VariantValue v = VariantValue.FromTimestamp(1640995200000000L); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Decimal4() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Decimal4); + VariantValue v = VariantValue.FromDecimal4(99.99m); + Assert.Equal(v, RoundTrip(v, schema)); + } + + // --------------------------------------------------------------- + // Primitives through binary fallback (type mismatch) + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_TypeMismatch_FallsBackToBinary() + { + // Schema expects Int32, but value is a string — goes through binary. 
+ ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + VariantValue v = VariantValue.FromString("hello"); + Assert.Equal(v, RoundTrip(v, schema)); + } + + [Fact] + public void RoundTrip_Null_FallsBackToBinary() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + Assert.Equal(VariantValue.Null, RoundTrip(VariantValue.Null, schema)); + } + + // --------------------------------------------------------------- + // Objects + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_FullyShreddedObject() + { + VariantValue original = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + }); + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + Assert.Equal(original, RoundTrip(original, schema)); + } + + [Fact] + public void RoundTrip_PartiallyShreddedObject() + { + VariantValue original = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + { "active", VariantValue.True }, + }); + + // Only shred "name" — "age" and "active" go to residual binary. + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + }); + + Assert.Equal(original, RoundTrip(original, schema)); + } + + [Fact] + public void RoundTrip_ObjectWithMissingField() + { + // Only "name" present, schema expects "name" and "age". 
+ VariantValue original = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + }); + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + Assert.Equal(original, RoundTrip(original, schema)); + } + + [Fact] + public void RoundTrip_EmptyObject() + { + VariantValue original = VariantValue.FromObject(new Dictionary()); + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "x", ShredSchema.Primitive(ShredType.Int32) }, + }); + + Assert.Equal(original, RoundTrip(original, schema)); + } + + // --------------------------------------------------------------- + // Arrays + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_Array_Homogeneous() + { + VariantValue original = VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromInt32(2), + VariantValue.FromInt32(3)); + + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + Assert.Equal(original, RoundTrip(original, schema)); + } + + [Fact] + public void RoundTrip_Array_Mixed() + { + VariantValue original = VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromString("two"), + VariantValue.Null, + VariantValue.True); + + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + Assert.Equal(original, RoundTrip(original, schema)); + } + + [Fact] + public void RoundTrip_EmptyArray() + { + VariantValue original = VariantValue.FromArray(new List()); + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + Assert.Equal(original, RoundTrip(original, schema)); + } + + // --------------------------------------------------------------- + // Nested structures + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_NestedObjectsAndArrays() 
+ { + VariantValue original = VariantValue.FromObject(new Dictionary + { + { "users", VariantValue.FromArray( + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "scores", VariantValue.FromArray( + VariantValue.FromInt32(95), + VariantValue.FromInt32(87)) + }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + { "scores", VariantValue.FromArray( + VariantValue.FromInt32(88)) + }, + })) + }, + { "count", VariantValue.FromInt32(2) }, + }); + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "users", ShredSchema.ForArray( + ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "scores", ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)) }, + })) + }, + { "count", ShredSchema.Primitive(ShredType.Int32) }, + }); + + Assert.Equal(original, RoundTrip(original, schema)); + } + + // --------------------------------------------------------------- + // Unshredded fallback + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_Unshredded() + { + VariantValue original = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + }); + + ShredSchema schema = ShredSchema.Unshredded(); + Assert.Equal(original, RoundTrip(original, schema)); + } + + // --------------------------------------------------------------- + // Cross-codec: JSON → shred → unshred → JSON + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_JsonThroughShredding() + { + string originalJson = "{\"name\":\"Alice\",\"age\":30,\"tags\":[\"a\",\"b\"]}"; + (byte[] metaIn, byte[] valueIn) = VariantJsonReader.Parse(originalJson); + VariantValue parsed = new VariantReader(metaIn, valueIn).ToVariantValue(); + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", 
ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int8) }, + }); + + VariantValue reconstructed = RoundTrip(parsed, schema); + string resultJson = VariantJsonWriter.ToJson(reconstructed); + + // Field order in JSON output is not guaranteed to match; compare via parsed equality. + (byte[] metaOut, byte[] valueOut) = VariantJsonReader.Parse(resultJson); + VariantValue reparsed = new VariantReader(metaOut, valueOut).ToVariantValue(); + Assert.Equal(parsed, reparsed); + } + + // --------------------------------------------------------------- + // Column-level shared metadata: rows share a single dictionary + // --------------------------------------------------------------- + + [Fact] + public void RoundTrip_MultipleRows_SharedMetadata() + { + List values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "extra1", VariantValue.FromInt32(1) }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + { "extra2", VariantValue.FromString("hi") }, + }), + }; + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + }); + + (byte[] metadata, IReadOnlyList rows) = VariantShredder.Shred(values, schema); + + Assert.Equal(values.Count, rows.Count); + for (int i = 0; i < values.Count; i++) + { + VariantValue? 
reconstructed = VariantUnshredder.Reconstruct(rows[i], schema, metadata); + Assert.True(reconstructed.HasValue); + Assert.Equal(values[i], reconstructed.Value); + } + } + } +} diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaInfererTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaInfererTests.cs new file mode 100644 index 00000000..9aaee5d9 --- /dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaInfererTests.cs @@ -0,0 +1,259 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +using System.Collections.Generic; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + public class ShredSchemaInfererTests + { + private readonly ShredSchemaInferer _inferer = new ShredSchemaInferer(); + + [Fact] + public void Infer_EmptyValues_ReturnsUnshredded() + { + ShredSchema schema = _inferer.Infer(new List()); + Assert.Equal(ShredType.None, schema.TypedValueType); + } + + [Fact] + public void Infer_AllSameType_ReturnsPrimitive() + { + List values = new List + { + VariantValue.FromInt32(1), + VariantValue.FromInt32(2), + VariantValue.FromInt32(3), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Int32, schema.TypedValueType); + } + + [Fact] + public void Infer_MixedTypes_BelowConsistency_ReturnsUnshredded() + { + // 2 strings vs 2 ints = 50% consistency, below default 80% threshold. + List values = new List + { + VariantValue.FromInt32(1), + VariantValue.FromString("two"), + VariantValue.FromInt32(3), + VariantValue.FromString("four"), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.None, schema.TypedValueType); + } + + [Fact] + public void Infer_MixedTypes_AboveConsistency_ReturnsDominant() + { + // 9 ints vs 1 string = 90% consistency, above default 80%. 
+ List values = new List(); + for (int i = 0; i < 9; i++) + { + values.Add(VariantValue.FromInt32(i)); + } + values.Add(VariantValue.FromString("outlier")); + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Int32, schema.TypedValueType); + } + + [Fact] + public void Infer_Objects_InfersFieldSchemas() + { + List values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + { "age", VariantValue.FromInt32(25) }, + }), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Object, schema.TypedValueType); + Assert.NotNull(schema.ObjectFields); + Assert.True(schema.ObjectFields.ContainsKey("name")); + Assert.True(schema.ObjectFields.ContainsKey("age")); + Assert.Equal(ShredType.String, schema.ObjectFields["name"].TypedValueType); + Assert.Equal(ShredType.Int32, schema.ObjectFields["age"].TypedValueType); + } + + [Fact] + public void Infer_Objects_RareField_Excluded() + { + // "rare" appears in only 1 of 4 objects = 25%, below 50% threshold. 
+ List values = new List(); + for (int i = 0; i < 4; i++) + { + Dictionary fields = new Dictionary + { + { "name", VariantValue.FromString($"user{i}") }, + }; + if (i == 0) + { + fields["rare"] = VariantValue.FromInt32(99); + } + values.Add(VariantValue.FromObject(fields)); + } + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Object, schema.TypedValueType); + Assert.True(schema.ObjectFields.ContainsKey("name")); + Assert.False(schema.ObjectFields.ContainsKey("rare")); + } + + [Fact] + public void Infer_Arrays_InfersElementSchema() + { + List values = new List + { + VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromInt32(2)), + VariantValue.FromArray( + VariantValue.FromInt32(3)), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Array, schema.TypedValueType); + Assert.NotNull(schema.ArrayElement); + Assert.Equal(ShredType.Int32, schema.ArrayElement.TypedValueType); + } + + [Fact] + public void Infer_NestedObjects() + { + List values = new List + { + VariantValue.FromObject(new Dictionary + { + { "address", VariantValue.FromObject(new Dictionary + { + { "city", VariantValue.FromString("NYC") }, + }) + }, + }), + VariantValue.FromObject(new Dictionary + { + { "address", VariantValue.FromObject(new Dictionary + { + { "city", VariantValue.FromString("LA") }, + }) + }, + }), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Object, schema.TypedValueType); + Assert.True(schema.ObjectFields.ContainsKey("address")); + + ShredSchema addressSchema = schema.ObjectFields["address"]; + Assert.Equal(ShredType.Object, addressSchema.TypedValueType); + Assert.True(addressSchema.ObjectFields.ContainsKey("city")); + Assert.Equal(ShredType.String, addressSchema.ObjectFields["city"].TypedValueType); + } + + [Fact] + public void Infer_RespectsMaxDepth() + { + List values = new List + { + VariantValue.FromObject(new Dictionary + { + { "deep", VariantValue.FromObject(new Dictionary + { 
+ { "value", VariantValue.FromInt32(1) }, + }) + }, + }), + VariantValue.FromObject(new Dictionary + { + { "deep", VariantValue.FromObject(new Dictionary + { + { "value", VariantValue.FromInt32(2) }, + }) + }, + }), + }; + + // MaxDepth=0 means only top level — nested objects not explored. + ShredOptions options = new ShredOptions { MaxDepth = 0 }; + ShredSchema schema = _inferer.Infer(values, options); + + // Top level is object, but fields shouldn't be further explored. + Assert.Equal(ShredType.Object, schema.TypedValueType); + // "deep" field is an object, but since we can't recurse (maxDepth=0), + // it falls back to unshredded. + Assert.True(schema.ObjectFields.ContainsKey("deep")); + Assert.Equal(ShredType.None, schema.ObjectFields["deep"].TypedValueType); + } + + [Fact] + public void Infer_CustomOptions() + { + // 3 strings + 1 int = 75% string. Default (80%) would reject, but custom 70% accepts. + List values = new List + { + VariantValue.FromString("a"), + VariantValue.FromString("b"), + VariantValue.FromString("c"), + VariantValue.FromInt32(1), + }; + + ShredOptions options = new ShredOptions { MinTypeConsistency = 0.7 }; + ShredSchema schema = _inferer.Infer(values, options); + Assert.Equal(ShredType.String, schema.TypedValueType); + } + + [Fact] + public void Infer_AllStrings() + { + List values = new List + { + VariantValue.FromString("hello"), + VariantValue.FromString("world"), + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.String, schema.TypedValueType); + } + + [Fact] + public void Infer_AllBooleans() + { + List values = new List + { + VariantValue.True, + VariantValue.False, + VariantValue.True, + }; + + ShredSchema schema = _inferer.Infer(values); + Assert.Equal(ShredType.Boolean, schema.TypedValueType); + } + } +} diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaTests.cs new file mode 100644 index 00000000..f5f0eb5e --- 
/dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShredSchemaTests.cs @@ -0,0 +1,159 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + public class ShredSchemaTests + { + [Fact] + public void Unshredded_HasNoneType() + { + ShredSchema schema = ShredSchema.Unshredded(); + Assert.Equal(ShredType.None, schema.TypedValueType); + Assert.Null(schema.ObjectFields); + Assert.Null(schema.ArrayElement); + } + + [Fact] + public void Primitive_HasCorrectType() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + Assert.Equal(ShredType.Int32, schema.TypedValueType); + Assert.Null(schema.ObjectFields); + Assert.Null(schema.ArrayElement); + } + + [Theory] + [InlineData(ShredType.None)] + [InlineData(ShredType.Object)] + [InlineData(ShredType.Array)] + public void Primitive_RejectsNonPrimitiveTypes(ShredType type) + { + Assert.Throws(() => ShredSchema.Primitive(type)); + } + + [Fact] + public void ForObject_HasObjectType() + { + ShredSchema schema = 
ShredSchema.ForObject(new Dictionary + { + { "x", ShredSchema.Primitive(ShredType.Int32) }, + }); + + Assert.Equal(ShredType.Object, schema.TypedValueType); + Assert.NotNull(schema.ObjectFields); + Assert.Single(schema.ObjectFields); + Assert.Null(schema.ArrayElement); + } + + [Fact] + public void ForObject_NullThrows() + { + Assert.Throws(() => ShredSchema.ForObject(null)); + } + + [Fact] + public void ForArray_HasArrayType() + { + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.String)); + + Assert.Equal(ShredType.Array, schema.TypedValueType); + Assert.Null(schema.ObjectFields); + Assert.NotNull(schema.ArrayElement); + Assert.Equal(ShredType.String, schema.ArrayElement.TypedValueType); + } + + [Fact] + public void ForArray_NullThrows() + { + Assert.Throws(() => ShredSchema.ForArray(null)); + } + + [Fact] + public void ShredTypeFromPrimitive_MapsAllTypes() + { + Assert.Equal(ShredType.Boolean, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.BooleanTrue)); + Assert.Equal(ShredType.Boolean, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.BooleanFalse)); + Assert.Equal(ShredType.Int8, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Int8)); + Assert.Equal(ShredType.Int16, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Int16)); + Assert.Equal(ShredType.Int32, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Int32)); + Assert.Equal(ShredType.Int64, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Int64)); + Assert.Equal(ShredType.Float, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Float)); + Assert.Equal(ShredType.Double, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Double)); + Assert.Equal(ShredType.Decimal4, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Decimal4)); + Assert.Equal(ShredType.Decimal8, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Decimal8)); + Assert.Equal(ShredType.Decimal16, 
ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Decimal16)); + Assert.Equal(ShredType.Date, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Date)); + Assert.Equal(ShredType.Timestamp, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Timestamp)); + Assert.Equal(ShredType.TimestampNtz, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.TimestampNtz)); + Assert.Equal(ShredType.TimeNtz, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.TimeNtz)); + Assert.Equal(ShredType.TimestampTzNanos, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.TimestampTzNanos)); + Assert.Equal(ShredType.TimestampNtzNanos, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.TimestampNtzNanos)); + Assert.Equal(ShredType.String, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.String)); + Assert.Equal(ShredType.Binary, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Binary)); + Assert.Equal(ShredType.Uuid, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.Uuid)); + Assert.Equal(ShredType.None, ShredSchema.ShredTypeFromPrimitive(VariantPrimitiveType.NullType)); + } + + // --------------------------------------------------------------- + // FromArrowType — decimal width inference across Decimal32/64/128. + // --------------------------------------------------------------- + + [Theory] + // Decimal32: any precision ≤ 9 maps to Decimal4. + [InlineData(typeof(Decimal32Type), 4, 2, ShredType.Decimal4)] + [InlineData(typeof(Decimal32Type), 9, 4, ShredType.Decimal4)] + // Decimal64: ≤9 still fits Decimal4; 10–18 maps to Decimal8. + [InlineData(typeof(Decimal64Type), 9, 2, ShredType.Decimal4)] + [InlineData(typeof(Decimal64Type), 10, 2, ShredType.Decimal8)] + [InlineData(typeof(Decimal64Type), 18, 9, ShredType.Decimal8)] + // Decimal128: width chosen by precision bucket. 
+ [InlineData(typeof(Decimal128Type), 9, 4, ShredType.Decimal4)] + [InlineData(typeof(Decimal128Type), 18, 9, ShredType.Decimal8)] + [InlineData(typeof(Decimal128Type), 38, 9, ShredType.Decimal16)] + public void FromArrowType_DecimalTypes_MapToCorrectShredWidth( + Type arrowTypeKind, int precision, int scale, ShredType expected) + { + IArrowType arrowType = (IArrowType)Activator.CreateInstance(arrowTypeKind, precision, scale); + ShredSchema schema = ShredSchema.FromArrowType(arrowType); + Assert.Equal(expected, schema.TypedValueType); + } + + [Fact] + public void FromArrowType_Decimal128_PrecisionGreaterThan38_Throws() + { + // Precision 39 exceeds the spec max (38). + Assert.Throws( + () => ShredSchema.FromArrowType(new Decimal128Type(39, 0))); + } + + [Fact] + public void FromArrowType_Decimal256_Unsupported() + { + // Decimal256 exists in Arrow but the variant spec only defines 4/8/16-byte + // decimal widths, so 32-byte unscaled storage isn't a valid shred target. + Assert.Throws( + () => ShredSchema.FromArrowType(new Decimal256Type(10, 2))); + } + } +} diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantArrayBuilderTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantArrayBuilderTests.cs new file mode 100644 index 00000000..049b1948 --- /dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantArrayBuilderTests.cs @@ -0,0 +1,424 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using Apache.Arrow; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + /// + /// Round-trip tests for the producer path: take s, + /// shred them, assemble into a shredded , and then + /// read each row back via GetLogicalVariantValue. The reader is the + /// trusted oracle (validated against the Iceberg corpus), so equality here + /// confirms the builder produces a correct Arrow structure. + /// + public class ShreddedVariantArrayBuilderTests + { + private static VariantArray ShredAndBuild(IReadOnlyList values, ShredSchema schema) + { + (byte[] metadata, IReadOnlyList rows) = VariantShredder.Shred(values, schema); + return ShreddedVariantArrayBuilder.Build(schema, metadata, rows); + } + + private static void AssertRoundTrip(IReadOnlyList values, ShredSchema schema) + { + VariantArray array = ShredAndBuild(values, schema); + Assert.Equal(values.Count, array.Length); + for (int i = 0; i < values.Count; i++) + { + VariantValue actual = array.GetLogicalVariantValue(i); + Assert.Equal(values[i], actual); + } + } + + // --------------------------------------------------------------- + // Unshredded (schema = None) + // --------------------------------------------------------------- + + [Fact] + public void Unshredded_Column_HasNoTypedValue() + { + var values = new List + { + VariantValue.FromInt32(42), + VariantValue.FromString("hello"), + }; + VariantArray array = ShredAndBuild(values, 
ShredSchema.Unshredded()); + + Assert.False(array.IsShredded); + AssertRoundTrip(values, ShredSchema.Unshredded()); + } + + // --------------------------------------------------------------- + // Primitive shredding + // --------------------------------------------------------------- + + [Fact] + public void Primitive_Int32() + { + var values = new List + { + VariantValue.FromInt32(1), + VariantValue.FromInt32(2), + VariantValue.FromInt32(-42), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Int32)); + } + + [Fact] + public void Primitive_Boolean() + { + var values = new List { VariantValue.True, VariantValue.False, VariantValue.True }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Boolean)); + } + + [Fact] + public void Primitive_String() + { + var values = new List + { + VariantValue.FromString("alpha"), + VariantValue.FromString("beta"), + VariantValue.FromString(""), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.String)); + } + + [Fact] + public void Primitive_Int64() + { + var values = new List + { + VariantValue.FromInt64(long.MaxValue), + VariantValue.FromInt64(long.MinValue), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Int64)); + } + + [Fact] + public void Primitive_Double() + { + var values = new List + { + VariantValue.FromDouble(Math.PI), + VariantValue.FromDouble(-0.0), + VariantValue.FromDouble(double.MaxValue), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Double)); + } + + [Fact] + public void Primitive_Decimal4() + { + var values = new List + { + VariantValue.FromDecimal4(123.45m), + VariantValue.FromDecimal4(-99.99m), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Decimal4)); + } + + [Fact] + public void Primitive_Date() + { + var values = new List + { + VariantValue.FromDate(19000), + VariantValue.FromDate(0), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Date)); + } + + [Fact] + public void Primitive_Timestamp() + { + var values = new 
List + { + VariantValue.FromTimestamp(1640995200000000L), + VariantValue.FromTimestamp(0L), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Timestamp)); + } + + [Fact] + public void Primitive_Uuid() + { + var values = new List + { + VariantValue.FromUuid(Guid.NewGuid()), + VariantValue.FromUuid(Guid.Empty), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Uuid)); + } + + [Fact] + public void Primitive_Binary() + { + var values = new List + { + VariantValue.FromBinary(new byte[] { 1, 2, 3 }), + VariantValue.FromBinary(new byte[] { 0xff, 0x00 }), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Binary)); + } + + // --------------------------------------------------------------- + // Primitive type mismatch — falls back to residual + // --------------------------------------------------------------- + + [Fact] + public void Primitive_TypeMismatch_FallsBackToBinary() + { + // Schema expects Int32, values include a string — the string goes to residual. 
+ var values = new List + { + VariantValue.FromInt32(42), + VariantValue.FromString("not an int"), + VariantValue.FromInt32(99), + }; + AssertRoundTrip(values, ShredSchema.Primitive(ShredType.Int32)); + } + + // --------------------------------------------------------------- + // Object shredding + // --------------------------------------------------------------- + + [Fact] + public void Object_FullyShredded() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + { "age", VariantValue.FromInt32(25) }, + }), + }; + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + VariantArray array = ShredAndBuild(values, schema); + Assert.True(array.IsShredded); + AssertRoundTrip(values, schema); + } + + [Fact] + public void Object_PartiallyShredded_MergesResidualFields() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + { "extra", VariantValue.True }, + }), + }; + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + }); + + AssertRoundTrip(values, schema); + } + + [Fact] + public void Object_MissingField_NotInReconstruction() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("only-name") }, + }), + }; + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + AssertRoundTrip(values, schema); + } + + // --------------------------------------------------------------- + // Array 
shredding + // --------------------------------------------------------------- + + [Fact] + public void Array_Homogeneous() + { + var values = new List + { + VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromInt32(2), + VariantValue.FromInt32(3)), + VariantValue.FromArray(VariantValue.FromInt32(4)), + VariantValue.FromArray(new List()), + }; + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + AssertRoundTrip(values, schema); + } + + [Fact] + public void Array_MixedElements_FallbackToBinary() + { + var values = new List + { + VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromString("two"), + VariantValue.FromInt32(3)), + }; + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + AssertRoundTrip(values, schema); + } + + // --------------------------------------------------------------- + // Nested structures + // --------------------------------------------------------------- + + [Fact] + public void Nested_ObjectsAndArrays() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "users", VariantValue.FromArray( + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "score", VariantValue.FromInt32(95) }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + { "score", VariantValue.FromInt32(88) }, + })) + }, + { "count", VariantValue.FromInt32(2) }, + }), + }; + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "users", ShredSchema.ForArray( + ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "score", ShredSchema.Primitive(ShredType.Int32) }, + })) + }, + { "count", ShredSchema.Primitive(ShredType.Int32) }, + }); + + AssertRoundTrip(values, schema); + } + + // --------------------------------------------------------------- + // Shape of the built Arrow array + // 
--------------------------------------------------------------- + + [Fact] + public void Build_ProducesExpectedArrowShape_PrimitiveInt32() + { + var values = new List { VariantValue.FromInt32(42) }; + VariantArray array = ShredAndBuild(values, ShredSchema.Primitive(ShredType.Int32)); + Assert.True(array.IsShredded); + Assert.NotNull(array.TypedValueArray); + Assert.IsType(array.TypedValueArray); + } + + [Fact] + public void Build_ProducesExpectedArrowShape_Object() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "x", VariantValue.FromInt32(1) }, + }), + }; + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "x", ShredSchema.Primitive(ShredType.Int32) }, + }); + VariantArray array = ShredAndBuild(values, schema); + Assert.True(array.IsShredded); + Assert.IsType(array.TypedValueArray); + } + + [Fact] + public void Build_ProducesExpectedArrowShape_Array() + { + var values = new List + { + VariantValue.FromArray(VariantValue.FromInt32(1), VariantValue.FromInt32(2)), + }; + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + VariantArray array = ShredAndBuild(values, schema); + Assert.True(array.IsShredded); + Assert.IsType(array.TypedValueArray); + } + + // --------------------------------------------------------------- + // Reader-side composition: built array is usable by the shredded reader. 
+ // --------------------------------------------------------------- + + [Fact] + public void BuiltArray_SupportsShreddedReaderAccess() + { + var values = new List + { + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("alice") }, + { "age", VariantValue.FromInt32(42) }, + }), + }; + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + VariantArray array = ShredAndBuild(values, schema); + + ShreddedVariant slot = array.GetShreddedVariant(0); + ShreddedObject obj = slot.GetObject(); + + Assert.True(obj.TryGetField("name", out ShreddedVariant nameField)); + Assert.Equal("alice", nameField.GetString()); + + Assert.True(obj.TryGetField("age", out ShreddedVariant ageField)); + Assert.Equal(42, ageField.GetInt32()); + } + } +} diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantConformanceTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantConformanceTests.cs new file mode 100644 index 00000000..b592ffe7 --- /dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantConformanceTests.cs @@ -0,0 +1,236 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text.Json; +using Apache.Arrow; +using Apache.Arrow.Ipc; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Apache.Arrow.Types; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + /// + /// Conformance tests against the Iceberg-derived shredded variant test corpus + /// from apache/parquet-testing. For each case, the sibling .arrow IPC + /// file (produced by test/shredded_variant_ipc/regen.py) is loaded, + /// the var column is projected as a , and + /// each row's materialization is compared against the expected + /// *.variant.bin payload. + /// + public class ShreddedVariantConformanceTests + { + private static readonly string IpcDir = FindIpcDir(); + private static readonly string ShreddedVariantDir = FindShreddedVariantDir(); + + private static string FindIpcDir() + { + string dir = AppContext.BaseDirectory; + for (int i = 0; i < 10; i++) + { + string candidate = Path.Combine(dir, "test", "shredded_variant_ipc"); + if (Directory.Exists(candidate) && Directory.GetFiles(candidate, "*.arrow").Length > 0) + return candidate; + string parent = Path.GetDirectoryName(dir); + if (parent == null || parent == dir) break; + dir = parent; + } + return null; + } + + private static string FindShreddedVariantDir() + { + string dir = AppContext.BaseDirectory; + for (int i = 0; i < 10; i++) + { + string candidate = Path.Combine(dir, "test", "parquet-testing", "shredded_variant"); + if (File.Exists(Path.Combine(candidate, "cases.json"))) + return candidate; + string parent = Path.GetDirectoryName(dir); + if (parent == null || parent == dir) break; + dir = parent; + } + return null; + } + + public static IEnumerable SingleRecordCases() + { + string shreddedDir = FindShreddedVariantDir(); + if (shreddedDir == null) + 
{ + yield return new object[] { 0, null, null, null }; + yield break; + } + + string casesPath = Path.Combine(shreddedDir, "cases.json"); + using JsonDocument doc = JsonDocument.Parse(File.ReadAllText(casesPath)); + foreach (JsonElement c in doc.RootElement.EnumerateArray()) + { + if (!c.TryGetProperty("variant_file", out JsonElement vf)) continue; + if (!c.TryGetProperty("parquet_file", out JsonElement pf)) continue; + // Skip spec-INVALID cases here — the cases.json notes explicitly say + // "implementations can choose to error, or read the shredded value". + // They're covered by separate deliberate tests. + if (pf.GetString().Contains("INVALID")) continue; + int caseNumber = c.GetProperty("case_number").GetInt32(); + string testName = c.TryGetProperty("test", out JsonElement t) ? t.GetString() : ""; + yield return new object[] { caseNumber, testName, pf.GetString(), vf.GetString() }; + } + } + + public static IEnumerable MultiRecordCases() + { + string shreddedDir = FindShreddedVariantDir(); + if (shreddedDir == null) + { + yield return new object[] { 0, null, null }; + yield break; + } + + string casesPath = Path.Combine(shreddedDir, "cases.json"); + using JsonDocument doc = JsonDocument.Parse(File.ReadAllText(casesPath)); + foreach (JsonElement c in doc.RootElement.EnumerateArray()) + { + if (!c.TryGetProperty("variant_files", out _)) continue; + if (!c.TryGetProperty("parquet_file", out JsonElement pf)) continue; + int caseNumber = c.GetProperty("case_number").GetInt32(); + string testName = c.TryGetProperty("test", out JsonElement t) ? 
t.GetString() : ""; + yield return new object[] { caseNumber, testName, pf.GetString() }; + } + } + + [SkippableTheory] + [MemberData(nameof(SingleRecordCases))] + public void SingleRecord(int caseNumber, string testName, string parquetFile, string variantFile) + { + Skip.If(ShreddedVariantDir == null, "parquet-testing submodule not checked out"); + Skip.If(IpcDir == null, "regen.py has not been run (test/shredded_variant_ipc/*.arrow missing)"); + + string stem = Path.GetFileNameWithoutExtension(parquetFile); + string ipcPath = Path.Combine(IpcDir, stem + ".arrow"); + string variantBinPath = Path.Combine(ShreddedVariantDir, variantFile); + Skip.IfNot(File.Exists(ipcPath), $"Missing {ipcPath} (case {caseNumber}: {testName})"); + + VariantArray variantArray = LoadVariantArray(ipcPath); + Assert.True(variantArray.Length >= 1, $"Expected at least 1 row (case {caseNumber}: {testName})"); + + VariantValue actual = variantArray.GetLogicalVariantValue(0); + VariantValue expected = LoadExpectedVariant(variantBinPath); + + Assert.Equal(expected, actual); + } + + [SkippableTheory] + [MemberData(nameof(MultiRecordCases))] + public void MultiRecord(int caseNumber, string testName, string parquetFile) + { + Skip.If(ShreddedVariantDir == null, "parquet-testing submodule not checked out"); + Skip.If(IpcDir == null, "regen.py has not been run"); + + string stem = Path.GetFileNameWithoutExtension(parquetFile); + string ipcPath = Path.Combine(IpcDir, stem + ".arrow"); + Skip.IfNot(File.Exists(ipcPath), $"Missing {ipcPath} (case {caseNumber}: {testName})"); + + VariantArray variantArray = LoadVariantArray(ipcPath); + + // Load the list of expected files from cases.json. 
+ string casesPath = Path.Combine(ShreddedVariantDir, "cases.json"); + using JsonDocument doc = JsonDocument.Parse(File.ReadAllText(casesPath)); + JsonElement caseElement = doc.RootElement.EnumerateArray() + .First(c => c.GetProperty("case_number").GetInt32() == caseNumber); + JsonElement variantFiles = caseElement.GetProperty("variant_files"); + + Assert.Equal(variantFiles.GetArrayLength(), variantArray.Length); + + for (int i = 0; i < variantArray.Length; i++) + { + JsonElement vf = variantFiles[i]; + if (vf.ValueKind == JsonValueKind.Null) + { + Assert.True(variantArray.IsNull(i), $"Case {caseNumber} ({testName}) row {i} expected struct-level null"); + continue; + } + + Assert.False(variantArray.IsNull(i), $"Case {caseNumber} ({testName}) row {i} unexpectedly null"); + string binPath = Path.Combine(ShreddedVariantDir, vf.GetString()); + VariantValue expected = LoadExpectedVariant(binPath); + VariantValue actual = variantArray.GetLogicalVariantValue(i); + Assert.Equal(expected, actual); + } + } + + // --------------------------------------------------------------- + // IPC loading helpers + // --------------------------------------------------------------- + + private static VariantArray LoadVariantArray(string ipcPath) + { + using Stream stream = File.OpenRead(ipcPath); + using ArrowFileReader reader = new ArrowFileReader(stream); + RecordBatch batch = reader.ReadNextRecordBatch(); + if (batch == null) + { + throw new InvalidOperationException($"No record batches in {ipcPath}"); + } + + int varIdx = batch.Schema.GetFieldIndex("var"); + Assert.True(varIdx >= 0, "IPC schema missing 'var' column"); + + IArrowArray varArray = batch.Column(varIdx); + return new VariantArray(varArray); + } + + // --------------------------------------------------------------- + // Expected-variant loading: decode the .variant.bin format + // = concatenated metadata bytes | value bytes + // --------------------------------------------------------------- + + private static VariantValue 
LoadExpectedVariant(string variantBinPath) + { + byte[] bytes = File.ReadAllBytes(variantBinPath); + int metadataLength = ComputeMetadataLength(bytes); + ReadOnlySpan metadata = new ReadOnlySpan(bytes, 0, metadataLength); + ReadOnlySpan value = new ReadOnlySpan(bytes, metadataLength, bytes.Length - metadataLength); + VariantReader reader = new VariantReader(metadata, value); + return reader.ToVariantValue(); + } + + private static int ComputeMetadataLength(byte[] bytes) + { + byte header = bytes[0]; + int offsetSize = ((header >> 6) & 0x3) + 1; + int dictSize = ReadLittleEndianInt(bytes, 1, offsetSize); + int offsetsStart = 1 + offsetSize; + int stringsStart = offsetsStart + (dictSize + 1) * offsetSize; + int lastOffset = ReadLittleEndianInt(bytes, offsetsStart + dictSize * offsetSize, offsetSize); + return stringsStart + lastOffset; + } + + private static int ReadLittleEndianInt(byte[] buf, int pos, int size) + { + int result = 0; + for (int i = 0; i < size; i++) + { + result |= buf[pos + i] << (8 * i); + } + return result; + } + } +} diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantErrorCaseTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantErrorCaseTests.cs new file mode 100644 index 00000000..fbf3e3d8 --- /dev/null +++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantErrorCaseTests.cs @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.IO; +using Apache.Arrow; +using Apache.Arrow.Ipc; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + /// + /// Regression tests against the spec-invalid cases in the Iceberg corpus: + /// (1) cases with error_message in cases.json — malformed schemas + /// (unsupported Arrow types) or malformed data (value/typed_value conflicts). + /// (2) cases with case-NNN-INVALID.parquet — spec-invalid but whose + /// published "implementations may error or read" leaves behavior to the reader. 
+ /// + public class ShreddedVariantErrorCaseTests + { + private static readonly string IpcDir = FindIpcDir(); + + private static string FindIpcDir() + { + string dir = AppContext.BaseDirectory; + for (int i = 0; i < 10; i++) + { + string candidate = Path.Combine(dir, "test", "shredded_variant_ipc"); + if (Directory.Exists(candidate) && Directory.GetFiles(candidate, "*.arrow").Length > 0) + return candidate; + string parent = Path.GetDirectoryName(dir); + if (parent == null || parent == dir) break; + dir = parent; + } + return null; + } + + private static VariantArray LoadCase(string stem) + { + Skip.If(IpcDir == null, "regen.py has not been run"); + string path = Path.Combine(IpcDir, stem + ".arrow"); + Skip.IfNot(File.Exists(path), $"Missing {path}"); + + using Stream stream = File.OpenRead(path); + using ArrowFileReader reader = new ArrowFileReader(stream); + RecordBatch batch = reader.ReadNextRecordBatch(); + return new VariantArray(batch.Column(batch.Schema.GetFieldIndex("var"))); + } + + // =============================================================== + // Schema-level errors: unsupported Arrow types in typed_value. + // These should fail as soon as the shredding schema is derived. + // =============================================================== + + [SkippableFact] + public void Case127_UnsignedInteger_RejectedAtSchemaDerivation() + { + // typed_value: uint32 — not a supported shredded type per spec. + VariantArray array = LoadCase("case-127"); + ArgumentException ex = Assert.Throws(() => array.GetShredSchema()); + Assert.Contains("Unsupported shredded value type", ex.Message); + } + + [SkippableFact] + public void Case137_FixedLengthByteArray_NotUuid_RejectedAtSchemaDerivation() + { + // typed_value: fixed_size_binary[4] — only fsb(16) is valid (UUID). 
+            VariantArray array = LoadCase("case-137");
+            ArgumentException ex = Assert.Throws<ArgumentException>(() => array.GetShredSchema());
+            Assert.Contains("Unsupported shredded value type", ex.Message);
+        }
+
+        [SkippableFact]
+        public void Case127_GetLogicalVariantValue_AlsoThrows()
+        {
+            // Any reader-facing entrypoint should surface the schema error.
+            VariantArray array = LoadCase("case-127");
+            Assert.Throws<ArgumentException>(() => array.GetLogicalVariantValue(0)); // NOTE(review): type argument lost in extraction; ArgumentException presumed (schema error) — confirm
+        }
+
+        // ===============================================================
+        // Data-level errors: both value and typed_value populated where
+        // the spec forbids it (primitive and array-element slots).
+        // ===============================================================
+
+        [SkippableFact]
+        public void Case42_PrimitiveSlot_ValueAndTypedValueConflict_Throws()
+        {
+            // Top-level row: value has residual bytes AND typed_value is an int32.
+            VariantArray array = LoadCase("case-042");
+            InvalidOperationException ex = Assert.Throws<InvalidOperationException>(
+                () => array.GetLogicalVariantValue(0));
+            Assert.Contains("both", ex.Message);
+            Assert.Contains("value", ex.Message);
+            Assert.Contains("typed_value", ex.Message);
+        }
+
+        [SkippableFact]
+        public void Case40_ArrayElement_ValueAndTypedValueConflict_Throws()
+        {
+            // Array element 0 has both value and typed_value set.
+            VariantArray array = LoadCase("case-040");
+            InvalidOperationException ex = Assert.Throws<InvalidOperationException>(
+                () => array.GetLogicalVariantValue(0));
+            Assert.Contains("both", ex.Message);
+            Assert.Contains("value", ex.Message);
+            Assert.Contains("typed_value", ex.Message);
+        }
+
+        [SkippableFact]
+        public void Case87_NonObjectResidualWithShreddedFields_Throws()
+        {
+            // Top-level typed_value is a shredded-object struct, but the residual
+            // 'value' column holds a non-object variant (int32 = 34). Spec invalid.
+            VariantArray array = LoadCase("case-087");
+            InvalidOperationException ex = Assert.Throws<InvalidOperationException>(
+                () => array.GetLogicalVariantValue(0));
+            Assert.Contains("object", ex.Message);
+        }
+
+        [SkippableFact]
+        public void Case128_NonObjectResidualWithEmptyShreddedFields_Throws()
+        {
+            // typed_value has all-null fields, value is a variant null (not an object).
+            VariantArray array = LoadCase("case-128");
+            Assert.Throws<InvalidOperationException>(() => array.GetLogicalVariantValue(0)); // NOTE(review): type argument lost in extraction; presumed to match Case87 — confirm
+        }
+
+        // ===============================================================
+        // "INVALID" parquet files: spec-noncompliant but whose cases.json
+        // notes say "implementations can choose to error, or read". We
+        // document current behavior: we read (and the merged value may
+        // differ from the Iceberg-published expected value).
+        // ===============================================================
+
+        [SkippableFact]
+        public void Case043_INVALID_FieldConflict_ReadsWithoutThrowing()
+        {
+            // case-043-INVALID: a shredded field has typed_value=null but the
+            // residual object re-declares it. We merge both, producing a result
+            // that differs from Iceberg's published "typed wins" expectation.
+            VariantArray array = LoadCase("case-043-INVALID");
+            VariantValue v = array.GetLogicalVariantValue(0);
+            Assert.True(v.IsObject,
+                "Invalid-043 row is expected to materialize as an object under our permissive reader.");
+        }
+
+        [SkippableFact]
+        public void Case125_INVALID_FieldConflict_ReadsWithoutThrowing()
+        {
+            VariantArray array = LoadCase("case-125-INVALID");
+            VariantValue v = array.GetLogicalVariantValue(0);
+            Assert.True(v.IsObject);
+        }
+
+        [SkippableFact]
+        public void Case084_INVALID_OptionalFieldStructs_ReadsWithoutThrowing()
+        {
+            VariantArray array = LoadCase("case-084-INVALID");
+            VariantValue v = array.GetLogicalVariantValue(0);
+            Assert.True(v.IsObject);
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantReaderTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantReaderTests.cs
new file mode 100644
index 00000000..85ead71b
--- /dev/null
+++ b/test/Apache.Arrow.Operations.Tests/Shredding/ShreddedVariantReaderTests.cs
@@ -0,0 +1,511 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Apache.Arrow;
+using Apache.Arrow.Ipc;
+using Apache.Arrow.Operations.Shredding;
+using Apache.Arrow.Scalars.Variant;
+using Apache.Arrow.Types;
+using Xunit;
+
+namespace Apache.Arrow.Operations.Tests.Shredding
+{
+    /// <summary>
+    /// Reader-style API tests: exercise <c>GetShredSchema</c>,
+    /// <c>GetShreddedVariant</c>, and typed accessors
+    /// without going through full variant materialization. These mirror what a
+    /// query engine would do for push-down reads against typed Parquet columns.
+    /// </summary>
+    public class ShreddedVariantReaderTests
+    {
+        private static readonly string IpcDir = FindIpcDir();
+
+        private static string FindIpcDir()
+        {
+            string dir = AppContext.BaseDirectory;
+            for (int i = 0; i < 10; i++)
+            {
+                string candidate = Path.Combine(dir, "test", "shredded_variant_ipc");
+                if (Directory.Exists(candidate) && Directory.GetFiles(candidate, "*.arrow").Length > 0)
+                    return candidate;
+                string parent = Path.GetDirectoryName(dir);
+                if (parent == null || parent == dir) break;
+                dir = parent;
+            }
+            return null;
+        }
+
+        private static VariantArray LoadCase(string caseStem)
+        {
+            Skip.If(IpcDir == null, "regen.py has not been run");
+            string path = Path.Combine(IpcDir, caseStem + ".arrow");
+            Skip.IfNot(File.Exists(path), $"Missing {path}");
+
+            using Stream stream = File.OpenRead(path);
+            using ArrowFileReader reader = new ArrowFileReader(stream);
+            RecordBatch batch = reader.ReadNextRecordBatch();
+            return new VariantArray(batch.Column(batch.Schema.GetFieldIndex("var")));
+        }
+
+        // ---------------------------------------------------------------
+        // Schema + state introspection
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetShredSchema_ReflectsPrimitiveTypedValue()
+        {
+            VariantArray array = LoadCase("case-010"); // typed_value: int32
+            ShredSchema schema = array.GetShredSchema();
+            Assert.Equal(ShredType.Int32, schema.TypedValueType);
+        }
+
+        [SkippableFact]
+        public void GetShreddedVariant_HasTypedValue_WhenColumnPopulated()
+        {
+            VariantArray array = LoadCase("case-010"); // Int32 = 12345
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.True(slot.HasTypedValue);
+            Assert.False(slot.HasResidual);
+            Assert.False(slot.IsMissing);
+        }
+
+        [SkippableFact]
+        public void GetShreddedVariant_HasResidual_WhenUnshredded()
+        {
+            VariantArray array = LoadCase("case-048"); // testUnshreddedVariants (bool true)
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.False(slot.HasTypedValue);
+            Assert.True(slot.HasResidual);
+        }
+
+        // ---------------------------------------------------------------
+        // Typed primitive accessors
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetInt32_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-010"); // Int32 = 12345
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(12345, slot.GetInt32());
+        }
+
+        [SkippableFact]
+        public void GetInt8_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-006"); // Int8 = 34
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal((sbyte)34, slot.GetInt8());
+        }
+
+        [SkippableFact]
+        public void GetInt64_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-012"); // Int64 = 9876543210
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(9876543210L, slot.GetInt64());
+        }
+
+        [SkippableFact]
+        public void GetBoolean_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-004"); // Bool = true
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.True(slot.GetBoolean());
+        }
+
+        [SkippableFact]
+        public void GetString_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-031"); // String
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal("iceberg", slot.GetString());
+        }
+
+        [SkippableFact]
+        public void GetDouble_ReadsShreddedValue()
+        {
+            VariantArray array = LoadCase("case-016"); // Double = 14.3
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(14.3, slot.GetDouble());
+        }
+
+        [SkippableFact]
+        public void GetDecimal_ReadsShreddedDecimal4()
+        {
+            VariantArray array = LoadCase("case-024"); // Decimal4 = 123456.789 (scale 3?)
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            decimal d = slot.GetDecimal();
+            Assert.NotEqual(0m, d);
+        }
+
+        // ---------------------------------------------------------------
+        // Type-mismatch errors
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetInt32_OnStringSchema_Throws()
+        {
+            VariantArray array = LoadCase("case-031"); // String
+            Assert.Throws<InvalidOperationException>(() => array.GetShreddedVariant(0).GetInt32()); // NOTE(review): type argument lost in extraction; InvalidOperationException presumed — confirm
+        }
+
+        [SkippableFact]
+        public void GetString_OnInt32Schema_Throws()
+        {
+            VariantArray array = LoadCase("case-010"); // Int32
+            Assert.Throws<InvalidOperationException>(() => array.GetShreddedVariant(0).GetString()); // NOTE(review): type argument lost in extraction; InvalidOperationException presumed — confirm
+        }
+
+        [SkippableFact]
+        public void GetInt32_WithResidualOnly_Throws()
+        {
+            // Case 48 is unshredded — typed column absent on this row.
+            VariantArray array = LoadCase("case-048");
+            Assert.False(array.GetShreddedVariant(0).HasTypedValue);
+            Assert.Throws<InvalidOperationException>(() => array.GetShreddedVariant(0).GetInt32()); // NOTE(review): type argument lost in extraction; InvalidOperationException presumed — confirm
+        }
+
+        // ---------------------------------------------------------------
+        // Residual reader access
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void TryGetResidualReader_ReturnsUnderlyingBytes()
+        {
+            VariantArray array = LoadCase("case-048"); // unshredded bool=true
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.True(slot.TryGetResidualReader(out VariantReader reader));
+            Assert.True(reader.IsBoolean);
+            Assert.True(reader.GetBoolean());
+        }
+
+        [SkippableFact]
+        public void TryGetResidualReader_FalseWhenNoResidual()
+        {
+            VariantArray array = LoadCase("case-010"); // fully shredded Int32
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.False(slot.TryGetResidualReader(out VariantReader _));
+        }
+
+        // ---------------------------------------------------------------
+        // Object traversal
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetObject_Nested_FullyTypedLeaves()
+        {
+            // case-044: outer object with field 'c' whose typed_value is itself a
+            // shredded object {a: int32=34, b: string="iceberg"}.
+            VariantArray array = LoadCase("case-044");
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            Assert.Equal(ShredType.Object, slot.Schema.TypedValueType);
+
+            ShreddedObject outerObj = slot.GetObject();
+            Assert.True(outerObj.TryGetField("c", out ShreddedVariant cField));
+            Assert.True(cField.HasTypedValue);
+            Assert.Equal(ShredType.Object, cField.Schema.TypedValueType);
+
+            ShreddedObject innerObj = cField.GetObject();
+            Assert.True(innerObj.TryGetField("a", out ShreddedVariant aField));
+            Assert.True(aField.HasTypedValue);
+            Assert.Equal(34, aField.GetInt32());
+
+            Assert.True(innerObj.TryGetField("b", out ShreddedVariant bField));
+            Assert.True(bField.HasTypedValue);
+            Assert.Equal("iceberg", bField.GetString());
+        }
+
+        [SkippableFact]
+        public void GetObject_MixedTypedAndResidualFields()
+        {
+            // case-138: top-level value column is missing (fully shredded at the top),
+            // but individual field typed_values can still be null at this row —
+            // their values live in the field-level residual ('value' sub-column).
+            // Field 'a' has a shredded typed of int32 but the actual value is int16,
+            // so 'a' falls to residual; field 'b' is typed string "iceberg".
+            VariantArray array = LoadCase("case-138");
+            ShreddedObject obj = array.GetShreddedVariant(0).GetObject();
+
+            Assert.True(obj.TryGetField("a", out ShreddedVariant aField));
+            Assert.False(aField.HasTypedValue);  // schema says int32, value is int16
+            Assert.True(aField.HasResidual);     // residual holds the int16 bytes
+            Assert.True(aField.TryGetResidualReader(out VariantReader aReader));
+            Assert.Equal((short)1234, aReader.GetInt16());
+
+            Assert.True(obj.TryGetField("b", out ShreddedVariant bField));
+            Assert.True(bField.HasTypedValue);
+            Assert.Equal("iceberg", bField.GetString());
+        }
+
+        [SkippableFact]
+        public void TryGetField_ReturnsFalseForUnknownField()
+        {
+            VariantArray array = LoadCase("case-138");
+            ShreddedObject obj = array.GetShreddedVariant(0).GetObject();
+            Assert.False(obj.TryGetField("nonexistent", out _));
+        }
+
+        [SkippableFact]
+        public void Object_PartialShred_ExposesResidual()
+        {
+            // case-134: partially shredded — typed fields a, b plus residual field d (date).
+            VariantArray array = LoadCase("case-134");
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+            ShreddedObject obj = slot.GetObject();
+
+            Assert.True(obj.TryGetField("b", out ShreddedVariant bField));
+            Assert.Equal("iceberg", bField.GetString());
+
+            Assert.True(obj.TryGetResidualReader(out VariantReader residualReader));
+            Assert.True(residualReader.IsObject);
+            // Residual is a variant object holding the unshredded field(s).
+            VariantValue residualValue = residualReader.ToVariantValue();
+            Assert.True(residualValue.IsObject);
+            Assert.Contains("d", residualValue.AsObject().Keys);
+        }
+
+        [SkippableFact]
+        public void Object_MissingField_IsDetectable()
+        {
+            // case-132: typed struct but typed_value for `a` is null at row 0.
+            // Expected variant has only `b`; field `a` is missing.
+            VariantArray array = LoadCase("case-132");
+            ShreddedObject obj = array.GetShreddedVariant(0).GetObject();
+
+            Assert.True(obj.TryGetField("a", out ShreddedVariant aField));
+            Assert.True(aField.IsMissing);
+
+            Assert.True(obj.TryGetField("b", out ShreddedVariant bField));
+            Assert.False(bField.IsMissing);
+            Assert.Equal("iceberg", bField.GetString());
+        }
+
+        // ---------------------------------------------------------------
+        // Array traversal
+        // ---------------------------------------------------------------
+
+        [SkippableFact]
+        public void GetArray_Shredded_IteratesElements()
+        {
+            // case-001: shredded array of strings [comedy, drama]
+            VariantArray array = LoadCase("case-001");
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+
+            Assert.Equal(ShredType.Array, slot.Schema.TypedValueType);
+            ShreddedArray arr = slot.GetArray();
+            Assert.True(arr.IsTypedList);
+            Assert.Equal(2, arr.ElementCount);
+
+            Assert.Equal("comedy", arr.GetElement(0).GetString());
+            Assert.Equal("drama", arr.GetElement(1).GetString());
+        }
+
+        [SkippableFact]
+        public void GetArray_Empty()
+        {
+            // case-002: empty array
+            VariantArray array = LoadCase("case-002");
+            ShreddedArray arr = array.GetShreddedVariant(0).GetArray();
+            Assert.True(arr.IsTypedList);
+            Assert.Equal(0, arr.ElementCount);
+        }
+
+        [SkippableFact]
+        public void GetArray_ElementAccessMatches_Materialization()
+        {
+            VariantArray array = LoadCase("case-001");
+            ShreddedArray arr = array.GetShreddedVariant(0).GetArray();
+            // Cross-check that per-element typed access agrees with whole-array materialization.
+            VariantValue materialized = array.GetLogicalVariantValue(0);
+            Assert.True(materialized.IsArray);
+            var elements = materialized.AsArray();
+            Assert.Equal(elements.Count, arr.ElementCount);
+            for (int i = 0; i < arr.ElementCount; i++)
+            {
+                Assert.Equal(elements[i].AsString(), arr.GetElement(i).GetString());
+            }
+        }
+
+        [Fact]
+        public void GetArray_BothNull_MaterializesAsVariantNull()
+        {
+            // A row where both the residual binary and the typed list are null
+            // encodes a variant null — same convention as ShreddedObject and
+            // ShreddedVariant. Direct GetArray().ToVariantValue() must agree.
+            VariantArray array = BuildArrayShreddedColumnWithNullRow();
+            ShreddedVariant slot = array.GetShreddedVariant(0);
+
+            Assert.Equal(ShredType.Array, slot.Schema.TypedValueType);
+            Assert.False(slot.HasResidual);
+            Assert.False(slot.HasTypedValue);
+
+            ShreddedArray arr = slot.GetArray();
+            Assert.False(arr.IsTypedList);
+            Assert.False(arr.TryGetResidualReader(out _));
+            Assert.Equal(VariantValue.Null, arr.ToVariantValue());
+
+            // The slot-level entry-point path (which short-circuits on IsMissing)
+            // also returns variant null — keeping both APIs consistent.
+            Assert.Equal(VariantValue.Null, slot.ToVariantValue());
+        }
+
+        /// <summary>
+        /// Builds a one-row shredded VariantArray with schema Array&lt;Int32&gt;
+        /// where row 0 has both value and typed_value set to null.
+        /// </summary>
+        private static VariantArray BuildArrayShreddedColumnWithNullRow()
+        {
+            byte[] emptyMetadata = new VariantMetadataBuilder().Build();
+            BinaryArray metadataArr = new BinaryArray.Builder().Append(emptyMetadata.AsSpan()).Build();
+            BinaryArray valueArr = new BinaryArray.Builder().AppendNull().Build();
+
+            // typed_value is list<struct<value: binary, typed_value: int32>>.
+            StructType elementGroupType = new StructType(new List<Field>
+            {
+                new Field("value", BinaryType.Default, true),
+                new Field("typed_value", Int32Type.Default, true),
+            });
+            // Empty inner struct (length 0) — the list row is null so no elements are referenced.
+            StructArray emptyElementGroup = new StructArray(
+                elementGroupType, length: 0,
+                new IArrowArray[]
+                {
+                    new BinaryArray.Builder().Build(),
+                    new Int32Array.Builder().Build(),
+                },
+                ArrowBuffer.Empty, nullCount: 0);
+
+            ListType listType = new ListType(new Field("element", elementGroupType, true));
+            ArrowBuffer offsetsBuffer = new ArrowBuffer.Builder<int>().Append(0).Append(0).Build();
+            ArrowBuffer listValidity = new ArrowBuffer.BitmapBuilder().Append(false).Build();
+            ListArray typedValueList = new ListArray(
+                listType, length: 1, offsetsBuffer, emptyElementGroup, listValidity, nullCount: 1);
+
+            StructType storageType = new StructType(new List<Field>
+            {
+                new Field("metadata", BinaryType.Default, false),
+                new Field("value", BinaryType.Default, true),
+                new Field("typed_value", listType, true),
+            });
+            StructArray storage = new StructArray(
+                storageType, length: 1,
+                new IArrowArray[] { metadataArr, valueArr, typedValueList },
+                ArrowBuffer.Empty, nullCount: 0);
+            return new VariantArray(storage);
+        }
+
+        // ---------------------------------------------------------------
+        // Decimal32 / Decimal64 typed_value: construct the Arrow struct
+        // directly (the Iceberg corpus only exercises Decimal128Type).
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void GetDecimal_BackedByDecimal32Array()
+        {
+            // struct<metadata, value, typed_value: decimal32(5, 2)>
+            decimal expected = 123.45m;
+            VariantArray array = BuildShreddedColumn(
+                new Decimal32Type(5, 2),
+                new Decimal32Array.Builder(new Decimal32Type(5, 2)).Append(expected).Build());
+
+            Assert.Equal(ShredType.Decimal4, array.GetShredSchema().TypedValueType);
+            Assert.Equal(expected, array.GetShreddedVariant(0).GetDecimal());
+        }
+
+        [Fact]
+        public void GetDecimal_BackedByDecimal64Array()
+        {
+            // struct<metadata, value, typed_value: decimal64(18, 9)>
+            decimal expected = 987654321.123456789m;
+            VariantArray array = BuildShreddedColumn(
+                new Decimal64Type(18, 9),
+                new Decimal64Array.Builder(new Decimal64Type(18, 9)).Append(expected).Build());
+
+            Assert.Equal(ShredType.Decimal8, array.GetShredSchema().TypedValueType);
+            Assert.Equal(expected, array.GetShreddedVariant(0).GetDecimal());
+        }
+
+        [Fact]
+        public void GetDecimal_BackedByDecimal32Array_MaterializesCorrectly()
+        {
+            // End-to-end: via GetLogicalVariantValue.
+            decimal expected = 42.50m;
+            VariantArray array = BuildShreddedColumn(
+                new Decimal32Type(4, 2),
+                new Decimal32Array.Builder(new Decimal32Type(4, 2)).Append(expected).Build());
+
+            VariantValue v = array.GetLogicalVariantValue(0);
+            Assert.Equal(expected, v.AsDecimal());
+        }
+
+        [Fact]
+        public void GetSqlDecimal_BackedByDecimal128Array_ExceedingSystemDecimalRange()
+        {
+            // Decimal16 value larger than System.Decimal (max ~7.9228e28). Precision 38,
+            // scale 0, value 10^38 - 1 fits in SqlDecimal/Decimal128 but overflows decimal.
+            System.Data.SqlTypes.SqlDecimal expected =
+                System.Data.SqlTypes.SqlDecimal.Parse("99999999999999999999999999999999999999");
+            Decimal128Type type = new Decimal128Type(38, 0);
+            VariantArray array = BuildShreddedColumn(
+                type,
+                new Decimal128Array.Builder(type).Append(expected).Build());
+
+            Assert.Equal(ShredType.Decimal16, array.GetShredSchema().TypedValueType);
+
+            // Typed accessor: SqlDecimal path preserves full precision.
+            Assert.Equal(expected, array.GetShreddedVariant(0).GetSqlDecimal());
+
+            // GetDecimal() overflows System.Decimal for this value. ShreddedVariant
+            // is a ref struct, so it cannot be captured by the Throws lambda —
+            // the call must be made on a fresh slot inside the delegate.
+            Assert.Throws<OverflowException>(() => array.GetShreddedVariant(0).GetDecimal()); // NOTE(review): type argument lost in extraction; OverflowException presumed — confirm
+
+            // Materialization must not throw: ReadTypedPrimitive dispatches the
+            // Decimal16 case through GetSqlDecimal / FromSqlDecimal, so the value
+            // is retained with SqlDecimal storage inside the VariantValue.
+            VariantValue v = array.GetLogicalVariantValue(0);
+            Assert.Equal(expected, v.AsSqlDecimal());
+            Assert.Throws<OverflowException>(() => v.AsDecimal()); // NOTE(review): type argument lost in extraction; OverflowException presumed — confirm
+        }
+
+        /// <summary>
+        /// Helper: builds a one-row shredded VariantArray whose typed_value column
+        /// is <paramref name="typedArray"/> (of type <paramref name="typedType"/>),
+        /// with empty metadata and null value.
+        /// </summary>
+        private static VariantArray BuildShreddedColumn(IArrowType typedType, IArrowArray typedArray)
+        {
+            byte[] emptyMetadata = new VariantMetadataBuilder().Build();
+            BinaryArray metadataArr = new BinaryArray.Builder().Append(emptyMetadata.AsSpan()).Build();
+            BinaryArray valueArr = new BinaryArray.Builder().AppendNull().Build();
+
+            StructType storageType = new StructType(new List<Field>
+            {
+                new Field("metadata", BinaryType.Default, false),
+                new Field("value", BinaryType.Default, true),
+                new Field("typed_value", typedType, true),
+            });
+            StructArray storage = new StructArray(
+                storageType, length: 1,
+                new IArrowArray[] { metadataArr, valueArr, typedArray },
+                ArrowBuffer.Empty, nullCount: 0);
+            return new VariantArray(storage);
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/VariantShredderTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/VariantShredderTests.cs
new file mode 100644
index 00000000..ae33d1a6
--- /dev/null
+++ b/test/Apache.Arrow.Operations.Tests/Shredding/VariantShredderTests.cs
@@ -0,0 +1,496 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using System;
+using System.Collections.Generic;
+using Apache.Arrow.Operations.Shredding;
+using Apache.Arrow.Scalars.Variant;
+using Xunit;
+
+namespace Apache.Arrow.Operations.Tests.Shredding
+{
+    public class VariantShredderTests
+    {
+        private static ShredResult ShredOne(VariantValue value, ShredSchema schema)
+        {
+            (_, IReadOnlyList<ShredResult> rows) = VariantShredder.Shred(new[] { value }, schema);
+            return rows[0];
+        }
+
+        // ---------------------------------------------------------------
+        // Column-level metadata is shared, not per-row framed
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_EmptyColumn_ProducesEmptyMetadata()
+        {
+            (byte[] metadata, IReadOnlyList<ShredResult> rows) = VariantShredder.Shred(
+                System.Array.Empty<VariantValue>(),
+                ShredSchema.Primitive(ShredType.Int32));
+
+            Assert.NotNull(metadata);
+            Assert.Empty(rows);
+        }
+
+        [Fact]
+        public void Shred_Column_ReturnsSharedMetadata()
+        {
+            // Two rows with overlapping field names should produce a single
+            // metadata dictionary containing all unique names.
+            List<VariantValue> values = new List<VariantValue>
+            {
+                VariantValue.FromObject(new Dictionary<string, VariantValue>
+                {
+                    { "name", VariantValue.FromString("Alice") },
+                    { "age", VariantValue.FromInt32(30) },
+                }),
+                VariantValue.FromObject(new Dictionary<string, VariantValue>
+                {
+                    { "name", VariantValue.FromString("Bob") },
+                    { "city", VariantValue.FromString("NYC") },
+                }),
+            };
+
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "name", ShredSchema.Primitive(ShredType.String) },
+            });
+
+            (byte[] metadata, IReadOnlyList<ShredResult> rows) = VariantShredder.Shred(values, schema);
+
+            // Metadata dictionary should contain at least "age" and "city" (the residual fields).
+            VariantMetadata meta = new VariantMetadata(metadata);
+            HashSet<string> names = new HashSet<string>();
+            for (int i = 0; i < meta.DictionarySize; i++)
+            {
+                names.Add(meta.GetString(i));
+            }
+            Assert.Contains("age", names);
+            Assert.Contains("city", names);
+
+            Assert.Equal(2, rows.Count);
+        }
+
+        // ---------------------------------------------------------------
+        // Unshredded (ShredType.None)
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_Unshredded_EncodesAsBinary()
+        {
+            VariantValue value = VariantValue.FromInt32(42);
+            ShredResult result = ShredOne(value, ShredSchema.Unshredded());
+
+            Assert.NotNull(result.Value);
+            Assert.Null(result.TypedValue);
+            Assert.False(result.IsMissing);
+        }
+
+        // ---------------------------------------------------------------
+        // Primitive shredding
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_Boolean_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.True, ShredSchema.Primitive(ShredType.Boolean));
+
+            Assert.Null(result.Value);
+            Assert.NotNull(result.TypedValue);
+            Assert.Equal(true, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Boolean_False()
+        {
+            ShredResult result = ShredOne(VariantValue.False, ShredSchema.Primitive(ShredType.Boolean));
+
+            Assert.Null(result.Value);
+            Assert.Equal(false, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Int32_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromInt32(42), ShredSchema.Primitive(ShredType.Int32));
+
+            Assert.Null(result.Value);
+            Assert.Equal(42, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Int64_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromInt64(long.MaxValue), ShredSchema.Primitive(ShredType.Int64));
+
+            Assert.Null(result.Value);
+            Assert.Equal(long.MaxValue, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Float_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromFloat(3.14f), ShredSchema.Primitive(ShredType.Float));
+
+            Assert.Null(result.Value);
+            Assert.Equal(3.14f, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Double_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromDouble(Math.PI), ShredSchema.Primitive(ShredType.Double));
+
+            Assert.Null(result.Value);
+            Assert.Equal(Math.PI, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_String_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromString("hello"), ShredSchema.Primitive(ShredType.String));
+
+            Assert.Null(result.Value);
+            Assert.Equal("hello", result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Binary_MatchingType()
+        {
+            byte[] data = new byte[] { 1, 2, 3 };
+            ShredResult result = ShredOne(VariantValue.FromBinary(data), ShredSchema.Primitive(ShredType.Binary));
+
+            Assert.Null(result.Value);
+            Assert.Equal(data, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Uuid_MatchingType()
+        {
+            Guid guid = Guid.NewGuid();
+            ShredResult result = ShredOne(VariantValue.FromUuid(guid), ShredSchema.Primitive(ShredType.Uuid));
+
+            Assert.Null(result.Value);
+            Assert.Equal(guid, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Date_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromDate(19000), ShredSchema.Primitive(ShredType.Date));
+
+            Assert.Null(result.Value);
+            Assert.Equal(19000, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Decimal4_MatchingType()
+        {
+            ShredResult result = ShredOne(VariantValue.FromDecimal4(99.99m), ShredSchema.Primitive(ShredType.Decimal4));
+
+            Assert.Null(result.Value);
+            Assert.Equal(99.99m, result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_TypeMismatch_FallsBackToBinary()
+        {
+            // Schema expects Int32, but value is a string.
+            ShredResult result = ShredOne(VariantValue.FromString("hello"), ShredSchema.Primitive(ShredType.Int32));
+
+            Assert.NotNull(result.Value);
+            Assert.Null(result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_NullVariant_FallsBackToBinary()
+        {
+            // Variant null doesn't match any primitive shred type.
+            ShredResult result = ShredOne(VariantValue.Null, ShredSchema.Primitive(ShredType.Int32));
+
+            Assert.NotNull(result.Value);
+            Assert.Null(result.TypedValue);
+        }
+
+        // ---------------------------------------------------------------
+        // Object shredding
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_Object_FullyShredded()
+        {
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "name", VariantValue.FromString("Alice") },
+                { "age", VariantValue.FromInt32(30) },
+            });
+
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "name", ShredSchema.Primitive(ShredType.String) },
+                { "age", ShredSchema.Primitive(ShredType.Int32) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            // Fully shredded: value is null.
+            Assert.Null(result.Value);
+            Assert.NotNull(result.TypedValue);
+
+            ShredObjectResult objectResult = (ShredObjectResult)result.TypedValue;
+            Assert.Equal(2, objectResult.Fields.Count);
+
+            Assert.Null(objectResult.Fields["name"].Value);
+            Assert.Equal("Alice", objectResult.Fields["name"].TypedValue);
+
+            Assert.Null(objectResult.Fields["age"].Value);
+            Assert.Equal(30, objectResult.Fields["age"].TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Object_PartiallyShredded()
+        {
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "name", VariantValue.FromString("Alice") },
+                { "age", VariantValue.FromInt32(30) },
+                { "extra", VariantValue.True },
+            });
+
+            // Schema only shreds "name" — "age" and "extra" go to residual.
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "name", ShredSchema.Primitive(ShredType.String) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            // Partially shredded: both value and typed_value are non-null.
+            Assert.NotNull(result.Value);
+            Assert.NotNull(result.TypedValue);
+
+            ShredObjectResult objectResult = (ShredObjectResult)result.TypedValue;
+            Assert.Equal("Alice", objectResult.Fields["name"].TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Object_MissingField()
+        {
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "name", VariantValue.FromString("Alice") },
+            });
+
+            // Schema expects "name" and "age", but "age" is missing.
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "name", ShredSchema.Primitive(ShredType.String) },
+                { "age", ShredSchema.Primitive(ShredType.Int32) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            ShredObjectResult objectResult = (ShredObjectResult)result.TypedValue;
+            Assert.True(objectResult.Fields["age"].IsMissing);
+        }
+
+        [Fact]
+        public void Shred_NotObject_WithObjectSchema_FallsBackToBinary()
+        {
+            VariantValue value = VariantValue.FromInt32(42);
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "x", ShredSchema.Primitive(ShredType.Int32) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            Assert.NotNull(result.Value);
+            Assert.Null(result.TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Object_FieldTypeMismatch()
+        {
+            // Field "age" is a string but schema expects Int32.
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "age", VariantValue.FromString("thirty") },
+            });
+
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "age", ShredSchema.Primitive(ShredType.Int32) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+            ShredObjectResult objectResult = (ShredObjectResult)result.TypedValue;
+
+            // Field falls back to binary within the shredded object.
+            Assert.NotNull(objectResult.Fields["age"].Value);
+            Assert.Null(objectResult.Fields["age"].TypedValue);
+        }
+
+        // ---------------------------------------------------------------
+        // Array shredding
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_Array_AllMatchingType()
+        {
+            VariantValue value = VariantValue.FromArray(
+                VariantValue.FromInt32(1),
+                VariantValue.FromInt32(2),
+                VariantValue.FromInt32(3));
+
+            ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32));
+
+            ShredResult result = ShredOne(value, schema);
+
+            Assert.Null(result.Value);
+            Assert.NotNull(result.TypedValue);
+
+            ShredArrayResult arrayResult = (ShredArrayResult)result.TypedValue;
+            Assert.Equal(3, arrayResult.Elements.Count);
+            Assert.Equal(1, arrayResult.Elements[0].TypedValue);
+            Assert.Equal(2, arrayResult.Elements[1].TypedValue);
+            Assert.Equal(3, arrayResult.Elements[2].TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Array_MixedTypes()
+        {
+            VariantValue value = VariantValue.FromArray(
+                VariantValue.FromInt32(1),
+                VariantValue.FromString("two"),
+                VariantValue.FromInt32(3));
+
+            ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32));
+
+            ShredResult result = ShredOne(value, schema);
+            ShredArrayResult arrayResult = (ShredArrayResult)result.TypedValue;
+
+            // Element 0: matches, goes to typed.
+            Assert.Null(arrayResult.Elements[0].Value);
+            Assert.Equal(1, arrayResult.Elements[0].TypedValue);
+
+            // Element 1: doesn't match, goes to binary.
+            Assert.NotNull(arrayResult.Elements[1].Value);
+            Assert.Null(arrayResult.Elements[1].TypedValue);
+
+            // Element 2: matches.
+            Assert.Null(arrayResult.Elements[2].Value);
+            Assert.Equal(3, arrayResult.Elements[2].TypedValue);
+        }
+
+        [Fact]
+        public void Shred_Array_NullElement_FallsToBinary()
+        {
+            // Variant null in an array goes to binary (it doesn't match Int32).
+            VariantValue value = VariantValue.FromArray(
+                VariantValue.FromInt32(1),
+                VariantValue.Null,
+                VariantValue.FromInt32(3));
+
+            ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32));
+
+            ShredResult result = ShredOne(value, schema);
+            ShredArrayResult arrayResult = (ShredArrayResult)result.TypedValue;
+
+            // Null element falls back to binary (not missing — arrays can't have missing).
+            Assert.NotNull(arrayResult.Elements[1].Value);
+            Assert.Null(arrayResult.Elements[1].TypedValue);
+            Assert.False(arrayResult.Elements[1].IsMissing);
+        }
+
+        [Fact]
+        public void Shred_NotArray_WithArraySchema_FallsBackToBinary()
+        {
+            VariantValue value = VariantValue.FromInt32(42);
+            ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32));
+
+            ShredResult result = ShredOne(value, schema);
+
+            Assert.NotNull(result.Value);
+            Assert.Null(result.TypedValue);
+        }
+
+        // ---------------------------------------------------------------
+        // Nested shredding
+        // ---------------------------------------------------------------
+
+        [Fact]
+        public void Shred_NestedObject()
+        {
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "user", VariantValue.FromObject(new Dictionary<string, VariantValue>
+                    {
+                        { "name", VariantValue.FromString("Alice") },
+                        { "age", VariantValue.FromInt32(30) },
+                    })
+                },
+            });
+
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "user", ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+                    {
+                        { "name", ShredSchema.Primitive(ShredType.String) },
+                        { "age", ShredSchema.Primitive(ShredType.Int32) },
+                    })
+                },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            Assert.Null(result.Value);
+            ShredObjectResult outer = (ShredObjectResult)result.TypedValue;
+
+            ShredResult userResult = outer.Fields["user"];
+            Assert.Null(userResult.Value);
+            ShredObjectResult userObj = (ShredObjectResult)userResult.TypedValue;
+
+            Assert.Equal("Alice", userObj.Fields["name"].TypedValue);
+            Assert.Equal(30, userObj.Fields["age"].TypedValue);
+        }
+
+        [Fact]
+        public void Shred_ObjectWithArrayField()
+        {
+            VariantValue value = VariantValue.FromObject(new Dictionary<string, VariantValue>
+            {
+                { "scores", VariantValue.FromArray(
+                    VariantValue.FromInt32(95),
+                    VariantValue.FromInt32(87))
+                },
+            });
+
+            ShredSchema schema = ShredSchema.ForObject(new Dictionary<string, ShredSchema>
+            {
+                { "scores", ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)) },
+            });
+
+            ShredResult result = ShredOne(value, schema);
+
+            ShredObjectResult obj = (ShredObjectResult)result.TypedValue;
+            ShredResult scoresResult = obj.Fields["scores"];
+            ShredArrayResult arr = (ShredArrayResult)scoresResult.TypedValue;
+            Assert.Equal(2, arr.Elements.Count);
+            Assert.Equal(95, arr.Elements[0].TypedValue);
+            Assert.Equal(87, arr.Elements[1].TypedValue);
+        }
+    }
+}
diff --git a/test/Apache.Arrow.Operations.Tests/Shredding/VariantUnshredderTests.cs b/test/Apache.Arrow.Operations.Tests/Shredding/VariantUnshredderTests.cs
new file mode 100644
index 00000000..b1714c01
--- /dev/null
+++ b/test/Apache.Arrow.Operations.Tests/Shredding/VariantUnshredderTests.cs
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +using System; +using System.Collections.Generic; +using Apache.Arrow.Operations.Shredding; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Operations.Tests.Shredding +{ + public class VariantUnshredderTests + { + // For typed_value-only results the metadata is unused, but the unshredder + // still needs a valid empty metadata span. Build one once. + private static readonly byte[] EmptyMetadata = new VariantMetadataBuilder().Build(); + + // --------------------------------------------------------------- + // Missing values + // --------------------------------------------------------------- + + [Fact] + public void Reconstruct_Missing_ReturnsNull() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + VariantValue? result = VariantUnshredder.Reconstruct(ShredResult.Missing, schema, EmptyMetadata); + + Assert.Null(result); + } + + // --------------------------------------------------------------- + // Primitive reconstruction + // --------------------------------------------------------------- + + [Fact] + public void Reconstruct_Boolean() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Boolean); + ShredResult shredded = new ShredResult(null, true); + + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.True, result.Value); + } + + [Fact] + public void Reconstruct_Int32() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Int32); + ShredResult shredded = new ShredResult(null, 42); + + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.FromInt32(42), result.Value); + } + + [Fact] + public void Reconstruct_String() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.String); + ShredResult shredded = new ShredResult(null, "hello"); + + VariantValue? 
result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.FromString("hello"), result.Value); + } + + [Fact] + public void Reconstruct_Double() + { + ShredSchema schema = ShredSchema.Primitive(ShredType.Double); + ShredResult shredded = new ShredResult(null, Math.PI); + + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.FromDouble(Math.PI), result.Value); + } + + [Fact] + public void Reconstruct_Uuid() + { + Guid guid = Guid.NewGuid(); + ShredSchema schema = ShredSchema.Primitive(ShredType.Uuid); + ShredResult shredded = new ShredResult(null, guid); + + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.FromUuid(guid), result.Value); + } + + // --------------------------------------------------------------- + // Object reconstruction + // --------------------------------------------------------------- + + [Fact] + public void Reconstruct_FullyShreddedObject() + { + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + ShredObjectResult objectResult = new ShredObjectResult(new Dictionary + { + { "name", new ShredResult(null, "Alice") }, + { "age", new ShredResult(null, 30) }, + }); + + ShredResult shredded = new ShredResult(null, objectResult); + VariantValue? 
result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.True(result.Value.IsObject); + IReadOnlyDictionary fields = result.Value.AsObject(); + Assert.Equal(2, fields.Count); + Assert.Equal(VariantValue.FromString("Alice"), fields["name"]); + Assert.Equal(VariantValue.FromInt32(30), fields["age"]); + } + + [Fact] + public void Reconstruct_ObjectWithMissingField() + { + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + { "age", ShredSchema.Primitive(ShredType.Int32) }, + }); + + ShredObjectResult objectResult = new ShredObjectResult(new Dictionary + { + { "name", new ShredResult(null, "Alice") }, + { "age", ShredResult.Missing }, + }); + + ShredResult shredded = new ShredResult(null, objectResult); + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + IReadOnlyDictionary fields = result.Value.AsObject(); + Assert.Single(fields); + Assert.Equal(VariantValue.FromString("Alice"), fields["name"]); + } + + [Fact] + public void Reconstruct_NonObjectWithObjectSchema_DecodesFromBinary() + { + // Value was not an object, so it went to binary with typed_value=null. + VariantValue original = VariantValue.FromInt32(42); + + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "x", ShredSchema.Primitive(ShredType.Int32) }, + }); + + (byte[] metadata, IReadOnlyList rows) = VariantShredder.Shred(new[] { original }, schema); + VariantValue? 
result = VariantUnshredder.Reconstruct(rows[0], schema, metadata); + + Assert.True(result.HasValue); + Assert.Equal(VariantValue.FromInt32(42), result.Value); + } + + // --------------------------------------------------------------- + // Array reconstruction + // --------------------------------------------------------------- + + [Fact] + public void Reconstruct_Array() + { + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + + ShredArrayResult arrayResult = new ShredArrayResult(new List + { + new ShredResult(null, 1), + new ShredResult(null, 2), + new ShredResult(null, 3), + }); + + ShredResult shredded = new ShredResult(null, arrayResult); + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + Assert.True(result.Value.IsArray); + IReadOnlyList elements = result.Value.AsArray(); + Assert.Equal(3, elements.Count); + Assert.Equal(VariantValue.FromInt32(1), elements[0]); + Assert.Equal(VariantValue.FromInt32(2), elements[1]); + Assert.Equal(VariantValue.FromInt32(3), elements[2]); + } + + [Fact] + public void Reconstruct_ArrayWithMixedElements() + { + // Element 1 didn't match the shred type, so it's in binary. + VariantValue original = VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromString("two"), + VariantValue.FromInt32(3)); + + ShredSchema schema = ShredSchema.ForArray(ShredSchema.Primitive(ShredType.Int32)); + + (byte[] metadata, IReadOnlyList rows) = VariantShredder.Shred(new[] { original }, schema); + VariantValue? 
result = VariantUnshredder.Reconstruct(rows[0], schema, metadata); + + Assert.True(result.HasValue); + IReadOnlyList elements = result.Value.AsArray(); + Assert.Equal(3, elements.Count); + Assert.Equal(VariantValue.FromInt32(1), elements[0]); + Assert.Equal(VariantValue.FromString("two"), elements[1]); + Assert.Equal(VariantValue.FromInt32(3), elements[2]); + } + + // --------------------------------------------------------------- + // Nested reconstruction + // --------------------------------------------------------------- + + [Fact] + public void Reconstruct_NestedObject() + { + ShredSchema schema = ShredSchema.ForObject(new Dictionary + { + { "user", ShredSchema.ForObject(new Dictionary + { + { "name", ShredSchema.Primitive(ShredType.String) }, + }) + }, + }); + + ShredObjectResult innerObj = new ShredObjectResult(new Dictionary + { + { "name", new ShredResult(null, "Alice") }, + }); + + ShredObjectResult outerObj = new ShredObjectResult(new Dictionary + { + { "user", new ShredResult(null, innerObj) }, + }); + + ShredResult shredded = new ShredResult(null, outerObj); + VariantValue? result = VariantUnshredder.Reconstruct(shredded, schema, EmptyMetadata); + + Assert.True(result.HasValue); + IReadOnlyDictionary outer = result.Value.AsObject(); + IReadOnlyDictionary inner = outer["user"].AsObject(); + Assert.Equal(VariantValue.FromString("Alice"), inner["name"]); + } + } +} diff --git a/test/Apache.Arrow.Scalars.Tests/VariantValueWriterCopyValueTests.cs b/test/Apache.Arrow.Scalars.Tests/VariantValueWriterCopyValueTests.cs new file mode 100644 index 00000000..24672981 --- /dev/null +++ b/test/Apache.Arrow.Scalars.Tests/VariantValueWriterCopyValueTests.cs @@ -0,0 +1,413 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +using System; +using System.Collections.Generic; +using System.Data.SqlTypes; +using Apache.Arrow.Scalars.Variant; +using Xunit; + +namespace Apache.Arrow.Scalars.Tests +{ + /// + /// Tests for and + /// — the transcoder + /// that walks a and re-emits it into a writer + /// with a (potentially different) metadata dictionary. + /// + public class VariantValueWriterCopyValueTests + { + private static readonly VariantBuilder _encoder = new VariantBuilder(); + + /// + /// Encodes a value, then transcodes it through CopyValue into a fresh writer + /// whose metadata is collected via CollectFieldNames. Returns the transcoded + /// VariantValue for equality comparison. 
+ /// + private static VariantValue Transcode(VariantValue original) + { + (byte[] srcMetadata, byte[] srcValue) = _encoder.Encode(original); + VariantReader srcReader = new VariantReader(srcMetadata, srcValue); + + VariantMetadataBuilder dstMetadata = new VariantMetadataBuilder(); + dstMetadata.CollectFieldNames(srcReader); + byte[] dstMetadataBytes = dstMetadata.Build(out int[] idRemap); + + using VariantValueWriter writer = new VariantValueWriter(dstMetadata, idRemap); + writer.CopyValue(srcReader); + byte[] dstValue = writer.ToArray(); + + return new VariantReader(dstMetadataBytes, dstValue).ToVariantValue(); + } + + // --------------------------------------------------------------- + // Primitives + // --------------------------------------------------------------- + + [Fact] + public void CopyValue_Null() => + Assert.Equal(VariantValue.Null, Transcode(VariantValue.Null)); + + [Fact] + public void CopyValue_BooleanTrue() => + Assert.Equal(VariantValue.True, Transcode(VariantValue.True)); + + [Fact] + public void CopyValue_BooleanFalse() => + Assert.Equal(VariantValue.False, Transcode(VariantValue.False)); + + [Theory] + [InlineData(sbyte.MinValue)] + [InlineData((sbyte)-1)] + [InlineData((sbyte)0)] + [InlineData(sbyte.MaxValue)] + public void CopyValue_Int8(sbyte v) => + Assert.Equal(VariantValue.FromInt8(v), Transcode(VariantValue.FromInt8(v))); + + [Fact] + public void CopyValue_Int16() => + Assert.Equal(VariantValue.FromInt16(short.MaxValue), Transcode(VariantValue.FromInt16(short.MaxValue))); + + [Fact] + public void CopyValue_Int32() => + Assert.Equal(VariantValue.FromInt32(int.MinValue), Transcode(VariantValue.FromInt32(int.MinValue))); + + [Fact] + public void CopyValue_Int64() => + Assert.Equal(VariantValue.FromInt64(long.MaxValue), Transcode(VariantValue.FromInt64(long.MaxValue))); + + [Fact] + public void CopyValue_Float() => + Assert.Equal(VariantValue.FromFloat(3.14f), Transcode(VariantValue.FromFloat(3.14f))); + + [Fact] + public void 
CopyValue_Double() => + Assert.Equal(VariantValue.FromDouble(Math.PI), Transcode(VariantValue.FromDouble(Math.PI))); + + [Fact] + public void CopyValue_Decimal4() => + Assert.Equal(VariantValue.FromDecimal4(123.45m), Transcode(VariantValue.FromDecimal4(123.45m))); + + [Fact] + public void CopyValue_Decimal8() + { + // Must fit in 64-bit unscaled (precision ≤ 18). 17 significant digits. + VariantValue v = VariantValue.FromDecimal8(987654321.12345678m); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_Decimal16() => + Assert.Equal(VariantValue.FromDecimal16(99999999.99m), Transcode(VariantValue.FromDecimal16(99999999.99m))); + + [Fact] + public void CopyValue_Decimal16_SqlDecimalRange() + { + // Exceeds System.Decimal range — should route through SqlDecimal internally. + SqlDecimal large = SqlDecimal.Parse("12345678901234567890123456789012345678"); + VariantValue original = VariantValue.FromSqlDecimal(large); + VariantValue roundTripped = Transcode(original); + Assert.Equal(original, roundTripped); + } + + [Fact] + public void CopyValue_Date() => + Assert.Equal(VariantValue.FromDate(19000), Transcode(VariantValue.FromDate(19000))); + + [Fact] + public void CopyValue_Timestamp() => + Assert.Equal(VariantValue.FromTimestamp(1640995200000000L), Transcode(VariantValue.FromTimestamp(1640995200000000L))); + + [Fact] + public void CopyValue_TimestampNtz() => + Assert.Equal(VariantValue.FromTimestampNtz(1640995200000000L), Transcode(VariantValue.FromTimestampNtz(1640995200000000L))); + + [Fact] + public void CopyValue_TimeNtz() => + Assert.Equal(VariantValue.FromTimeNtz(123456789L), Transcode(VariantValue.FromTimeNtz(123456789L))); + + [Fact] + public void CopyValue_TimestampTzNanos() => + Assert.Equal(VariantValue.FromTimestampTzNanos(1700000000_123456789L), Transcode(VariantValue.FromTimestampTzNanos(1700000000_123456789L))); + + [Fact] + public void CopyValue_TimestampNtzNanos() => + 
Assert.Equal(VariantValue.FromTimestampNtzNanos(1700000000_123456789L), Transcode(VariantValue.FromTimestampNtzNanos(1700000000_123456789L))); + + [Fact] + public void CopyValue_ShortString() => + // 5-byte string triggers the short-string encoding (≤ 63 bytes). + Assert.Equal(VariantValue.FromString("hello"), Transcode(VariantValue.FromString("hello"))); + + [Fact] + public void CopyValue_LongString_PrimitiveEncoding() + { + // 64+ bytes forces the long-string primitive path. + string longString = new string('x', 100); + VariantValue v = VariantValue.FromString(longString); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_EmptyString() => + Assert.Equal(VariantValue.FromString(""), Transcode(VariantValue.FromString(""))); + + [Fact] + public void CopyValue_Binary() => + Assert.Equal( + VariantValue.FromBinary(new byte[] { 0, 1, 2, 255 }), + Transcode(VariantValue.FromBinary(new byte[] { 0, 1, 2, 255 }))); + + [Fact] + public void CopyValue_Uuid() + { + Guid g = Guid.NewGuid(); + Assert.Equal(VariantValue.FromUuid(g), Transcode(VariantValue.FromUuid(g))); + } + + // --------------------------------------------------------------- + // Containers + // --------------------------------------------------------------- + + [Fact] + public void CopyValue_EmptyObject() => + Assert.Equal( + VariantValue.FromObject(new Dictionary()), + Transcode(VariantValue.FromObject(new Dictionary()))); + + [Fact] + public void CopyValue_EmptyArray() => + Assert.Equal( + VariantValue.FromArray(new List()), + Transcode(VariantValue.FromArray(new List()))); + + [Fact] + public void CopyValue_Object_FlatFields() + { + VariantValue v = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("alice") }, + { "age", VariantValue.FromInt32(30) }, + { "active", VariantValue.True }, + }); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_ArrayOfPrimitives() + { + VariantValue v = VariantValue.FromArray( + VariantValue.FromInt32(1), 
+ VariantValue.FromInt32(2), + VariantValue.FromInt32(3)); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_NestedObjectInArray() + { + VariantValue v = VariantValue.FromArray( + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + }), + VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Bob") }, + })); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_DeeplyNested() + { + VariantValue v = VariantValue.FromObject(new Dictionary + { + { "users", VariantValue.FromArray( + VariantValue.FromObject(new Dictionary + { + { "id", VariantValue.FromInt32(1) }, + { "tags", VariantValue.FromArray( + VariantValue.FromString("admin"), + VariantValue.FromString("beta")) + }, + }), + VariantValue.FromObject(new Dictionary + { + { "id", VariantValue.FromInt32(2) }, + { "tags", VariantValue.FromArray(VariantValue.FromString("user")) }, + })) + }, + { "count", VariantValue.FromInt32(2) }, + }); + Assert.Equal(v, Transcode(v)); + } + + [Fact] + public void CopyValue_MixedArray() + { + VariantValue v = VariantValue.FromArray( + VariantValue.FromInt32(1), + VariantValue.FromString("two"), + VariantValue.Null, + VariantValue.True, + VariantValue.FromArray(VariantValue.FromInt32(4), VariantValue.FromInt32(5))); + Assert.Equal(v, Transcode(v)); + } + + // --------------------------------------------------------------- + // Field-ID remap: transcoding between distinct metadata dictionaries. + // --------------------------------------------------------------- + + [Fact] + public void CopyValue_RemapsFieldIds_WhenTargetMetadataIsSuperset() + { + // Source has fields {"age", "name"} — sorted metadata assigns IDs based + // on byte order (age < name lexicographically). 
+ VariantValue source = VariantValue.FromObject(new Dictionary + { + { "name", VariantValue.FromString("Alice") }, + { "age", VariantValue.FromInt32(30) }, + }); + + (byte[] srcMetadata, byte[] srcValue) = _encoder.Encode(source); + VariantReader srcReader = new VariantReader(srcMetadata, srcValue); + + // Build target metadata that contains extra names the source doesn't use. + // This forces the field IDs to differ between source and target. + VariantMetadataBuilder dstMetadata = new VariantMetadataBuilder(); + dstMetadata.Add("zzz-decoy-1"); + dstMetadata.Add("aaa-decoy-2"); // sorts before "age" + dstMetadata.CollectFieldNames(srcReader); + dstMetadata.Add("mmm-decoy-3"); + + byte[] dstMetadataBytes = dstMetadata.Build(out int[] idRemap); + using VariantValueWriter writer = new VariantValueWriter(dstMetadata, idRemap); + writer.CopyValue(srcReader); + byte[] dstValue = writer.ToArray(); + + // Reading back through the target metadata should yield an equivalent + // VariantValue even though the field IDs are numerically different. + VariantValue reconstructed = new VariantReader(dstMetadataBytes, dstValue).ToVariantValue(); + Assert.Equal(source, reconstructed); + } + + [Fact] + public void CopyValue_ThrowsIfFieldNameNotInTargetMetadata() + { + VariantValue source = VariantValue.FromObject(new Dictionary + { + { "unknown-field", VariantValue.FromInt32(42) }, + }); + + (byte[] srcMetadata, byte[] srcValue) = _encoder.Encode(source); + + // Deliberately skip CollectFieldNames — "unknown-field" is not in dst metadata. + VariantMetadataBuilder dstMetadata = new VariantMetadataBuilder(); + byte[] _ = dstMetadata.Build(out int[] idRemap); + using VariantValueWriter writer = new VariantValueWriter(dstMetadata, idRemap); + + // Ref structs (VariantReader) can't be captured by lambdas; reconstruct inside. 
+ Assert.Throws(() => + writer.CopyValue(new VariantReader(srcMetadata, srcValue))); + } + + // --------------------------------------------------------------- + // CollectFieldNames — idempotence + coverage. + // --------------------------------------------------------------- + + [Fact] + public void CollectFieldNames_AccumulatesNamesFromNestedSources() + { + VariantValue v = VariantValue.FromObject(new Dictionary + { + { "outer1", VariantValue.FromObject(new Dictionary + { + { "inner1", VariantValue.FromInt32(1) }, + { "inner2", VariantValue.FromInt32(2) }, + }) + }, + { "outer2", VariantValue.FromArray( + VariantValue.FromObject(new Dictionary + { + { "inner3", VariantValue.FromInt32(3) }, + })) + }, + }); + + (byte[] metadata, byte[] value) = _encoder.Encode(v); + VariantReader reader = new VariantReader(metadata, value); + + VariantMetadataBuilder dst = new VariantMetadataBuilder(); + dst.CollectFieldNames(reader); + + // All five distinct field names should be present. + Assert.Equal(5, dst.Count); + Assert.Equal(0, dst.GetId("outer1")); // insertion order (pre-sort) + // Exact insertion IDs matter less than "every name is addable"; verify via re-add: + int before = dst.Count; + dst.Add("outer1"); // duplicate — no change + Assert.Equal(before, dst.Count); + } + + [Fact] + public void CollectFieldNames_PrimitiveReader_IsNoOp() + { + (byte[] metadata, byte[] value) = _encoder.Encode(VariantValue.FromInt32(42)); + VariantReader reader = new VariantReader(metadata, value); + + VariantMetadataBuilder dst = new VariantMetadataBuilder(); + dst.CollectFieldNames(reader); + Assert.Equal(0, dst.Count); + } + + // --------------------------------------------------------------- + // Merging values from multiple sources into one target. 
+ // --------------------------------------------------------------- + + [Fact] + public void CopyValue_MergesTwoObjectsIntoOneTargetDictionary() + { + VariantValue a = VariantValue.FromObject(new Dictionary + { + { "alpha", VariantValue.FromInt32(1) }, + }); + VariantValue b = VariantValue.FromObject(new Dictionary + { + { "beta", VariantValue.FromInt32(2) }, + }); + + (byte[] aMeta, byte[] aVal) = _encoder.Encode(a); + (byte[] bMeta, byte[] bVal) = _encoder.Encode(b); + VariantReader aReader = new VariantReader(aMeta, aVal); + VariantReader bReader = new VariantReader(bMeta, bVal); + + // Single target metadata that covers both sources. + VariantMetadataBuilder dst = new VariantMetadataBuilder(); + dst.CollectFieldNames(aReader); + dst.CollectFieldNames(bReader); + byte[] dstMeta = dst.Build(out int[] remap); + + // Transcode each into its own value stream (still referencing `dst`). + using VariantValueWriter writerA = new VariantValueWriter(dst, remap); + writerA.CopyValue(aReader); + using VariantValueWriter writerB = new VariantValueWriter(dst, remap); + writerB.CopyValue(bReader); + + Assert.Equal(a, new VariantReader(dstMeta, writerA.ToArray()).ToVariantValue()); + Assert.Equal(b, new VariantReader(dstMeta, writerB.ToArray()).ToVariantValue()); + } + } +} diff --git a/test/shredded_variant_ipc/case-001.arrow b/test/shredded_variant_ipc/case-001.arrow new file mode 100644 index 00000000..65c39caf Binary files /dev/null and b/test/shredded_variant_ipc/case-001.arrow differ diff --git a/test/shredded_variant_ipc/case-002.arrow b/test/shredded_variant_ipc/case-002.arrow new file mode 100644 index 00000000..381be6a6 Binary files /dev/null and b/test/shredded_variant_ipc/case-002.arrow differ diff --git a/test/shredded_variant_ipc/case-004.arrow b/test/shredded_variant_ipc/case-004.arrow new file mode 100644 index 00000000..4626f07a Binary files /dev/null and b/test/shredded_variant_ipc/case-004.arrow differ diff --git a/test/shredded_variant_ipc/case-005.arrow 
b/test/shredded_variant_ipc/case-005.arrow new file mode 100644 index 00000000..302570aa Binary files /dev/null and b/test/shredded_variant_ipc/case-005.arrow differ diff --git a/test/shredded_variant_ipc/case-006.arrow b/test/shredded_variant_ipc/case-006.arrow new file mode 100644 index 00000000..f773dac9 Binary files /dev/null and b/test/shredded_variant_ipc/case-006.arrow differ diff --git a/test/shredded_variant_ipc/case-007.arrow b/test/shredded_variant_ipc/case-007.arrow new file mode 100644 index 00000000..8f38ba04 Binary files /dev/null and b/test/shredded_variant_ipc/case-007.arrow differ diff --git a/test/shredded_variant_ipc/case-008.arrow b/test/shredded_variant_ipc/case-008.arrow new file mode 100644 index 00000000..10a23730 Binary files /dev/null and b/test/shredded_variant_ipc/case-008.arrow differ diff --git a/test/shredded_variant_ipc/case-009.arrow b/test/shredded_variant_ipc/case-009.arrow new file mode 100644 index 00000000..19922bbf Binary files /dev/null and b/test/shredded_variant_ipc/case-009.arrow differ diff --git a/test/shredded_variant_ipc/case-010.arrow b/test/shredded_variant_ipc/case-010.arrow new file mode 100644 index 00000000..17f69428 Binary files /dev/null and b/test/shredded_variant_ipc/case-010.arrow differ diff --git a/test/shredded_variant_ipc/case-011.arrow b/test/shredded_variant_ipc/case-011.arrow new file mode 100644 index 00000000..4736beaa Binary files /dev/null and b/test/shredded_variant_ipc/case-011.arrow differ diff --git a/test/shredded_variant_ipc/case-012.arrow b/test/shredded_variant_ipc/case-012.arrow new file mode 100644 index 00000000..3c507cbf Binary files /dev/null and b/test/shredded_variant_ipc/case-012.arrow differ diff --git a/test/shredded_variant_ipc/case-013.arrow b/test/shredded_variant_ipc/case-013.arrow new file mode 100644 index 00000000..dfccfe46 Binary files /dev/null and b/test/shredded_variant_ipc/case-013.arrow differ diff --git a/test/shredded_variant_ipc/case-014.arrow 
b/test/shredded_variant_ipc/case-014.arrow new file mode 100644 index 00000000..0d12fea6 Binary files /dev/null and b/test/shredded_variant_ipc/case-014.arrow differ diff --git a/test/shredded_variant_ipc/case-015.arrow b/test/shredded_variant_ipc/case-015.arrow new file mode 100644 index 00000000..f96f50fd Binary files /dev/null and b/test/shredded_variant_ipc/case-015.arrow differ diff --git a/test/shredded_variant_ipc/case-016.arrow b/test/shredded_variant_ipc/case-016.arrow new file mode 100644 index 00000000..aaa269af Binary files /dev/null and b/test/shredded_variant_ipc/case-016.arrow differ diff --git a/test/shredded_variant_ipc/case-017.arrow b/test/shredded_variant_ipc/case-017.arrow new file mode 100644 index 00000000..9e4e3dd9 Binary files /dev/null and b/test/shredded_variant_ipc/case-017.arrow differ diff --git a/test/shredded_variant_ipc/case-018.arrow b/test/shredded_variant_ipc/case-018.arrow new file mode 100644 index 00000000..3b5d531e Binary files /dev/null and b/test/shredded_variant_ipc/case-018.arrow differ diff --git a/test/shredded_variant_ipc/case-019.arrow b/test/shredded_variant_ipc/case-019.arrow new file mode 100644 index 00000000..ebe4ef0f Binary files /dev/null and b/test/shredded_variant_ipc/case-019.arrow differ diff --git a/test/shredded_variant_ipc/case-020.arrow b/test/shredded_variant_ipc/case-020.arrow new file mode 100644 index 00000000..f7107d67 Binary files /dev/null and b/test/shredded_variant_ipc/case-020.arrow differ diff --git a/test/shredded_variant_ipc/case-021.arrow b/test/shredded_variant_ipc/case-021.arrow new file mode 100644 index 00000000..6d2fe77b Binary files /dev/null and b/test/shredded_variant_ipc/case-021.arrow differ diff --git a/test/shredded_variant_ipc/case-022.arrow b/test/shredded_variant_ipc/case-022.arrow new file mode 100644 index 00000000..72c41eef Binary files /dev/null and b/test/shredded_variant_ipc/case-022.arrow differ diff --git a/test/shredded_variant_ipc/case-023.arrow 
b/test/shredded_variant_ipc/case-023.arrow new file mode 100644 index 00000000..63579dfa Binary files /dev/null and b/test/shredded_variant_ipc/case-023.arrow differ diff --git a/test/shredded_variant_ipc/case-024.arrow b/test/shredded_variant_ipc/case-024.arrow new file mode 100644 index 00000000..1f2a93ec Binary files /dev/null and b/test/shredded_variant_ipc/case-024.arrow differ diff --git a/test/shredded_variant_ipc/case-025.arrow b/test/shredded_variant_ipc/case-025.arrow new file mode 100644 index 00000000..e6aea2c4 Binary files /dev/null and b/test/shredded_variant_ipc/case-025.arrow differ diff --git a/test/shredded_variant_ipc/case-026.arrow b/test/shredded_variant_ipc/case-026.arrow new file mode 100644 index 00000000..47732a05 Binary files /dev/null and b/test/shredded_variant_ipc/case-026.arrow differ diff --git a/test/shredded_variant_ipc/case-027.arrow b/test/shredded_variant_ipc/case-027.arrow new file mode 100644 index 00000000..2bb38a57 Binary files /dev/null and b/test/shredded_variant_ipc/case-027.arrow differ diff --git a/test/shredded_variant_ipc/case-028.arrow b/test/shredded_variant_ipc/case-028.arrow new file mode 100644 index 00000000..40668ace Binary files /dev/null and b/test/shredded_variant_ipc/case-028.arrow differ diff --git a/test/shredded_variant_ipc/case-029.arrow b/test/shredded_variant_ipc/case-029.arrow new file mode 100644 index 00000000..ce522cda Binary files /dev/null and b/test/shredded_variant_ipc/case-029.arrow differ diff --git a/test/shredded_variant_ipc/case-030.arrow b/test/shredded_variant_ipc/case-030.arrow new file mode 100644 index 00000000..38691ab3 Binary files /dev/null and b/test/shredded_variant_ipc/case-030.arrow differ diff --git a/test/shredded_variant_ipc/case-031.arrow b/test/shredded_variant_ipc/case-031.arrow new file mode 100644 index 00000000..b1ac2901 Binary files /dev/null and b/test/shredded_variant_ipc/case-031.arrow differ diff --git a/test/shredded_variant_ipc/case-032.arrow 
b/test/shredded_variant_ipc/case-032.arrow new file mode 100644 index 00000000..bf63b6a6 Binary files /dev/null and b/test/shredded_variant_ipc/case-032.arrow differ diff --git a/test/shredded_variant_ipc/case-033.arrow b/test/shredded_variant_ipc/case-033.arrow new file mode 100644 index 00000000..b58fbf34 Binary files /dev/null and b/test/shredded_variant_ipc/case-033.arrow differ diff --git a/test/shredded_variant_ipc/case-034.arrow b/test/shredded_variant_ipc/case-034.arrow new file mode 100644 index 00000000..75d91da1 Binary files /dev/null and b/test/shredded_variant_ipc/case-034.arrow differ diff --git a/test/shredded_variant_ipc/case-035.arrow b/test/shredded_variant_ipc/case-035.arrow new file mode 100644 index 00000000..b28137ef Binary files /dev/null and b/test/shredded_variant_ipc/case-035.arrow differ diff --git a/test/shredded_variant_ipc/case-036.arrow b/test/shredded_variant_ipc/case-036.arrow new file mode 100644 index 00000000..3ad81053 Binary files /dev/null and b/test/shredded_variant_ipc/case-036.arrow differ diff --git a/test/shredded_variant_ipc/case-037.arrow b/test/shredded_variant_ipc/case-037.arrow new file mode 100644 index 00000000..5866d8dc Binary files /dev/null and b/test/shredded_variant_ipc/case-037.arrow differ diff --git a/test/shredded_variant_ipc/case-038.arrow b/test/shredded_variant_ipc/case-038.arrow new file mode 100644 index 00000000..77ce4071 Binary files /dev/null and b/test/shredded_variant_ipc/case-038.arrow differ diff --git a/test/shredded_variant_ipc/case-039.arrow b/test/shredded_variant_ipc/case-039.arrow new file mode 100644 index 00000000..b279c541 Binary files /dev/null and b/test/shredded_variant_ipc/case-039.arrow differ diff --git a/test/shredded_variant_ipc/case-040.arrow b/test/shredded_variant_ipc/case-040.arrow new file mode 100644 index 00000000..04a1ce4b Binary files /dev/null and b/test/shredded_variant_ipc/case-040.arrow differ diff --git a/test/shredded_variant_ipc/case-041.arrow 
b/test/shredded_variant_ipc/case-041.arrow new file mode 100644 index 00000000..0dba3f13 Binary files /dev/null and b/test/shredded_variant_ipc/case-041.arrow differ diff --git a/test/shredded_variant_ipc/case-042.arrow b/test/shredded_variant_ipc/case-042.arrow new file mode 100644 index 00000000..bae03e12 Binary files /dev/null and b/test/shredded_variant_ipc/case-042.arrow differ diff --git a/test/shredded_variant_ipc/case-043-INVALID.arrow b/test/shredded_variant_ipc/case-043-INVALID.arrow new file mode 100644 index 00000000..5e6ad553 Binary files /dev/null and b/test/shredded_variant_ipc/case-043-INVALID.arrow differ diff --git a/test/shredded_variant_ipc/case-044.arrow b/test/shredded_variant_ipc/case-044.arrow new file mode 100644 index 00000000..7abb0bbe Binary files /dev/null and b/test/shredded_variant_ipc/case-044.arrow differ diff --git a/test/shredded_variant_ipc/case-045.arrow b/test/shredded_variant_ipc/case-045.arrow new file mode 100644 index 00000000..a558a30c Binary files /dev/null and b/test/shredded_variant_ipc/case-045.arrow differ diff --git a/test/shredded_variant_ipc/case-046.arrow b/test/shredded_variant_ipc/case-046.arrow new file mode 100644 index 00000000..41d10fa3 Binary files /dev/null and b/test/shredded_variant_ipc/case-046.arrow differ diff --git a/test/shredded_variant_ipc/case-047.arrow b/test/shredded_variant_ipc/case-047.arrow new file mode 100644 index 00000000..4280e661 Binary files /dev/null and b/test/shredded_variant_ipc/case-047.arrow differ diff --git a/test/shredded_variant_ipc/case-048.arrow b/test/shredded_variant_ipc/case-048.arrow new file mode 100644 index 00000000..844f03a1 Binary files /dev/null and b/test/shredded_variant_ipc/case-048.arrow differ diff --git a/test/shredded_variant_ipc/case-049.arrow b/test/shredded_variant_ipc/case-049.arrow new file mode 100644 index 00000000..b7d4351f Binary files /dev/null and b/test/shredded_variant_ipc/case-049.arrow differ diff --git 
a/test/shredded_variant_ipc/case-050.arrow b/test/shredded_variant_ipc/case-050.arrow new file mode 100644 index 00000000..c69f3bcf Binary files /dev/null and b/test/shredded_variant_ipc/case-050.arrow differ diff --git a/test/shredded_variant_ipc/case-051.arrow b/test/shredded_variant_ipc/case-051.arrow new file mode 100644 index 00000000..173b764d Binary files /dev/null and b/test/shredded_variant_ipc/case-051.arrow differ diff --git a/test/shredded_variant_ipc/case-052.arrow b/test/shredded_variant_ipc/case-052.arrow new file mode 100644 index 00000000..7ea5d0c2 Binary files /dev/null and b/test/shredded_variant_ipc/case-052.arrow differ diff --git a/test/shredded_variant_ipc/case-053.arrow b/test/shredded_variant_ipc/case-053.arrow new file mode 100644 index 00000000..221186a3 Binary files /dev/null and b/test/shredded_variant_ipc/case-053.arrow differ diff --git a/test/shredded_variant_ipc/case-054.arrow b/test/shredded_variant_ipc/case-054.arrow new file mode 100644 index 00000000..e39c1a61 Binary files /dev/null and b/test/shredded_variant_ipc/case-054.arrow differ diff --git a/test/shredded_variant_ipc/case-055.arrow b/test/shredded_variant_ipc/case-055.arrow new file mode 100644 index 00000000..5459b9ac Binary files /dev/null and b/test/shredded_variant_ipc/case-055.arrow differ diff --git a/test/shredded_variant_ipc/case-056.arrow b/test/shredded_variant_ipc/case-056.arrow new file mode 100644 index 00000000..4b76e649 Binary files /dev/null and b/test/shredded_variant_ipc/case-056.arrow differ diff --git a/test/shredded_variant_ipc/case-057.arrow b/test/shredded_variant_ipc/case-057.arrow new file mode 100644 index 00000000..196dbe69 Binary files /dev/null and b/test/shredded_variant_ipc/case-057.arrow differ diff --git a/test/shredded_variant_ipc/case-058.arrow b/test/shredded_variant_ipc/case-058.arrow new file mode 100644 index 00000000..4dda132d Binary files /dev/null and b/test/shredded_variant_ipc/case-058.arrow differ diff --git 
a/test/shredded_variant_ipc/case-059.arrow b/test/shredded_variant_ipc/case-059.arrow new file mode 100644 index 00000000..f22b0317 Binary files /dev/null and b/test/shredded_variant_ipc/case-059.arrow differ diff --git a/test/shredded_variant_ipc/case-060.arrow b/test/shredded_variant_ipc/case-060.arrow new file mode 100644 index 00000000..65a8ddcc Binary files /dev/null and b/test/shredded_variant_ipc/case-060.arrow differ diff --git a/test/shredded_variant_ipc/case-061.arrow b/test/shredded_variant_ipc/case-061.arrow new file mode 100644 index 00000000..f3fc282c Binary files /dev/null and b/test/shredded_variant_ipc/case-061.arrow differ diff --git a/test/shredded_variant_ipc/case-062.arrow b/test/shredded_variant_ipc/case-062.arrow new file mode 100644 index 00000000..5d0e1206 Binary files /dev/null and b/test/shredded_variant_ipc/case-062.arrow differ diff --git a/test/shredded_variant_ipc/case-063.arrow b/test/shredded_variant_ipc/case-063.arrow new file mode 100644 index 00000000..f8a048bf Binary files /dev/null and b/test/shredded_variant_ipc/case-063.arrow differ diff --git a/test/shredded_variant_ipc/case-064.arrow b/test/shredded_variant_ipc/case-064.arrow new file mode 100644 index 00000000..e6f00e9c Binary files /dev/null and b/test/shredded_variant_ipc/case-064.arrow differ diff --git a/test/shredded_variant_ipc/case-065.arrow b/test/shredded_variant_ipc/case-065.arrow new file mode 100644 index 00000000..e00dcfee Binary files /dev/null and b/test/shredded_variant_ipc/case-065.arrow differ diff --git a/test/shredded_variant_ipc/case-066.arrow b/test/shredded_variant_ipc/case-066.arrow new file mode 100644 index 00000000..14806248 Binary files /dev/null and b/test/shredded_variant_ipc/case-066.arrow differ diff --git a/test/shredded_variant_ipc/case-067.arrow b/test/shredded_variant_ipc/case-067.arrow new file mode 100644 index 00000000..ca9d91af Binary files /dev/null and b/test/shredded_variant_ipc/case-067.arrow differ diff --git 
a/test/shredded_variant_ipc/case-068.arrow b/test/shredded_variant_ipc/case-068.arrow new file mode 100644 index 00000000..f4bc9ce7 Binary files /dev/null and b/test/shredded_variant_ipc/case-068.arrow differ diff --git a/test/shredded_variant_ipc/case-069.arrow b/test/shredded_variant_ipc/case-069.arrow new file mode 100644 index 00000000..8eef70df Binary files /dev/null and b/test/shredded_variant_ipc/case-069.arrow differ diff --git a/test/shredded_variant_ipc/case-070.arrow b/test/shredded_variant_ipc/case-070.arrow new file mode 100644 index 00000000..1f8be371 Binary files /dev/null and b/test/shredded_variant_ipc/case-070.arrow differ diff --git a/test/shredded_variant_ipc/case-071.arrow b/test/shredded_variant_ipc/case-071.arrow new file mode 100644 index 00000000..bd9e558a Binary files /dev/null and b/test/shredded_variant_ipc/case-071.arrow differ diff --git a/test/shredded_variant_ipc/case-072.arrow b/test/shredded_variant_ipc/case-072.arrow new file mode 100644 index 00000000..d005bc3d Binary files /dev/null and b/test/shredded_variant_ipc/case-072.arrow differ diff --git a/test/shredded_variant_ipc/case-073.arrow b/test/shredded_variant_ipc/case-073.arrow new file mode 100644 index 00000000..5aace763 Binary files /dev/null and b/test/shredded_variant_ipc/case-073.arrow differ diff --git a/test/shredded_variant_ipc/case-074.arrow b/test/shredded_variant_ipc/case-074.arrow new file mode 100644 index 00000000..336e10b0 Binary files /dev/null and b/test/shredded_variant_ipc/case-074.arrow differ diff --git a/test/shredded_variant_ipc/case-075.arrow b/test/shredded_variant_ipc/case-075.arrow new file mode 100644 index 00000000..be872492 Binary files /dev/null and b/test/shredded_variant_ipc/case-075.arrow differ diff --git a/test/shredded_variant_ipc/case-076.arrow b/test/shredded_variant_ipc/case-076.arrow new file mode 100644 index 00000000..b834b13c Binary files /dev/null and b/test/shredded_variant_ipc/case-076.arrow differ diff --git 
a/test/shredded_variant_ipc/case-077.arrow b/test/shredded_variant_ipc/case-077.arrow new file mode 100644 index 00000000..33fd3189 Binary files /dev/null and b/test/shredded_variant_ipc/case-077.arrow differ diff --git a/test/shredded_variant_ipc/case-078.arrow b/test/shredded_variant_ipc/case-078.arrow new file mode 100644 index 00000000..78b768ee Binary files /dev/null and b/test/shredded_variant_ipc/case-078.arrow differ diff --git a/test/shredded_variant_ipc/case-079.arrow b/test/shredded_variant_ipc/case-079.arrow new file mode 100644 index 00000000..f2cf41b6 Binary files /dev/null and b/test/shredded_variant_ipc/case-079.arrow differ diff --git a/test/shredded_variant_ipc/case-080.arrow b/test/shredded_variant_ipc/case-080.arrow new file mode 100644 index 00000000..12b31a06 Binary files /dev/null and b/test/shredded_variant_ipc/case-080.arrow differ diff --git a/test/shredded_variant_ipc/case-081.arrow b/test/shredded_variant_ipc/case-081.arrow new file mode 100644 index 00000000..2f94b8ad Binary files /dev/null and b/test/shredded_variant_ipc/case-081.arrow differ diff --git a/test/shredded_variant_ipc/case-082.arrow b/test/shredded_variant_ipc/case-082.arrow new file mode 100644 index 00000000..13cd2217 Binary files /dev/null and b/test/shredded_variant_ipc/case-082.arrow differ diff --git a/test/shredded_variant_ipc/case-083.arrow b/test/shredded_variant_ipc/case-083.arrow new file mode 100644 index 00000000..c9276485 Binary files /dev/null and b/test/shredded_variant_ipc/case-083.arrow differ diff --git a/test/shredded_variant_ipc/case-084-INVALID.arrow b/test/shredded_variant_ipc/case-084-INVALID.arrow new file mode 100644 index 00000000..d7274793 Binary files /dev/null and b/test/shredded_variant_ipc/case-084-INVALID.arrow differ diff --git a/test/shredded_variant_ipc/case-085.arrow b/test/shredded_variant_ipc/case-085.arrow new file mode 100644 index 00000000..4a7fb36b Binary files /dev/null and b/test/shredded_variant_ipc/case-085.arrow differ diff 
--git a/test/shredded_variant_ipc/case-086.arrow b/test/shredded_variant_ipc/case-086.arrow new file mode 100644 index 00000000..89515025 Binary files /dev/null and b/test/shredded_variant_ipc/case-086.arrow differ diff --git a/test/shredded_variant_ipc/case-087.arrow b/test/shredded_variant_ipc/case-087.arrow new file mode 100644 index 00000000..bd223108 Binary files /dev/null and b/test/shredded_variant_ipc/case-087.arrow differ diff --git a/test/shredded_variant_ipc/case-088.arrow b/test/shredded_variant_ipc/case-088.arrow new file mode 100644 index 00000000..4b94fa0e Binary files /dev/null and b/test/shredded_variant_ipc/case-088.arrow differ diff --git a/test/shredded_variant_ipc/case-089.arrow b/test/shredded_variant_ipc/case-089.arrow new file mode 100644 index 00000000..0bd03544 Binary files /dev/null and b/test/shredded_variant_ipc/case-089.arrow differ diff --git a/test/shredded_variant_ipc/case-090.arrow b/test/shredded_variant_ipc/case-090.arrow new file mode 100644 index 00000000..be943a67 Binary files /dev/null and b/test/shredded_variant_ipc/case-090.arrow differ diff --git a/test/shredded_variant_ipc/case-091.arrow b/test/shredded_variant_ipc/case-091.arrow new file mode 100644 index 00000000..54db6169 Binary files /dev/null and b/test/shredded_variant_ipc/case-091.arrow differ diff --git a/test/shredded_variant_ipc/case-092.arrow b/test/shredded_variant_ipc/case-092.arrow new file mode 100644 index 00000000..b7216cd3 Binary files /dev/null and b/test/shredded_variant_ipc/case-092.arrow differ diff --git a/test/shredded_variant_ipc/case-093.arrow b/test/shredded_variant_ipc/case-093.arrow new file mode 100644 index 00000000..bead0001 Binary files /dev/null and b/test/shredded_variant_ipc/case-093.arrow differ diff --git a/test/shredded_variant_ipc/case-094.arrow b/test/shredded_variant_ipc/case-094.arrow new file mode 100644 index 00000000..e9ed083a Binary files /dev/null and b/test/shredded_variant_ipc/case-094.arrow differ diff --git 
a/test/shredded_variant_ipc/case-095.arrow b/test/shredded_variant_ipc/case-095.arrow new file mode 100644 index 00000000..62bd1f9e Binary files /dev/null and b/test/shredded_variant_ipc/case-095.arrow differ diff --git a/test/shredded_variant_ipc/case-096.arrow b/test/shredded_variant_ipc/case-096.arrow new file mode 100644 index 00000000..38772002 Binary files /dev/null and b/test/shredded_variant_ipc/case-096.arrow differ diff --git a/test/shredded_variant_ipc/case-097.arrow b/test/shredded_variant_ipc/case-097.arrow new file mode 100644 index 00000000..d6a3b264 Binary files /dev/null and b/test/shredded_variant_ipc/case-097.arrow differ diff --git a/test/shredded_variant_ipc/case-098.arrow b/test/shredded_variant_ipc/case-098.arrow new file mode 100644 index 00000000..af2f0414 Binary files /dev/null and b/test/shredded_variant_ipc/case-098.arrow differ diff --git a/test/shredded_variant_ipc/case-099.arrow b/test/shredded_variant_ipc/case-099.arrow new file mode 100644 index 00000000..34be6798 Binary files /dev/null and b/test/shredded_variant_ipc/case-099.arrow differ diff --git a/test/shredded_variant_ipc/case-100.arrow b/test/shredded_variant_ipc/case-100.arrow new file mode 100644 index 00000000..074083f9 Binary files /dev/null and b/test/shredded_variant_ipc/case-100.arrow differ diff --git a/test/shredded_variant_ipc/case-101.arrow b/test/shredded_variant_ipc/case-101.arrow new file mode 100644 index 00000000..2f2dcbed Binary files /dev/null and b/test/shredded_variant_ipc/case-101.arrow differ diff --git a/test/shredded_variant_ipc/case-102.arrow b/test/shredded_variant_ipc/case-102.arrow new file mode 100644 index 00000000..11f4d82a Binary files /dev/null and b/test/shredded_variant_ipc/case-102.arrow differ diff --git a/test/shredded_variant_ipc/case-103.arrow b/test/shredded_variant_ipc/case-103.arrow new file mode 100644 index 00000000..5be970b2 Binary files /dev/null and b/test/shredded_variant_ipc/case-103.arrow differ diff --git 
a/test/shredded_variant_ipc/case-104.arrow b/test/shredded_variant_ipc/case-104.arrow new file mode 100644 index 00000000..a9fe5233 Binary files /dev/null and b/test/shredded_variant_ipc/case-104.arrow differ diff --git a/test/shredded_variant_ipc/case-105.arrow b/test/shredded_variant_ipc/case-105.arrow new file mode 100644 index 00000000..571be21b Binary files /dev/null and b/test/shredded_variant_ipc/case-105.arrow differ diff --git a/test/shredded_variant_ipc/case-106.arrow b/test/shredded_variant_ipc/case-106.arrow new file mode 100644 index 00000000..1b901f50 Binary files /dev/null and b/test/shredded_variant_ipc/case-106.arrow differ diff --git a/test/shredded_variant_ipc/case-107.arrow b/test/shredded_variant_ipc/case-107.arrow new file mode 100644 index 00000000..587fafa0 Binary files /dev/null and b/test/shredded_variant_ipc/case-107.arrow differ diff --git a/test/shredded_variant_ipc/case-108.arrow b/test/shredded_variant_ipc/case-108.arrow new file mode 100644 index 00000000..98a8c574 Binary files /dev/null and b/test/shredded_variant_ipc/case-108.arrow differ diff --git a/test/shredded_variant_ipc/case-109.arrow b/test/shredded_variant_ipc/case-109.arrow new file mode 100644 index 00000000..9547ec06 Binary files /dev/null and b/test/shredded_variant_ipc/case-109.arrow differ diff --git a/test/shredded_variant_ipc/case-110.arrow b/test/shredded_variant_ipc/case-110.arrow new file mode 100644 index 00000000..d99d9854 Binary files /dev/null and b/test/shredded_variant_ipc/case-110.arrow differ diff --git a/test/shredded_variant_ipc/case-111.arrow b/test/shredded_variant_ipc/case-111.arrow new file mode 100644 index 00000000..796e6924 Binary files /dev/null and b/test/shredded_variant_ipc/case-111.arrow differ diff --git a/test/shredded_variant_ipc/case-112.arrow b/test/shredded_variant_ipc/case-112.arrow new file mode 100644 index 00000000..199596ff Binary files /dev/null and b/test/shredded_variant_ipc/case-112.arrow differ diff --git 
a/test/shredded_variant_ipc/case-113.arrow b/test/shredded_variant_ipc/case-113.arrow new file mode 100644 index 00000000..39437c0d Binary files /dev/null and b/test/shredded_variant_ipc/case-113.arrow differ diff --git a/test/shredded_variant_ipc/case-114.arrow b/test/shredded_variant_ipc/case-114.arrow new file mode 100644 index 00000000..39ee297f Binary files /dev/null and b/test/shredded_variant_ipc/case-114.arrow differ diff --git a/test/shredded_variant_ipc/case-115.arrow b/test/shredded_variant_ipc/case-115.arrow new file mode 100644 index 00000000..1a62a33f Binary files /dev/null and b/test/shredded_variant_ipc/case-115.arrow differ diff --git a/test/shredded_variant_ipc/case-116.arrow b/test/shredded_variant_ipc/case-116.arrow new file mode 100644 index 00000000..51e705a7 Binary files /dev/null and b/test/shredded_variant_ipc/case-116.arrow differ diff --git a/test/shredded_variant_ipc/case-117.arrow b/test/shredded_variant_ipc/case-117.arrow new file mode 100644 index 00000000..91c2bd76 Binary files /dev/null and b/test/shredded_variant_ipc/case-117.arrow differ diff --git a/test/shredded_variant_ipc/case-118.arrow b/test/shredded_variant_ipc/case-118.arrow new file mode 100644 index 00000000..c5da1cd5 Binary files /dev/null and b/test/shredded_variant_ipc/case-118.arrow differ diff --git a/test/shredded_variant_ipc/case-119.arrow b/test/shredded_variant_ipc/case-119.arrow new file mode 100644 index 00000000..5d6f695d Binary files /dev/null and b/test/shredded_variant_ipc/case-119.arrow differ diff --git a/test/shredded_variant_ipc/case-120.arrow b/test/shredded_variant_ipc/case-120.arrow new file mode 100644 index 00000000..0fa61c3d Binary files /dev/null and b/test/shredded_variant_ipc/case-120.arrow differ diff --git a/test/shredded_variant_ipc/case-121.arrow b/test/shredded_variant_ipc/case-121.arrow new file mode 100644 index 00000000..97018ed0 Binary files /dev/null and b/test/shredded_variant_ipc/case-121.arrow differ diff --git 
a/test/shredded_variant_ipc/case-122.arrow b/test/shredded_variant_ipc/case-122.arrow new file mode 100644 index 00000000..5a75cbc2 Binary files /dev/null and b/test/shredded_variant_ipc/case-122.arrow differ diff --git a/test/shredded_variant_ipc/case-123.arrow b/test/shredded_variant_ipc/case-123.arrow new file mode 100644 index 00000000..79fffdf7 Binary files /dev/null and b/test/shredded_variant_ipc/case-123.arrow differ diff --git a/test/shredded_variant_ipc/case-124.arrow b/test/shredded_variant_ipc/case-124.arrow new file mode 100644 index 00000000..a15beb29 Binary files /dev/null and b/test/shredded_variant_ipc/case-124.arrow differ diff --git a/test/shredded_variant_ipc/case-125-INVALID.arrow b/test/shredded_variant_ipc/case-125-INVALID.arrow new file mode 100644 index 00000000..3640a698 Binary files /dev/null and b/test/shredded_variant_ipc/case-125-INVALID.arrow differ diff --git a/test/shredded_variant_ipc/case-126.arrow b/test/shredded_variant_ipc/case-126.arrow new file mode 100644 index 00000000..e5071f15 Binary files /dev/null and b/test/shredded_variant_ipc/case-126.arrow differ diff --git a/test/shredded_variant_ipc/case-127.arrow b/test/shredded_variant_ipc/case-127.arrow new file mode 100644 index 00000000..2a7bdfd3 Binary files /dev/null and b/test/shredded_variant_ipc/case-127.arrow differ diff --git a/test/shredded_variant_ipc/case-128.arrow b/test/shredded_variant_ipc/case-128.arrow new file mode 100644 index 00000000..9d426443 Binary files /dev/null and b/test/shredded_variant_ipc/case-128.arrow differ diff --git a/test/shredded_variant_ipc/case-129.arrow b/test/shredded_variant_ipc/case-129.arrow new file mode 100644 index 00000000..d542b8e8 Binary files /dev/null and b/test/shredded_variant_ipc/case-129.arrow differ diff --git a/test/shredded_variant_ipc/case-130.arrow b/test/shredded_variant_ipc/case-130.arrow new file mode 100644 index 00000000..f904f49d Binary files /dev/null and b/test/shredded_variant_ipc/case-130.arrow differ diff 
--git a/test/shredded_variant_ipc/case-131.arrow b/test/shredded_variant_ipc/case-131.arrow new file mode 100644 index 00000000..6883e4bc Binary files /dev/null and b/test/shredded_variant_ipc/case-131.arrow differ diff --git a/test/shredded_variant_ipc/case-132.arrow b/test/shredded_variant_ipc/case-132.arrow new file mode 100644 index 00000000..4d062d96 Binary files /dev/null and b/test/shredded_variant_ipc/case-132.arrow differ diff --git a/test/shredded_variant_ipc/case-133.arrow b/test/shredded_variant_ipc/case-133.arrow new file mode 100644 index 00000000..d4cb7b36 Binary files /dev/null and b/test/shredded_variant_ipc/case-133.arrow differ diff --git a/test/shredded_variant_ipc/case-134.arrow b/test/shredded_variant_ipc/case-134.arrow new file mode 100644 index 00000000..c18e1b3d Binary files /dev/null and b/test/shredded_variant_ipc/case-134.arrow differ diff --git a/test/shredded_variant_ipc/case-135.arrow b/test/shredded_variant_ipc/case-135.arrow new file mode 100644 index 00000000..369f88eb Binary files /dev/null and b/test/shredded_variant_ipc/case-135.arrow differ diff --git a/test/shredded_variant_ipc/case-136.arrow b/test/shredded_variant_ipc/case-136.arrow new file mode 100644 index 00000000..0c8ce487 Binary files /dev/null and b/test/shredded_variant_ipc/case-136.arrow differ diff --git a/test/shredded_variant_ipc/case-137.arrow b/test/shredded_variant_ipc/case-137.arrow new file mode 100644 index 00000000..7e683420 Binary files /dev/null and b/test/shredded_variant_ipc/case-137.arrow differ diff --git a/test/shredded_variant_ipc/case-138.arrow b/test/shredded_variant_ipc/case-138.arrow new file mode 100644 index 00000000..1dfdae44 Binary files /dev/null and b/test/shredded_variant_ipc/case-138.arrow differ diff --git a/test/shredded_variant_ipc/regen.py b/test/shredded_variant_ipc/regen.py new file mode 100644 index 00000000..5352047f --- /dev/null +++ b/test/shredded_variant_ipc/regen.py @@ -0,0 +1,83 @@ +# Licensed to the Apache Software 
Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Converts shredded-variant Parquet test cases from +test/parquet-testing/shredded_variant/*.parquet into Arrow IPC (.arrow) files +under this directory, so that .NET tests can load them without a Parquet +reader. The Parquet test corpus comes from apache/parquet-testing. + +Requires: pyarrow (tested with 23.0). + +Run from the repo root: + + python test/shredded_variant_ipc/regen.py + +Existing .arrow files are overwritten in place. + +Test cases such as case-037 may produce different output on different platforms +because Arrow metadata is read into a hash-based collection where the hashing +function can vary across platforms. 
+""" + +import json +import os +import sys + +import pyarrow as pa +import pyarrow.ipc as ipc +import pyarrow.parquet as pq + + +def main() -> int: + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + src = os.path.join(repo_root, "test", "parquet-testing", "shredded_variant") + dst = os.path.join(repo_root, "test", "shredded_variant_ipc") + cases_json = os.path.join(src, "cases.json") + + if not os.path.exists(cases_json): + print(f"cases.json not found at {cases_json}", file=sys.stderr) + return 1 + + with open(cases_json) as f: + cases = json.load(f) + + os.makedirs(dst, exist_ok=True) + + written = 0 + for case in cases: + parquet_files = [] + if "parquet_file" in case: + parquet_files.append(case["parquet_file"]) + + for pf in parquet_files: + src_path = os.path.join(src, pf) + if not os.path.exists(src_path): + continue + + table = pq.read_table(src_path) + dst_path = os.path.join(dst, os.path.splitext(pf)[0] + ".arrow") + + with ipc.new_file(dst_path, table.schema) as writer: + writer.write_table(table) + + written += 1 + + print(f"Wrote {written} .arrow files to {dst}") + return 0 + + +if __name__ == "__main__": + sys.exit(main())