From a6a8b62c7299b3b58517ac4b8ab0ce7465020503 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Fri, 14 Jun 2024 14:01:20 -0700 Subject: [PATCH 1/3] deprecate file_offset field --- README.md | 26 +++++++++++++------------- src/main/thrift/parquet.thrift | 17 +++++++++++++---- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 9567c6384..08eb47626 100644 --- a/README.md +++ b/README.md @@ -89,29 +89,29 @@ more pages. This file and the [Thrift definition](src/main/thrift/parquet.thrift) should be read together to understand the format. 4-byte magic number "PAR1" - - + + ... - - - + + + ... - + ... - - + + ... - + File Metadata 4-byte length in bytes of file metadata (little endian) 4-byte magic number "PAR1" In the above example, there are N columns in this table, split into M row -groups. The file metadata contains the locations of all the column metadata +groups. The file metadata contains all the column chunk start locations. More details on what is contained in the metadata can be found in the Thrift definition. -Metadata is written after the data to allow for single pass writing. +File Metadata is written after the data to allow for single pass writing. Readers are expected to first read the file metadata to find all the column chunks they are interested in. The columns chunks should then be read sequentially. @@ -119,8 +119,8 @@ chunks they are interested in. The columns chunks should then be read sequentia ![File Layout](https://raw.github.com/apache/parquet-format/master/doc/images/FileLayout.gif) ## Metadata -There are three types of metadata: file metadata, column (chunk) metadata and page -header metadata. All thrift structures are serialized using the TCompactProtocol. +There are two types of metadata: file metadata and page header metadata. All thrift structures +are serialized using the TCompactProtocol. 
![Metadata diagram](https://github.com/apache/parquet-format/raw/master/doc/images/FileFormat.gif) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 934b3cadd..ee6efeb6e 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -867,12 +867,21 @@ struct ColumnChunk { **/ 1: optional string file_path - /** Byte offset in file_path to the ColumnMetaData **/ + /** Deprecated: Byte offset in file_path to the ColumnMetaData + * + * Past use of this field has been inconsistent, with some implementations + * using it to point to the ColumnMetaData and some using it to point to + * the first page in the column chunk. In many cases, the ColumnMetaData at this + * location is wrong. This field is now deprecated and should not be used. + * Writers should set this field to 0 if no ColumnMetaData has been written outside + * the footer. + */ 2: required i64 file_offset - /** Column metadata for this chunk. This is the same content as what is at - * file_path/file_offset. Having it here has it replicated in the file - * metadata. + /** Column metadata for this chunk. Some writers may also replicate this at the + * location pointed to by file_path/file_offset. + * Note: while marked as optional, this field is in fact required by most major + * Parquet implementations. As such, writers MUST populate this field. 
**/ 3: optional ColumnMetaData meta_data From 084567794848387e45c97d71f07f91b56b248617 Mon Sep 17 00:00:00 2001 From: seidl Date: Wed, 26 Jun 2024 09:13:45 -0700 Subject: [PATCH 2/3] set default value for file_offset --- src/main/thrift/parquet.thrift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index ee6efeb6e..9e83529ac 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -876,7 +876,7 @@ struct ColumnChunk { * Writers should set this field to 0 if no ColumnMetaData has been written outside * the footer. */ - 2: required i64 file_offset + 2: required i64 file_offset = 0 /** Column metadata for this chunk. Some writers may also replicate this at the * location pointed to by file_path/file_offset. From e49f34410f5c7c2a20f6a997a57d7f0ed270f857 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Thu, 27 Jun 2024 16:24:49 -0700 Subject: [PATCH 3/3] remove page metadata from diagram --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 08eb47626..d268b45e9 100644 --- a/README.md +++ b/README.md @@ -89,19 +89,19 @@ more pages. This file and the [Thrift definition](src/main/thrift/parquet.thrift) should be read together to understand the format. 4-byte magic number "PAR1" - - + + ... - - - + + + ... - + ... - - + + ... - + File Metadata 4-byte length in bytes of file metadata (little endian) 4-byte magic number "PAR1"