Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
e7f1f68
Stricter template on RleDecoder
AntoinePrv Jul 24, 2025
6f5670c
Refactor number of vlq bytes needed
AntoinePrv Jul 24, 2025
dfa3b71
Refactor LEB128 reading
AntoinePrv Jul 25, 2025
77fc9e0
Refactor LEB128 writing
AntoinePrv Jul 25, 2025
22e6282
Simple Run classes
AntoinePrv Aug 4, 2025
15cde24
Rename RleDecoder > RleBitPackedDecoder
AntoinePrv Aug 4, 2025
1faac12
Add RleDecoder
AntoinePrv Aug 5, 2025
33bb556
Add BitPackedDecoder
AntoinePrv Aug 5, 2025
8369eb6
Implement runs
AntoinePrv Aug 5, 2025
97647ac
Add RleBitPackedParser
AntoinePrv Aug 6, 2025
a4bc593
Rename RleEncoder > RleBitPackedEncoder
AntoinePrv Aug 7, 2025
991b59c
Add Runs tests
AntoinePrv Aug 7, 2025
c35db07
Add RleDecoder test
AntoinePrv Aug 7, 2025
5123cb1
Add BitPackedDecoder test
AntoinePrv Aug 7, 2025
f832d82
Add RleBitPackedParser test
AntoinePrv Aug 7, 2025
1f87da5
Strengthen RleBitPacked tests
AntoinePrv Aug 14, 2025
db2f11e
Remove BitBlockCounter from RleBitPackedDecoder
AntoinePrv Aug 11, 2025
e72dfd4
Plug new parser+decoder in RleBitPackedDecoder
AntoinePrv Aug 13, 2025
fa829bc
Dict RleEncoding tests
AntoinePrv Aug 19, 2025
ce4232d
Perf: Fewer variant calls
AntoinePrv Aug 20, 2025
585c3a7
Perf: template bit run readers
AntoinePrv Aug 21, 2025
37d0c7b
Perf: separate impl for identity converter
AntoinePrv Aug 21, 2025
b4806b4
Perf: GetBatch fewer variant calls
AntoinePrv Aug 22, 2025
b9abef3
Perf: predict no bad data
AntoinePrv Aug 22, 2025
dc3c16d
Perf: predict new bit run in RleSpaced
AntoinePrv Aug 22, 2025
5b93a7e
Fix RleSpaced
AntoinePrv Aug 22, 2025
fab7fdf
Refactor parsing with handler
AntoinePrv Aug 22, 2025
407de76
Perf: predict false in LEB128
AntoinePrv Aug 25, 2025
e641ca0
Remove BitPackedIdentity
AntoinePrv Aug 25, 2025
93415fb
Check overflow in LEB128 reading
AntoinePrv Aug 26, 2025
c13243d
Avoid UB in LEB128 overflow check
AntoinePrv Aug 26, 2025
bbf6ffb
Fix UB in test
AntoinePrv Aug 26, 2025
b4a37ce
Fix UB
AntoinePrv Aug 26, 2025
9027ca8
First pass addressing review comments
AntoinePrv Sep 9, 2025
1db18de
Second pass addressing review comments
AntoinePrv Sep 10, 2025
2b80bc6
Test callback parsing
AntoinePrv Sep 10, 2025
d3ee424
Remove parser iteration API
AntoinePrv Sep 10, 2025
365dd88
Add missing plug of LEB128 in BitReader/BitWriter
AntoinePrv Sep 10, 2025
2edec86
Better handle signed/negative values in LEB128
AntoinePrv Sep 10, 2025
36cb673
Aggressive LEB128 test cases
AntoinePrv Sep 10, 2025
fb69856
Use pointer for non-const ref
AntoinePrv Sep 10, 2025
eb384fe
West const
AntoinePrv Sep 10, 2025
12a87fd
Comment slashes
AntoinePrv Sep 10, 2025
a272274
Third pass addressing review comments
AntoinePrv Sep 10, 2025
ab7d212
Fix infinite loop on invalid input
AntoinePrv Sep 15, 2025
6ed409e
Fourth pass addressing review comments
AntoinePrv Sep 15, 2025
a61d836
Apply formatter
AntoinePrv Sep 15, 2025
c91638b
Address newer reviewer comments
AntoinePrv Sep 17, 2025
11bb10a
Inline small method and remove [[nodiscard]]
AntoinePrv Sep 17, 2025
a91a847
snake_case for small const methods
AntoinePrv Sep 17, 2025
2221741
Rule of zero
AntoinePrv Sep 17, 2025
d520ebd
Simplify type aliases
AntoinePrv Sep 17, 2025
7f153c0
Set buffer capacity
AntoinePrv Sep 17, 2025
41ff0cd
More doc
AntoinePrv Sep 17, 2025
dcc6db0
Address reviewer comments
AntoinePrv Sep 19, 2025
c2344a8
Explicitly pass bit width (don't store multiple time)
AntoinePrv Sep 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CPPLINT.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@ filter = -readability/alt_tokens
filter = -readability/casting
filter = -readability/todo
filter = -runtime/references
# Let the formatter do the job for whitespaces
filter = -whitespace/comments
filter = -whitespace/braces
linelength = 90
4 changes: 4 additions & 0 deletions cpp/src/arrow/util/bit_run_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {

class BitRunReaderLinear {
public:
BitRunReaderLinear() = default;

BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
: reader_(bitmap, start_offset, length) {}

Expand All @@ -74,6 +76,8 @@ class BitRunReaderLinear {
/// in a bitmap.
class ARROW_EXPORT BitRunReader {
public:
BitRunReader() = default;

/// \brief Constructs new BitRunReader.
///
/// \param[in] bitmap source data
Expand Down
178 changes: 82 additions & 96 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <type_traits>

#include "arrow/util/bit_util.h"
#include "arrow/util/bpacking_internal.h"
Expand All @@ -30,8 +31,7 @@
#include "arrow/util/macros.h"
#include "arrow/util/ubsan.h"

namespace arrow {
namespace bit_util {
namespace arrow::bit_util {

/// Utility class to write bit/byte streams. This class can write data to either be
/// bit packed or byte aligned (and a single stream that has a mix of both).
Expand Down Expand Up @@ -73,19 +73,14 @@ class BitWriter {
/// room. The value is written byte aligned.
/// For more details on vlq:
/// en.wikipedia.org/wiki/Variable-length_quantity
bool PutVlqInt(uint32_t v);
template <typename Int>
bool PutVlqInt(Int v);

// Writes an int zigzag encoded.
bool PutZigZagVlqInt(int32_t v);

/// Write a Vlq encoded int64 to the buffer. Returns false if there was not enough
/// room. The value is written byte aligned.
/// For more details on vlq:
/// en.wikipedia.org/wiki/Variable-length_quantity
bool PutVlqInt(uint64_t v);

// Writes an int64 zigzag encoded.
bool PutZigZagVlqInt(int64_t v);
/// Writes a zigzag encoded signed integer.
/// Zigzag encoding is used to encode possibly negative numbers by alternating positive
/// and negative ones.
template <typename Int>
bool PutZigZagVlqInt(Int v);

/// Get a pointer to the next aligned byte and advance the underlying buffer
/// by num_bytes.
Expand Down Expand Up @@ -128,14 +123,14 @@ inline uint64_t ReadLittleEndianWord(const uint8_t* buffer, int bytes_remaining)
/// bytes in one read (e.g. encoded int).
class BitReader {
public:
BitReader() = default;
BitReader() noexcept = default;

/// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'.
BitReader(const uint8_t* buffer, int buffer_len) : BitReader() {
Reset(buffer, buffer_len);
}

void Reset(const uint8_t* buffer, int buffer_len) {
void Reset(const uint8_t* buffer, int buffer_len) noexcept {
buffer_ = buffer;
max_bytes_ = buffer_len;
byte_offset_ = 0;
Expand Down Expand Up @@ -169,18 +164,14 @@ class BitReader {
/// Reads a vlq encoded int from the stream. The encoded int must start at
/// the beginning of a byte. Return false if there were not enough bytes in
/// the buffer.
bool GetVlqInt(uint32_t* v);

// Reads a zigzag encoded int `into` v.
bool GetZigZagVlqInt(int32_t* v);

/// Reads a vlq encoded int64 from the stream. The encoded int must start at
/// the beginning of a byte. Return false if there were not enough bytes in
/// the buffer.
bool GetVlqInt(uint64_t* v);
template <typename Int>
bool GetVlqInt(Int* v);

// Reads a zigzag encoded int64 `into` v.
bool GetZigZagVlqInt(int64_t* v);
/// Reads a zigzag encoded integer into a signed integer output v.
/// Zigzag encoding is used to decode possibly negative numbers by alternating positive
/// and negative ones.
template <typename Int>
bool GetZigZagVlqInt(Int* v);

/// Returns the number of bytes left in the stream, not including the current
/// byte (i.e., there may be an additional fraction of a byte).
Expand All @@ -189,12 +180,6 @@ class BitReader {
(byte_offset_ + static_cast<int>(bit_util::BytesForBits(bit_offset_)));
}

/// Maximum byte length of a vlq encoded int
static constexpr int kMaxVlqByteLength = 5;

/// Maximum byte length of a vlq encoded int64
static constexpr int kMaxVlqByteLengthForInt64 = 10;

private:
const uint8_t* buffer_;
int max_bytes_;
Expand Down Expand Up @@ -439,91 +424,92 @@ inline bool BitReader::Advance(int64_t num_bits) {
return true;
}

inline bool BitWriter::PutVlqInt(uint32_t v) {
bool result = true;
while ((v & 0xFFFFFF80UL) != 0UL) {
result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
v >>= 7;
}
result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
return result;
}
template <typename Int>
inline bool BitWriter::PutVlqInt(Int v) {
static_assert(std::is_integral_v<Int>);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: assert it's unsigned?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could also write a positive number from an unsigned integer type


inline bool BitReader::GetVlqInt(uint32_t* v) {
uint32_t tmp = 0;
constexpr auto kBufferSize = kMaxLEB128ByteLenFor<Int>;

for (int i = 0; i < kMaxVlqByteLength; i++) {
uint8_t byte = 0;
if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
uint8_t buffer[kBufferSize] = {};
const auto bytes_written = WriteLEB128(v, buffer, kBufferSize);
ARROW_DCHECK_LE(bytes_written, kBufferSize);
if constexpr (std::is_signed_v<Int>) {
// Can fail if negative
if (ARROW_PREDICT_FALSE(!bytes_written == 0)) {
return false;
}
tmp |= static_cast<uint32_t>(byte & 0x7F) << (7 * i);
} else {
// Cannot fail since we gave max space
ARROW_DCHECK_GT(bytes_written, 0);
}

if ((byte & 0x80) == 0) {
*v = tmp;
return true;
for (int i = 0; i < bytes_written; ++i) {
const bool success = PutAligned(buffer[i], 1);
if (ARROW_PREDICT_FALSE(!success)) {
return false;
}
}

return false;
}

inline bool BitWriter::PutZigZagVlqInt(int32_t v) {
uint32_t u_v = ::arrow::util::SafeCopy<uint32_t>(v);
u_v = (u_v << 1) ^ static_cast<uint32_t>(v >> 31);
return PutVlqInt(u_v);
}

inline bool BitReader::GetZigZagVlqInt(int32_t* v) {
uint32_t u;
if (!GetVlqInt(&u)) return false;
u = (u >> 1) ^ (~(u & 1) + 1);
*v = ::arrow::util::SafeCopy<int32_t>(u);
return true;
}

inline bool BitWriter::PutVlqInt(uint64_t v) {
bool result = true;
while ((v & 0xFFFFFFFFFFFFFF80ULL) != 0ULL) {
result &= PutAligned<uint8_t>(static_cast<uint8_t>((v & 0x7F) | 0x80), 1);
v >>= 7;
template <typename Int>
inline bool BitReader::GetVlqInt(Int* v) {
static_assert(std::is_integral_v<Int>);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also assert that it's unsigned?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can, but we don't have to. The code safely works for extracting a positive number into a signed integer type (if it fits, but that's true for all types).


// The data that we will pass to the LEB128 parser
// In all case, we read a byte-aligned value, skipping remaining bits
const uint8_t* data = NULLPTR;
int max_size = 0;

// Number of bytes left in the buffered values, not including the current
// byte (i.e., there may be an additional fraction of a byte).
const int bytes_left_in_cache =
sizeof(buffered_values_) - static_cast<int>(bit_util::BytesForBits(bit_offset_));

// If there are clearly enough bytes left we can try to parse from the cache
if (bytes_left_in_cache >= kMaxLEB128ByteLenFor<Int>) {
max_size = bytes_left_in_cache;
data = reinterpret_cast<const uint8_t*>(&buffered_values_) +
bit_util::BytesForBits(bit_offset_);
// Otherwise, we try straight from buffer (ignoring few bytes that may be cached)
} else {
max_size = bytes_left();
data = buffer_ + (max_bytes_ - max_size);
}
result &= PutAligned<uint8_t>(static_cast<uint8_t>(v & 0x7F), 1);
return result;
}

inline bool BitReader::GetVlqInt(uint64_t* v) {
uint64_t tmp = 0;

for (int i = 0; i < kMaxVlqByteLengthForInt64; i++) {
uint8_t byte = 0;
if (ARROW_PREDICT_FALSE(!GetAligned<uint8_t>(1, &byte))) {
return false;
}
tmp |= static_cast<uint64_t>(byte & 0x7F) << (7 * i);

if ((byte & 0x80) == 0) {
*v = tmp;
return true;
}
const auto bytes_read = bit_util::ParseLeadingLEB128(data, max_size, v);
if (ARROW_PREDICT_FALSE(bytes_read == 0)) {
// Corrupt LEB128
return false;
}

return false;
// Advance for the bytes we have read + the bits we skipped
return Advance((8 * bytes_read) + (bit_offset_ % 8));
}

inline bool BitWriter::PutZigZagVlqInt(int64_t v) {
uint64_t u_v = ::arrow::util::SafeCopy<uint64_t>(v);
u_v = (u_v << 1) ^ static_cast<uint64_t>(v >> 63);
template <typename Int>
inline bool BitWriter::PutZigZagVlqInt(Int v) {
static_assert(std::is_integral_v<Int>);
static_assert(std::is_signed_v<Int>);
using UInt = std::make_unsigned_t<Int>;
constexpr auto kBitSize = 8 * sizeof(Int);

UInt u_v = ::arrow::util::SafeCopy<UInt>(v);
u_v = (u_v << 1) ^ static_cast<UInt>(v >> (kBitSize - 1));
return PutVlqInt(u_v);
}

inline bool BitReader::GetZigZagVlqInt(int64_t* v) {
uint64_t u;
template <typename Int>
inline bool BitReader::GetZigZagVlqInt(Int* v) {
static_assert(std::is_integral_v<Int>);
static_assert(std::is_signed_v<Int>);

std::make_unsigned_t<Int> u;
if (!GetVlqInt(&u)) return false;
u = (u >> 1) ^ (~(u & 1) + 1);
*v = ::arrow::util::SafeCopy<int64_t>(u);
*v = ::arrow::util::SafeCopy<Int>(u);
return true;
}

} // namespace bit_util
} // namespace arrow
} // namespace arrow::bit_util
Loading
Loading