Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0d442fd
Add alp code
sfc-gh-pgaur Dec 4, 2025
06d1e19
Integrate ALP with arrow
sfc-gh-pgaur Dec 4, 2025
a98c594
Add alp benchmark
sfc-gh-pgaur Dec 4, 2025
c297f97
Add datasets for alp benchmarking
sfc-gh-pgaur Dec 4, 2025
ab928e8
Update cmake file
sfc-gh-pgaur Dec 4, 2025
6a95a59
Move hpp files to h
sfc-gh-pgaur Dec 6, 2025
865e46a
Update flow diagram and layout diagram to use ASCII and not unicode cha…
sfc-gh-pgaur Dec 7, 2025
cb6d0b6
Rename cpp files to cc
sfc-gh-pgaur Dec 7, 2025
496e23b
Update documentation to align with arrow's doxygen style
sfc-gh-pgaur Dec 7, 2025
8803b52
Adapt methods and variable names to arrow style
sfc-gh-pgaur Dec 7, 2025
31e94ec
Update the tests to adhere to arrow style code
sfc-gh-pgaur Dec 7, 2025
46c0ecc
Update callers
sfc-gh-pgaur Dec 7, 2025
a70b08f
Fuse FOR and decode loop
sfc-gh-pgaur Dec 7, 2025
ccbb1dd
Reduce memory allocation in the decompress call
sfc-gh-pgaur Dec 7, 2025
6a01df2
Attempt at making decoding faster with SIMD
sfc-gh-pgaur Dec 8, 2025
4ced783
Revert "Attempt at making decoding faster with SIMD"
sfc-gh-pgaur Dec 8, 2025
4fac73c
Move cpp files to cc
sfc-gh-pgaur Dec 8, 2025
1cb0852
Move data file to parquet-testing submodule
sfc-gh-pgaur Dec 8, 2025
8d307a6
Update path to the data file
sfc-gh-pgaur Dec 9, 2025
0908342
Adapt files names to arrow convention
sfc-gh-pgaur Dec 15, 2025
e56c877
File rename
sfc-gh-pgaur Dec 15, 2025
cfa00ba
Obtain compressed size and number of elements from page header
sfc-gh-pgaur Dec 15, 2025
a1d11ee
Fix namespace depth
sfc-gh-pgaur Dec 16, 2025
719468b
Better pack the compression block header
sfc-gh-pgaur Dec 16, 2025
69b4e07
Rename class
sfc-gh-pgaur Dec 16, 2025
193a808
Rearrange fields of vector metadata for better packing
sfc-gh-pgaur Dec 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,12 @@ if(ARROW_WITH_ZSTD)
list(APPEND ARROW_UTIL_SRCS util/compression_zstd.cc)
endif()

# ALP floating-point compression sources (for Parquet encoder/decoder).
# Appended after the ARROW_WITH_ZSTD conditional above has been closed, so
# within this hunk they are not gated on any ARROW_WITH_* option.
list(APPEND ARROW_UTIL_SRCS
util/alp/alp.cc
util/alp/alp_sampler.cc
util/alp/alp_wrapper.cc)

arrow_add_object_library(ARROW_UTIL ${ARROW_UTIL_SRCS})

# Disable DLL exports in vendored uriparser library
Expand Down
7 changes: 7 additions & 0 deletions cpp/src/arrow/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ add_arrow_test(bit-utility-test
rle_encoding_test.cc
test_common.cc)

# Test target for the ALP compression code.
# NOTE(review): alp.cc, alp_sampler.cc and alp_wrapper.cc are also appended to
# ARROW_UTIL_SRCS in cpp/src/arrow/CMakeLists.txt; confirm that compiling them
# into this test target as well is intended.
add_arrow_test(alp-test
SOURCES
alp/alp_test.cc
alp/alp.cc
alp/alp_sampler.cc
alp/alp_wrapper.cc)

add_arrow_test(crc32-test
SOURCES
crc32_test.cc
Expand Down
791 changes: 791 additions & 0 deletions cpp/src/arrow/util/alp/alp.cc

Large diffs are not rendered by default.

531 changes: 531 additions & 0 deletions cpp/src/arrow/util/alp/alp.h

Large diffs are not rendered by default.

256 changes: 256 additions & 0 deletions cpp/src/arrow/util/alp/alp_constants.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Constants and type traits for ALP compression

#pragma once

#include <cstdint>

#include "arrow/util/logging.h"

namespace arrow {
namespace util {
namespace alp {

// ----------------------------------------------------------------------
// AlpConstants

/// \brief Constants used throughout ALP compression
///
/// Groups the type-independent tuning constants together with power-of-ten
/// lookup helpers. Every lookup helper validates its argument with
/// ARROW_DCHECK before indexing into its table.
class AlpConstants {
 public:
  /// Number of elements compressed together as a unit. Fixed for compatibility.
  static constexpr uint64_t kAlpVectorSize = 1024;

  /// Number of elements to use when determining sampling parameters.
  static constexpr uint64_t kSamplerVectorSize = 4096;

  /// Total number of elements in a rowgroup for sampling purposes.
  static constexpr uint64_t kSamplerRowgroupSize = 122880;

  /// Number of samples to collect per vector during the sampling phase.
  static constexpr uint64_t kSamplerSamplesPerVector = 256;

  /// Number of sample vectors to collect per rowgroup.
  static constexpr uint64_t kSamplerSampleVectorsPerRowgroup = 8;

  /// Version number for the ALP compression format.
  static constexpr uint8_t kAlpVersion = 1;

  /// Type used to store exception positions within a compressed vector.
  using PositionType = uint16_t;

  /// Threshold for early exit during sampling when compression quality is poor.
  static constexpr uint8_t kSamplingEarlyExitThreshold = 4;

  /// Maximum number of exponent-factor combinations to try during compression.
  static constexpr uint8_t kMaxCombinations = 5;

  /// Loop unroll factor for tight loops in ALP compression/decompression.
  /// ALP has multiple tight loops that profit from unrolling. Setting this
  /// might affect performance, so benchmarking is recommended.
  static constexpr uint64_t kLoopUnrolls = 4;

  /// \brief Get power of ten as uint64_t
  ///
  /// \param[in] power the exponent (must be <= 19)
  /// \return 10^power as uint64_t
  static uint64_t PowerOfTenUB8(const uint8_t power) {
    ARROW_DCHECK(power <= 19) << "power_out_of_range: " << static_cast<int>(power);
    // 10^19 is the largest power of ten representable in uint64_t, hence
    // exactly 20 entries (10^0 .. 10^19).
    static constexpr uint64_t kTable[20] = {1,
                                            10,
                                            100,
                                            1'000,
                                            10'000,
                                            100'000,
                                            1'000'000,
                                            10'000'000,
                                            100'000'000,
                                            1'000'000'000,
                                            10'000'000'000,
                                            100'000'000'000,
                                            1'000'000'000'000,
                                            10'000'000'000'000,
                                            100'000'000'000'000,
                                            1'000'000'000'000'000,
                                            10'000'000'000'000'000,
                                            100'000'000'000'000'000,
                                            1'000'000'000'000'000'000,
                                            10'000'000'000'000'000'000ULL};

    return kTable[power];
  }

  /// \brief Get power of ten as float
  ///
  /// \param[in] power the exponent (must be in range [-10, 10])
  /// \return 10^power as float
  static float PowerOfTenFloat(const int8_t power) {
    ARROW_DCHECK(power >= -10 && power <= 10)
        << "power_out_of_range: " << static_cast<int>(power);
    // Entry 0 holds 10^-10, so the table is addressed with power + 10.
    static constexpr float kTable[21] = {
        0.0000000001F, 0.000000001F, 0.00000001F, 0.0000001F,   0.000001F,
        0.00001F,      0.0001F,      0.001F,      0.01F,        0.1F,
        1.0F,          10.0F,        100.0F,      1000.0F,      10000.0F,
        100000.0F,     1000000.0F,   10000000.0F, 100000000.0F, 1000000000.0F,
        10000000000.0F};

    return kTable[power + 10];
  }

  /// \brief Get power of ten as double
  ///
  /// \param[in] power the exponent (must be in range [-20, 20])
  /// \return 10^power as double
  static double PowerOfTenDouble(const int8_t power) {
    ARROW_DCHECK(power >= -20 && power <= 20)
        << "power_out_of_range: " << static_cast<int>(power);
    // Entry 0 holds 10^-20, so the table is addressed with power + 20.
    static constexpr double kTable[41] = {
        0.00000000000000000001,
        0.0000000000000000001,
        0.000000000000000001,
        0.00000000000000001,
        0.0000000000000001,
        0.000000000000001,
        0.00000000000001,
        0.0000000000001,
        0.000000000001,
        0.00000000001,
        0.0000000001,
        0.000000001,
        0.00000001,
        0.0000001,
        0.000001,
        0.00001,
        0.0001,
        0.001,
        0.01,
        0.1,
        1.0,
        10.0,
        100.0,
        1000.0,
        10000.0,
        100000.0,
        1000000.0,
        10000000.0,
        100000000.0,
        1000000000.0,
        10000000000.0,
        100000000000.0,
        1000000000000.0,
        10000000000000.0,
        100000000000000.0,
        1000000000000000.0,
        10000000000000000.0,
        100000000000000000.0,
        1000000000000000000.0,
        10000000000000000000.0,
        100000000000000000000.0,
    };
    return kTable[power + 20];
  }

  /// \brief Get factor as int64_t
  ///
  /// \param[in] power the exponent (must be in range [0, 18]; 10^19 is not
  /// representable in int64_t)
  /// \return 10^power as int64_t
  static int64_t GetFactor(const int8_t power) {
    // Check the range here rather than relying on PowerOfTenUB8: that helper
    // accepts 19, whose result would not survive the conversion to int64_t.
    ARROW_DCHECK(power >= 0 && power <= 18)
        << "power_out_of_range: " << static_cast<int>(power);
    // Both conversions are spelled out so the signed/unsigned changes are
    // visible instead of happening implicitly.
    return static_cast<int64_t>(PowerOfTenUB8(static_cast<uint8_t>(power)));
  }
};

// ----------------------------------------------------------------------
// AlpTypedConstants

/// \brief Per-type constants for ALP compression
///
/// The generic template declares no members; the constants and helpers are
/// supplied by the explicit specializations for float and double.
///
/// \tparam FloatingPointType the floating point type (float or double)
template <typename FloatingPointType>
struct AlpTypedConstants {
  // No members in the generic case.
};

/// \brief ALP constants and helpers for the float specialization
template <>
struct AlpTypedConstants<float> {
  /// Unsigned and signed 32-bit integer types associated with float.
  using FloatingToExact = uint32_t;
  using FloatingToSignedExact = int32_t;

  /// Magic number (2^22 + 2^23) used for fast rounding of floats to the
  /// nearest integer:
  /// rounded(n) = static_cast<int32_t>(n + kMagicNumber - kMagicNumber).
  static constexpr float kMagicNumber = 12582912.0f;

  /// Largest exponent used for float.
  static constexpr uint8_t kMaxExponent = 10;

  /// Largest float value that can be safely converted to int32, and its
  /// negation as the matching lower bound.
  static constexpr float kEncodingUpperLimit = 2147483520.0f;
  static constexpr float kEncodingLowerLimit = -kEncodingUpperLimit;

  /// \brief Get exponent multiplier
  ///
  /// \param[in] power the exponent
  /// \return 10^power as float
  static float GetExponent(const uint8_t power) {
    const auto signed_power = static_cast<int8_t>(power);
    return AlpConstants::PowerOfTenFloat(signed_power);
  }

  /// \brief Get factor multiplier
  ///
  /// \param[in] power the factor
  /// \return 10^(-power) as float
  static float GetFactor(const uint8_t power) {
    // Negating an int8_t promotes the result to int, so it is narrowed back
    // to int8_t explicitly before the lookup.
    const auto signed_power = static_cast<int8_t>(power);
    const auto negated_power = static_cast<int8_t>(-signed_power);
    return AlpConstants::PowerOfTenFloat(negated_power);
  }
};

/// \brief Type-specific constants for double
///
/// Declared with `struct` to match the class-key of the primary template and
/// of the float specialization; all members are public.
template <>
struct AlpTypedConstants<double> {
  /// Magic number used for fast rounding of doubles to nearest integer:
  /// rounded(n) = static_cast<int64_t>(n + kMagicNumber - kMagicNumber).
  static constexpr double kMagicNumber = 6755399441055744.0;  // 2^51 + 2^52

  /// Largest exponent used for double; 10^18 is the largest power of ten
  /// representable in int64_t.
  static constexpr uint8_t kMaxExponent = 18;

  /// Largest double value that can be safely converted to int64.
  static constexpr double kEncodingUpperLimit = 9223372036854774784.0;
  static constexpr double kEncodingLowerLimit = -9223372036854774784.0;

  /// \brief Get exponent multiplier
  ///
  /// \param[in] power the exponent
  /// \return 10^power as double
  static double GetExponent(const uint8_t power) {
    // PowerOfTenDouble takes int8_t; make the conversion explicit.
    return AlpConstants::PowerOfTenDouble(static_cast<int8_t>(power));
  }

  /// \brief Get factor multiplier
  ///
  /// \param[in] power the factor
  /// \return 10^(-power) as double
  static double GetFactor(const uint8_t power) {
    // Negating an int8_t promotes the result to int, so it is narrowed back
    // to int8_t explicitly before the lookup.
    return AlpConstants::PowerOfTenDouble(
        static_cast<int8_t>(-static_cast<int8_t>(power)));
  }

  using FloatingToExact = uint64_t;
  using FloatingToSignedExact = int64_t;
};

} // namespace alp
} // namespace util
} // namespace arrow
Loading