-
Notifications
You must be signed in to change notification settings - Fork 11
Open
Description
With Legate 26.01, `parquet_read` run with `LDF_PREFER_EAGER_ALLOCATIONS=1` throws `cudaErrorInvalidDevice: invalid device ordinal` when the problem is overdecomposed (e.g. `--od-factor 2` in the command below).
The conda env:
channels:
- rapidsai
- legate
- legate/label/rc
- legate-nightly
- conda-forge
- nvidia
dependencies:
- c-compiler
- clang-tools>=19.0
- clang-tools>=8
- clang>=8
- clangxx>=19.0
- cmake>=3.24,!=3.25.0
- cmake>=3.26.4,!=3.30.0
- coverage
- cuda-cudart-dev
- cuda-cudart-static
- cuda-cupti-dev
- cuda-driver-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-profiler-api
- cuda-sanitizer-api
- cuda-toolkit
- cuda-version=12.9
- cuda-version>=12.2
- cudf==25.08.*,>=0.0.0a0
- cupy>=12.0.0
- cupynumeric==26.01.*,>=0.0.0.dev0
- cutensor<=2.2.0
- cutensor>=2,<2.3
- cxx-compiler
- cython>=3.0.1
- cython>=3.0.3
- dask-cuda==25.08.*
- dask-cudf==25.08.*
- doxygen
- elfutils
- gcc=11.*
- gcc_linux-64=11.*
- git
- gperftools
- gxx=11.*
- h5py
- hdf5
- hypothesis>=6
- ipython
- jinja2
- legate==26.01.*,>=0.0.0.dev0
- libarrow-acero
- libblas=*=*openblas*
- libcublas-dev
- libcudf==25.08.*,>=0.0.0a0
- libcudss-dev
- libcufft-dev
- libcufile-dev
- libcurand-dev
- libcusolver-dev
- libcusolvermp-dev>=0.7
- libcusparse-dev
- libhwloc=*=*default*
- libnvjitlink-dev
- librmm==25.08.*,>=0.0.0a0
- llvm-openmp
- make
- markdown
- matplotlib>=3.9
- mock
- mypy>=0.961
- mypy>=1.13
- myst-parser>=4.0
- nbconvert>=7.16
- nccl<2.29
- nccl>=2.19
- ninja
- ninja>=1.11.1.1
- notebook>=7
- numba
- numpy >=1.23,<3.0.0a0
- numpy<2.0
- numpy>=1.22,!=2.1.0
- nvidia-ml-py
- onnx>=1.10
- onnxmltools>=1.10
- openblas
- openblas<=0.3.21
- openblas=*=*openmp*
- openmpi<6
- openssh
- openssl
- opt_einsum
- pandoc
- pip
- pip<=25.2
- pkg-config
- polars>=1.25,<1.32
- pre-commit
- psutil
- pydata-sphinx-theme>=0.16
- pydata-sphinx-theme>=0.16.0
- pylibcudf==25.08.*,>=0.0.0a0
- pynvjitlink<=0.6
- pytest
- pytest-cov
- pytest-mock
- pytest>=7,<8
- pytest>=7.0
- python-build>=1.2.0
- python=3.12
- python>=3.11
- python>=3.11,<3.13
- rapids-build-backend>=0.3.2,<0.4.0.dev0
- rich
- rust
- scikit-build-core>=0.10.0
- scikit-build>=0.13.1
- scikit-learn>=1.6
- scipy
- seaborn>=0.13
- setuptools>60,<=75.3.0
- sphinx>=8.0,<8.2.0
- sysroot_linux-64==2.17
- tifffile
- types-docutils
- typing-extensions>=4.0
- ucc
- ucx>=1.16
- valgrind
- xgboost>=2.0
- zarr<3
- zlib
- pip:
- breathe>=4.35.0
- myst-parser
- nbsphinx
- nvidia-sphinx-theme
- sphinx-copybutton
- sphinx>=8.2
name: all_cuda-122
The program:
import argparse
import numpy as np
from legate.timing import time
from legate.io.hdf5 import to_file
from legate.core import get_legate_runtime, Scope, ParallelPolicy
import legate_dataframe as ldf
from legate_dataframe.lib.parquet import parquet_read
from legate_dataframe.lib import join
from legate_dataframe.lib.binaryop import binary_operation
from legate_dataframe.lib.timestamps import (
extract_timestamp_component,
)
def process_data(
    path_customers: str, path_transactions: str, path_output: str, od_factor
):
    """Read the transactions parquet data under an overdecomposed scope.

    Only ``path_transactions`` and ``od_factor`` are used; ``path_customers``
    and ``path_output`` are accepted but never read by this function.

    Parameters
    ----------
    path_customers : str
        Unused.
    path_transactions : str
        Path handed to ``parquet_read``.
    path_output : str
        Unused.
    od_factor
        Value passed as ``overdecompose_factor`` to ``ParallelPolicy``.
    """
    print("OD FACTOR", od_factor)

    policy = ParallelPolicy(overdecompose_factor=od_factor)
    with Scope(parallel_policy=policy):
        table = parquet_read(
            path_transactions,
            columns=["amount", "senderID", "isFraud", "time"],
            ignore_row_groups=True,
        )
        # NOTE(review): the pasted original lost its indentation; the `del`
        # is assumed to sit inside the Scope block -- confirm.
        del table
def barrier():
    """Issue a blocking execution fence on the Legate runtime."""
    runtime = get_legate_runtime()
    runtime.issue_execution_fence(block=True)
def main():
    """Parse the command line, run ``process_data`` once, and print its time.

    Positional arguments are ``customers``, ``transactions`` and ``output``;
    ``--od-factor`` is an optional integer that defaults to 4.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("customers", help="Path to the financial_account parquet files")
    cli.add_argument(
        "transactions", help="Path to the financial_transactions parquet files"
    )
    cli.add_argument("output", help="Path to write parquet files")
    cli.add_argument(
        "--od-factor", type=int, default=4, help="Overdecompositon factor"
    )
    opts = cli.parse_args()

    print("Running ETL pipeline - Legate Streaming - HDF5")

    # Fence on both sides so the two timestamps bracket process_data.
    barrier()
    started = time()
    process_data(opts.customers, opts.transactions, opts.output, opts.od_factor)
    barrier()
    finished = time()

    # NOTE(review): dividing by 1e6 presumes the timer reports microseconds -- confirm.
    elapsed = finished - started
    print(f"ETL time: {elapsed / 1000000.0:.2f}s")


if __name__ == "__main__":
    main()
CMD:
LEGATE_SHOW_CONFIG=1 LDF_PREFER_EAGER_ALLOCATIONS=1 LEGATE_TEST=1 legate --gpus 1 ../test/test_store_del.py /home/reazulh/sandbox/gpu-xb-ai/data/financial_account_no_duplicate_no_compression.parquet /home/reazulh/sandbox/gpu-xb-ai/data/financial_transactions_no_compression.parquet ./ --od-factor 2
Actual Error:
#0 Task "legate::dataframe::task::ParquetReadByRows" threw an unexpected exception (thrust::system::system_error): "after determining tmp storage requirements for exclusive_scan: cudaErrorInvalidDevice: invalid device ordinal". If this exception is expected to be thrown (and handled), then you must explicitly inform the runtime that this task may raise an exception via the AutoTask/ManualTask throws_exception() member function on task construction. Furthermore, you must wrap the exception in an instance of legate::TaskException. Note that throwing exceptions from tasks is discouraged as it has severe performance implications. For example, the runtime is required to block the caller on the completion of the task. It should only be used as a last resort.
Backtrace with gdb:
#0 __cxxabiv1::__cxa_throw (obj=0x7fccf88e6c80, tinfo=0x7fffe942d2b0 <typeinfo for std::runtime_error>, dest=0x7fffe93163b8 <std::runtime_error::~runtime_error()>)
at ../../../../libstdc++-v3/libsupc++/eh_throw.cc:80
#1 0x00007fffe91e77fc in kvikio::cudaAPI::cudaAPI() [clone .cold] () from /home/reazulh/miniconda3/envs/ldf-test/lib/libkvikio.so
#2 0x00007fffe922a5fc in kvikio::cudaAPI::instance() () from /home/reazulh/miniconda3/envs/ldf-test/lib/libkvikio.so
#3 0x00007fffe922d127 in kvikio::is_host_memory(void const*) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libkvikio.so
#4 0x00007fffe921c60b in kvikio::FileHandle::pread(void*, unsigned long, unsigned long, unsigned long, unsigned long, bool) ()
from /home/reazulh/miniconda3/envs/ldf-test/lib/libkvikio.so
#5 0x00007fccca76792e in cudf::io::(anonymous namespace)::kvikio_source<kvikio::FileHandle>::host_read(unsigned long, unsigned long) ()
from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#6 0x00007fccca6c346e in cudf::io::parquet::detail::metadata::metadata(cudf::io::datasource*) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#7 0x00007fccca6cf943 in cudf::io::parquet::detail::aggregate_reader_metadata::metadatas_from_sources(cudf::host_span<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> > const, 18446744073709551615ul>) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#8 0x00007fccca6d0a89 in cudf::io::parquet::detail::aggregate_reader_metadata::aggregate_reader_metadata(cudf::host_span<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> > const, 18446744073709551615ul>, bool, bool) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#9 0x00007fccca686f65 in cudf::io::parquet::detail::reader_impl::reader_impl(unsigned long, unsigned long, std::vector<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> >, std::allocator<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> > > >&&, cudf::io::parquet_reader_options const&, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>) ()
from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#10 0x00007fccca6883cb in cudf::io::parquet::detail::reader_impl::reader_impl(std::vector<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> >, std::allocator<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> > > >&&, cudf::io::parquet_reader_options const&, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#11 0x00007fccca67cae8 in cudf::io::parquet::detail::reader::reader(std::vector<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> >, std::allocator<std::unique_ptr<cudf::io::datasource, std::default_delete<cudf::io::datasource> > > >&&, cudf::io::parquet_reader_options const&, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#12 0x00007fccca3d3ff5 in cudf::io::read_parquet(cudf::io::parquet_reader_options const&, rmm::cuda_stream_view, cuda::mr::__4::basic_resource_ref<(cuda::mr::__4::_AllocType)1, cuda::mr::__4::device_accessible>) () from /home/reazulh/miniconda3/envs/ldf-test/lib/libcudf.so
#13 0x00007fff9ccf541f in legate::dataframe::task::ParquetReadByRows::gpu_variant (context=...) at /home/reazulh/sandbox/reazul-legate-dataframe/cpp/src/parquet.cu:166
#14 0x00007fffea04cf4a in legate::detail::task_detail::task_body<const legate::detail::legion_task_body(legate::VariantImpl, legate::VariantCode, std::optional<std::basic_string_view<char> >, void const*, std::size_t, Legion::Processor)::<lambda()>&>(legate::TaskContext, legate::VariantImpl, const struct {...} &) (ctx=...,
variant_impl=0x7fff9ccf4ee0 <legate::dataframe::task::ParquetReadByRows::gpu_variant(legate::TaskContext)>, get_task_name=...)
at /home/reazulh/sandbox/reazul.legate.internal/src/cpp/legate/task/detail/task.inl:71
#15 0x00007fffea04c9d5 in legate::detail::legion_task_body (variant_impl=0x7fff9ccf4ee0 <legate::dataframe::task::ParquetReadByRows::gpu_variant(legate::TaskContext)>,
variant_kind=legate::VariantCode::GPU, task_name=std::optional<std::basic_string_view<char, std::char_traits<char> >> = {...}, args=0x7fcd0271d920, arglen=8, p=...)
at /home/reazulh/sandbox/reazul.legate.internal/src/cpp/legate/task/detail/legion_task_body.cc:233
#16 0x00007fffe9fc53bd in legate::detail::task_wrapper (variant_impl=0x7fff9ccf4ee0 <legate::dataframe::task::ParquetReadByRows::gpu_variant(legate::TaskContext)>,
variant_kind=legate::VariantCode::GPU, task_name=std::optional<std::string_view> = {...}, args=0x7fcd0271d920, arglen=8, p=...)
at /home/reazulh/sandbox/reazul.legate.internal/src/cpp/legate/task/task.cc:25
#17 0x00007fff9ccf1dda in legate::LegateTask<legate::dataframe::task::ParquetReadByRows>::task_wrapper_<&legate::dataframe::task::ParquetReadByRows::gpu_variant, (legate::VariantCode)2> (args=0x7fcd0271d920, arglen=8, userdata=0x0, userlen=0, p=...) at /home/reazulh/miniconda3/envs/ldf-test/include/legate/legate/task/task.inl:110
#18 0x00007fffc9bca553 in Realm::Cuda::GPUProcessor::execute_task (this=0x55555779b780, func_id=78, task_args=...)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/cuda/cuda_module.cc:1292
#19 0x00007fffc95d02bc in Realm::Task::execute_on_processor (this=0x7fcd0271d7a0, p=...)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/tasks.cc:340
#20 0x00007fffc95d42aa in Realm::KernelThreadTaskScheduler::execute_task (this=0x555555df6db0, task=0x7fcd0271d7a0)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/tasks.cc:1446
#21 0x00007fffc95d30e7 in Realm::ThreadedTaskScheduler::scheduler_loop (this=0x555555df6db0)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/tasks.cc:1183
#22 0x00007fffc95d3716 in Realm::ThreadedTaskScheduler::scheduler_loop_wlock (this=0x555555df6db0)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/tasks.cc:1297
#23 0x00007fffc95da934 in Realm::Thread::thread_entry_wrapper<Realm::ThreadedTaskScheduler, &Realm::ThreadedTaskScheduler::scheduler_loop_wlock> (obj=0x555555df6db0)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/../realm/threads.inl:91
#24 0x00007fffc95e6f3c in Realm::KernelThread::pthread_entry (data=0x555557833880)
at /home/reazulh/sandbox/reazul.legate.internal/arch-linux-py-cuda-debug/cmake_build/_deps/realm-src/src/realm/threads.cc:869
#25 0x00007ffff7f80609 in start_thread (arg=<optimized out>) at pthread_create.c:477
#26 0x00007ffff7d4b353 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
I have tried multiple versions of kvikio: 25.08 and 25.10.
Legate.core commit: f78f3ed2d5bb32922287941487959eb8d2a6e5f6
Legate-dataframe commit: b093d90e69ffd658f6ea381347e00f83375bae6d
I have also tried to create a simple reproducer using only legate.core, but did not succeed, which suggests with high probability that this is a legate-dataframe issue.