Starting with v2025.12.0, nested column selection fails when fsspec.parquet.open_parquet_file is used with pyarrow on some parquet files. The changelog makes me think it has something to do with the changes introduced by #1945.
Reproducible code, tested with multiple pyarrow and CPython versions on multiple OSes:
import pyarrow.parquet as pq
from fsspec.parquet import open_parquet_file
path = "train-00000-of-00007.parquet"
column = "spectrum.flux"
with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
    table = pq.read_table(fh, columns=[column])

You can download the file from here: https://huggingface.co/datasets/MultimodalUniverse/desi/blob/main/edr_sv3/train-00000-of-00007.parquet
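For convenience, the file can also be fetched programmatically; a minimal sketch, assuming huggingface_hub is installed (repo_id and filename are taken from the URL above):

from huggingface_hub import hf_hub_download

# Download the parquet file into the local Hugging Face cache and return its path
# (assumes huggingface_hub is installed; repo_id/filename derived from the URL above).
local_path = hf_hub_download(
    repo_id="MultimodalUniverse/desi",
    filename="edr_sv3/train-00000-of-00007.parquet",
    repo_type="dataset",
)
print(local_path)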
Traceback
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[1], line 7
5 column = "spectrum.flux"
6 with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
----> 7 table = pq.read_table(fh, columns=[column])
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1899, in read_table(source, columns, use_threads, schema, use_pandas_metadata, read_dictionary, binary_type, list_type, memory_map, buffer_size, partitioning, filesystem, filters, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification, arrow_extensions_enabled)
1885 # TODO test that source is not a directory or a list
1886 dataset = ParquetFile(
1887 source, read_dictionary=read_dictionary,
1888 binary_type=binary_type,
(...) 1896 page_checksum_verification=page_checksum_verification,
1897 )
-> 1899 return dataset.read(columns=columns, use_threads=use_threads,
1900 use_pandas_metadata=use_pandas_metadata)
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1538, in ParquetDataset.read(self, columns, use_threads, use_pandas_metadata)
1530 index_columns = [
1531 col for col in _get_pandas_index_columns(metadata)
1532 if not isinstance(col, dict)
1533 ]
1534 columns = (
1535 list(columns) + list(set(index_columns) - set(columns))
1536 )
-> 1538 table = self._dataset.to_table(
1539 columns=columns, filter=self._filter_expression,
1540 use_threads=use_threads
1541 )
1543 # if use_pandas_metadata, restore the pandas metadata (which gets
1544 # lost if doing a specific `columns` selection in to_table)
1545 if use_pandas_metadata:
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:589, in pyarrow._dataset.Dataset.to_table()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:3969, in pyarrow._dataset.Scanner.to_table()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:89, in pyarrow.lib.check_status()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/fsspec/spec.py:2122, in AbstractBufferedFile.read(self, length)
2119 if length == 0:
2120 # don't even bother calling fetch
2121 return b""
-> 2122 out = self.cache._fetch(self.loc, self.loc + length)
2124 logger.debug(
2125 "%s read: %i - %i %s",
2126 self,
(...) 2129 self.cache._log_stats(),
2130 )
2131 self.loc += len(out)
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/fsspec/caching.py:706, in KnownPartsOfAFile._fetch(self, start, stop)
704 if started and not self.strict:
705 return out + b"\x00" * (stop - loc_old)
--> 706 raise ValueError
ValueError:
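The bare ValueError is raised by fsspec.caching.KnownPartsOfAFile._fetch when pyarrow asks for a byte range that open_parquet_file did not prefetch for the requested column. A small diagnostic sketch, assuming fh behaves like the AbstractBufferedFile in the traceback and its KnownPartsOfAFile cache exposes the prefetched ranges via a .data dict keyed by (start, stop):

import pyarrow.parquet as pq
from fsspec.parquet import open_parquet_file

path = "train-00000-of-00007.parquet"
column = "spectrum.flux"

with open_parquet_file(path, columns=[column], engine="pyarrow") as fh:
    # Print the byte ranges that open_parquet_file prefetched; a read
    # outside of them is what triggers the bare ValueError above.
    for start, stop in sorted(fh.cache.data):
        print(f"prefetched bytes {start}-{stop} ({stop - start} B)")
    table = pq.read_table(fh, columns=[column])  # fails as shown above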
It also fails with a different error for path = "Npix=257.parquet", column = "lightcurve.mag"; the file can be downloaded from an open S3 bucket: s3://ipac-irsa-ztf/contributed/dr23/lc/hats/ztf_dr23_lc-hats_margin_10arcsec/dataset/Norder=3/Dir=0/Npix=257.parquet
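To reproduce the second case, the file can be fetched without credentials; a minimal sketch, assuming s3fs is installed and the bucket permits anonymous reads:

import fsspec

# Anonymous download of the public ZTF file (assumes s3fs is installed
# and the ipac-irsa-ztf bucket allows anonymous access).
fs = fsspec.filesystem("s3", anon=True)
fs.get(
    "ipac-irsa-ztf/contributed/dr23/lc/hats/ztf_dr23_lc-hats_margin_10arcsec/"
    "dataset/Norder=3/Dir=0/Npix=257.parquet",
    "Npix=257.parquet",
)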
Traceback
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Cell In[1], line 7
5 column = "lightcurve.mag"
6 with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
----> 7 table = pq.read_table(fh, columns=[column])
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1899, in read_table(source, columns, use_threads, schema, use_pandas_metadata, read_dictionary, binary_type, list_type, memory_map, buffer_size, partitioning, filesystem, filters, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification, arrow_extensions_enabled)
1885 # TODO test that source is not a directory or a list
1886 dataset = ParquetFile(
1887 source, read_dictionary=read_dictionary,
1888 binary_type=binary_type,
(...) 1896 page_checksum_verification=page_checksum_verification,
1897 )
-> 1899 return dataset.read(columns=columns, use_threads=use_threads,
1900 use_pandas_metadata=use_pandas_metadata)
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1538, in ParquetDataset.read(self, columns, use_threads, use_pandas_metadata)
1530 index_columns = [
1531 col for col in _get_pandas_index_columns(metadata)
1532 if not isinstance(col, dict)
1533 ]
1534 columns = (
1535 list(columns) + list(set(index_columns) - set(columns))
1536 )
-> 1538 table = self._dataset.to_table(
1539 columns=columns, filter=self._filter_expression,
1540 use_threads=use_threads
1541 )
1543 # if use_pandas_metadata, restore the pandas metadata (which gets
1544 # lost if doing a specific `columns` selection in to_table)
1545 if use_pandas_metadata:
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:589, in pyarrow._dataset.Dataset.to_table()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:3969, in pyarrow._dataset.Scanner.to_table()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()
OSError: Couldn't deserialize thrift: invalid TType
Deserializing page header failed.
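For triage it may help to record the exact versions involved; pinning fsspec below the release named above is an untested workaround candidate based only on the version where the regression appeared, not something verified here:

import fsspec
import pyarrow

# Versions involved in the failing runs.
print("fsspec :", fsspec.__version__)
print("pyarrow:", pyarrow.__version__)

# Untested workaround candidate (assumption based on the version in this report):
#   pip install "fsspec<2025.12.0"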