Skip to content

Regression in fsspec.parquet.open_parquet_file in fsspec>=2025.12 #1973

@hombit

Description

@hombit

Starting with v2025.12.0, nested column selection fails with fsspec.parquet.open_parquet_file, pyarrow, and some parquet files. The changelog makes me think that it has something to do with the changes introduced by #1945.

Reproducible code, tested with multiple pyarrow and CPython versions on multiple OSes:

import pyarrow.parquet as pq
from fsspec.parquet import open_parquet_file

path = "train-00000-of-00007.parquet"
column = "spectrum.flux"
with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
    table = pq.read_table(fh, columns=[column])

You can download the file from here: https://huggingface.co/datasets/MultimodalUniverse/desi/blob/main/edr_sv3/train-00000-of-00007.parquet

Traceback
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[1], line 7
      5 column = "spectrum.flux"
      6 with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
----> 7     table = pq.read_table(fh, columns=[column])

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1899, in read_table(source, columns, use_threads, schema, use_pandas_metadata, read_dictionary, binary_type, list_type, memory_map, buffer_size, partitioning, filesystem, filters, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification, arrow_extensions_enabled)
   1885     # TODO test that source is not a directory or a list
   1886     dataset = ParquetFile(
   1887         source, read_dictionary=read_dictionary,
   1888         binary_type=binary_type,
   (...)   1896         page_checksum_verification=page_checksum_verification,
   1897     )
-> 1899 return dataset.read(columns=columns, use_threads=use_threads,
   1900                     use_pandas_metadata=use_pandas_metadata)

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1538, in ParquetDataset.read(self, columns, use_threads, use_pandas_metadata)
   1530         index_columns = [
   1531             col for col in _get_pandas_index_columns(metadata)
   1532             if not isinstance(col, dict)
   1533         ]
   1534         columns = (
   1535             list(columns) + list(set(index_columns) - set(columns))
   1536         )
-> 1538 table = self._dataset.to_table(
   1539     columns=columns, filter=self._filter_expression,
   1540     use_threads=use_threads
   1541 )
   1543 # if use_pandas_metadata, restore the pandas metadata (which gets
   1544 # lost if doing a specific `columns` selection in to_table)
   1545 if use_pandas_metadata:

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:589, in pyarrow._dataset.Dataset.to_table()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:3969, in pyarrow._dataset.Scanner.to_table()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:89, in pyarrow.lib.check_status()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/fsspec/spec.py:2122, in AbstractBufferedFile.read(self, length)
   2119 if length == 0:
   2120     # don't even bother calling fetch
   2121     return b""
-> 2122 out = self.cache._fetch(self.loc, self.loc + length)
   2124 logger.debug(
   2125     "%s read: %i - %i %s",
   2126     self,
   (...)   2129     self.cache._log_stats(),
   2130 )
   2131 self.loc += len(out)

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/fsspec/caching.py:706, in KnownPartsOfAFile._fetch(self, start, stop)
    704 if started and not self.strict:
    705     return out + b"\x00" * (stop - loc_old)
--> 706 raise ValueError

ValueError:

It also fails with a different error for `path = "Npix=257.parquet"` and `column = "lightcurve.mag"`; that file can be downloaded from an open S3 bucket: s3://ipac-irsa-ztf/contributed/dr23/lc/hats/ztf_dr23_lc-hats_margin_10arcsec/dataset/Norder=3/Dir=0/Npix=257.parquet

Traceback
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
Cell In[1], line 7
      5 column = "lightcurve.mag"
      6 with open_parquet_file(path, columns=[column], engine='pyarrow') as fh:
----> 7     table = pq.read_table(fh, columns=[column])

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1899, in read_table(source, columns, use_threads, schema, use_pandas_metadata, read_dictionary, binary_type, list_type, memory_map, buffer_size, partitioning, filesystem, filters, ignore_prefixes, pre_buffer, coerce_int96_timestamp_unit, decryption_properties, thrift_string_size_limit, thrift_container_size_limit, page_checksum_verification, arrow_extensions_enabled)
   1885     # TODO test that source is not a directory or a list
   1886     dataset = ParquetFile(
   1887         source, read_dictionary=read_dictionary,
   1888         binary_type=binary_type,
   (...)   1896         page_checksum_verification=page_checksum_verification,
   1897     )
-> 1899 return dataset.read(columns=columns, use_threads=use_threads,
   1900                     use_pandas_metadata=use_pandas_metadata)

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/parquet/core.py:1538, in ParquetDataset.read(self, columns, use_threads, use_pandas_metadata)
   1530         index_columns = [
   1531             col for col in _get_pandas_index_columns(metadata)
   1532             if not isinstance(col, dict)
   1533         ]
   1534         columns = (
   1535             list(columns) + list(set(index_columns) - set(columns))
   1536         )
-> 1538 table = self._dataset.to_table(
   1539     columns=columns, filter=self._filter_expression,
   1540     use_threads=use_threads
   1541 )
   1543 # if use_pandas_metadata, restore the pandas metadata (which gets
   1544 # lost if doing a specific `columns` selection in to_table)
   1545 if use_pandas_metadata:

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:589, in pyarrow._dataset.Dataset.to_table()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/_dataset.pyx:3969, in pyarrow._dataset.Scanner.to_table()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()

File ~/.virtualenvs/nested-pandas/lib/python3.14/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

OSError: Couldn't deserialize thrift: invalid TType
Deserializing page header failed.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions