From b0789936607ee15161241db8a83d7c279612b218 Mon Sep 17 00:00:00 2001 From: Ihor Indyk Date: Fri, 16 Jan 2026 14:23:31 -0800 Subject: [PATCH] Improve RTD contents and structure. PiperOrigin-RevId: 857303280 --- docs/data_sources/protocol.rst | 2 +- docs/grain.checkpoint.rst | 13 +-- docs/grain.constants.rst | 22 +++-- docs/grain.data_loader.rst | 28 +++++++ docs/grain.dataset.rst | 2 + docs/grain.experimental.rst | 140 ++++++++++++++++++++++--------- docs/grain.multiprocessing.rst | 8 +- docs/grain.rst | 10 ++- docs/grain.samplers.rst | 12 +-- docs/grain.sharding.rst | 10 +-- docs/grain.transforms.rst | 26 +++--- docs/index.md | 43 +++++----- docs/installation.md | 17 +++- grain/_src/core/constants.py | 6 ++ grain/_src/python/data_loader.py | 1 + grain/_src/python/samplers.py | 9 ++ 16 files changed, 243 insertions(+), 106 deletions(-) create mode 100644 docs/grain.data_loader.rst diff --git a/docs/data_sources/protocol.rst b/docs/data_sources/protocol.rst index a3058d1a0..d32b4d5e9 100644 --- a/docs/data_sources/protocol.rst +++ b/docs/data_sources/protocol.rst @@ -113,5 +113,5 @@ File systems Grain supports the formats mentioned above in combination with the following file systems (in addition to the local file system): -* :doc:`S3 <../tutorials/data_sources/load_from_s3_tutorial>` * :doc:`GCS <../tutorials/data_sources/load_from_gcs_tutorial>` +* :doc:`Amazon S3 <../tutorials/data_sources/load_from_s3_tutorial>` diff --git a/docs/grain.checkpoint.rst b/docs/grain.checkpoint.rst index 847b7bbac..c5fc46eb7 100644 --- a/docs/grain.checkpoint.rst +++ b/docs/grain.checkpoint.rst @@ -3,13 +3,16 @@ =========================== .. automodule:: grain.checkpoint +.. currentmodule:: grain.checkpoint List of Members --------------- -.. autosummary:: - :toctree: _autosummary +.. autoclass:: CheckpointHandler + :members: - CheckpointHandler - CheckpointSave - CheckpointRestore \ No newline at end of file +.. autoclass:: CheckpointSave + :members: + +.. 
autoclass:: CheckpointRestore + :members: \ No newline at end of file diff --git a/docs/grain.constants.rst b/docs/grain.constants.rst index 66ba8abe9..be96fabe7 100644 --- a/docs/grain.constants.rst +++ b/docs/grain.constants.rst @@ -2,17 +2,21 @@ ========================== .. automodule:: grain.constants + :noindex: List of Constants ----------------- -.. autosummary:: - :toctree: _autosummary +.. autodata:: grain.constants.DATASET_INDEX - DATASET_INDEX - EPOCH - INDEX - META_FEATURES - RECORD - RECORD_KEY - SEED \ No newline at end of file +.. autodata:: grain.constants.EPOCH + +.. autodata:: grain.constants.INDEX + +.. autodata:: grain.constants.META_FEATURES + +.. autodata:: grain.constants.RECORD + +.. autodata:: grain.constants.RECORD_KEY + +.. autodata:: grain.constants.SEED \ No newline at end of file diff --git a/docs/grain.data_loader.rst b/docs/grain.data_loader.rst new file mode 100644 index 000000000..9f9004136 --- /dev/null +++ b/docs/grain.data_loader.rst @@ -0,0 +1,28 @@ +``grain`` DataLoader +==================== + +.. automodule:: grain._src.python.data_loader +.. currentmodule:: grain + +List of Members +--------------- + +.. autofunction:: load + +.. autoclass:: DataLoader + :special-members: __init__, __iter__ + :members: + :undoc-members: + +.. autoclass:: DataLoaderIterator + :special-members: __init__, __iter__, __next__ + :members: + :undoc-members: + +.. autoclass:: Record + :members: + :undoc-members: + +.. autoclass:: RecordMetadata + :members: + :undoc-members: diff --git a/docs/grain.dataset.rst b/docs/grain.dataset.rst index bc548ca56..32c7e7b47 100644 --- a/docs/grain.dataset.rst +++ b/docs/grain.dataset.rst @@ -33,3 +33,5 @@ List of Members :show-inheritance: :inherited-members: :undoc-members: + +.. 
autoclass:: ReadOptions diff --git a/docs/grain.experimental.rst b/docs/grain.experimental.rst index 4fbd983ae..4741ed495 100644 --- a/docs/grain.experimental.rst +++ b/docs/grain.experimental.rst @@ -6,43 +6,103 @@ List of Members --------------- -.. autosummary:: - :toctree: _autosummary - - FlatMapTransform - DatasetOptions - ExecutionTrackingMode - apply_transformations - ElasticIterator - WithOptionsIterDataset - ParquetIterDataset - TFRecordIterDataset - batch_and_pad - CacheIterDataset - FlatMapMapDataset - FlatMapIterDataset - InterleaveIterDataset - LimitIterDataset - RngPool - FirstFitPackIterDataset - BestFitPackIterDataset - BOSHandling - ConcatThenSplitIterDataset - multithread_prefetch - ThreadPrefetchIterDataset - ThreadPrefetchDatasetIterator - RebatchIterDataset - RepeatIterDataset - WindowShuffleMapDataset - WindowShuffleIterDataset - ZipMapDataset - ZipIterDataset - PackAndBatchOperation - index_shuffle - assert_equal_output_after_checkpoint - device_put - PerformanceConfig - pick_performance_config - get_element_spec - set_next_index - get_next_index +.. autoclass:: FlatMapTransform + :members: + +.. autoclass:: DatasetOptions + :members: + +.. autoclass:: ExecutionTrackingMode + :members: + +.. autofunction:: apply_transformations + +.. autoclass:: ElasticIterator + :special-members: __init__, __iter__, __next__ + +.. autoclass:: WithOptionsIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: ParquetIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: TFRecordIterDataset + :special-members: __init__, __iter__ + +.. autofunction:: batch_and_pad + +.. autoclass:: CacheIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: FlatMapMapDataset + :special-members: __init__, __getitem__ + +.. autoclass:: FlatMapIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: InterleaveIterDataset + :special-members: __init__, __iter__ + +.. 
autoclass:: LimitIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: RngPool + :members: + +.. autoclass:: FirstFitPackIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: BestFitPackIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: BOSHandling + :members: + +.. autoclass:: ConcatThenSplitIterDataset + :special-members: __init__, __iter__ + +.. autofunction:: multithread_prefetch + +.. autoclass:: ThreadPrefetchIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: ThreadPrefetchDatasetIterator + :special-members: __init__, __iter__, __next__ + +.. autoclass:: RebatchIterDataset + :special-members: __init__, __iter__ + +.. autoclass:: RepeatIterDataset + :special-members: __init__, __iter__ + + +.. autoclass:: WindowShuffleMapDataset + :special-members: __init__, __getitem__ + +.. autoclass:: WindowShuffleIterDataset + :special-members: __init__, __iter__ + + +.. autoclass:: ZipMapDataset + :special-members: __init__, __getitem__ + + +.. autoclass:: ZipIterDataset + :special-members: __init__, __iter__ + +.. autofunction:: index_shuffle + +.. autofunction:: assert_equal_output_after_checkpoint + +.. autofunction:: device_put + +.. autoclass:: PerformanceConfig + :members: + +.. autofunction:: pick_performance_config + +.. autofunction:: get_element_spec + +.. autofunction:: set_next_index + +.. autofunction:: get_next_index diff --git a/docs/grain.multiprocessing.rst b/docs/grain.multiprocessing.rst index c0df6cc9b..fd5805276 100644 --- a/docs/grain.multiprocessing.rst +++ b/docs/grain.multiprocessing.rst @@ -6,8 +6,8 @@ List of Members --------------- -.. autosummary:: - :toctree: _autosummary +.. autoclass:: MultiprocessingOptions + +.. autoclass:: SharedMemoryArray + :members: - MultiprocessingOptions - SharedMemoryArray diff --git a/docs/grain.rst b/docs/grain.rst index 71e901b37..298e4ea27 100644 --- a/docs/grain.rst +++ b/docs/grain.rst @@ -1,8 +1,15 @@ .. 
currentmodule:: grain -Public API: ``grain`` package +``grain`` package ============================= +.. toctree:: + :hidden: + :caption: Core APIs + + Dataset APIs + DataLoader APIs + Subpackages ----------- @@ -35,7 +42,6 @@ Simple high-level pipelines .. autosummary:: - :toctree: _autosummary load DataLoader diff --git a/docs/grain.samplers.rst b/docs/grain.samplers.rst index 6b0da6721..8489b91d5 100644 --- a/docs/grain.samplers.rst +++ b/docs/grain.samplers.rst @@ -6,9 +6,11 @@ List of Members --------------- -.. autosummary:: - :toctree: _autosummary +.. autoclass:: IndexSampler + :special-members: __init__, __len__, __getitem__ - IndexSampler - Sampler - SequentialSampler +.. autoclass:: Sampler + :special-members: __init__, __len__, __getitem__ + +.. autoclass:: SequentialSampler + :special-members: __init__, __len__, __getitem__ diff --git a/docs/grain.sharding.rst b/docs/grain.sharding.rst index aeb761e26..4012cc7f6 100644 --- a/docs/grain.sharding.rst +++ b/docs/grain.sharding.rst @@ -6,9 +6,9 @@ List of Members --------------- -.. autosummary:: - :toctree: _autosummary +.. autoclass:: ShardOptions + +.. autoclass:: NoSharding + +.. autoclass:: ShardByJaxProcess - NoSharding - ShardByJaxProcess - ShardOptions diff --git a/docs/grain.transforms.rst b/docs/grain.transforms.rst index 73f49ca1b..e3ec91aa2 100644 --- a/docs/grain.transforms.rst +++ b/docs/grain.transforms.rst @@ -6,14 +6,18 @@ List of Members --------------- -.. autosummary:: - :toctree: _autosummary - - Batch - Filter - Map - MapWithIndex - RandomMap - Transformation - Transformations - DatasetSelectionMap +.. autoclass:: Batch + +.. autoclass:: Filter + +.. autoclass:: Map + +.. autoclass:: MapWithIndex + +.. autoclass:: RandomMap + +.. autoclass:: Transformation + +.. autoclass:: Transformations + +.. 
autoclass:: DatasetSelectionMap diff --git a/docs/index.md b/docs/index.md index c6bbf186a..4838af60b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -46,39 +46,41 @@ not depend on TensorFlow. :caption: Get started installation api_choice -behind_the_scenes ``` ``` {toctree} :maxdepth: 1 :hidden: -:caption: Data Sources +:caption: Data sources data_sources/protocol -tutorials/data_sources/bagz_data_source_tutorial -tutorials/data_sources/arrayrecord_data_source_tutorial -tutorials/data_sources/huggingface_dataset_tutorial -tutorials/data_sources/pytorch_dataset_tutorial -tutorials/data_sources/parquet_dataset_tutorial -tutorials/data_sources/load_from_s3_tutorial -tutorials/data_sources/load_from_gcs_tutorial +Bagz +ArrayRecord +Parquet +TFRecord +TFDS +HuggingFace +PyTorch +GCS +Amazon S3 ``` ``` {toctree} :maxdepth: 1 :hidden: -:caption: Data Loader -data_loader/samplers -data_loader/transformations +:caption: Dataset +Basics +Advanced usage +Transformations +Performance debugging ``` -```{toctree} -:maxdepth: 3 +``` {toctree} +:maxdepth: 1 :hidden: -:caption: Tutorials -tutorials/data_loader_tutorial -tutorials/dataset_basic_tutorial -tutorials/dataset_advanced_tutorial -tutorials/dataset_debugging_tutorial +:caption: DataLoader +data_loader/samplers +data_loader/transformations +Tutorial ``` ``` {toctree} @@ -92,6 +94,7 @@ changelog ``` {toctree} :maxdepth: 1 :hidden: -:caption: Contributor guides +:caption: For contributors +behind_the_scenes CONTRIBUTING ``` \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index f777fd07a..80fcfc171 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -2,8 +2,17 @@ -To install Grain, you can use pip: +Grain is available through multiple package managers: -```bash -pip install grain -``` +* **pip**, install from + [PyPI](https://pypi.org/project/grain/#description) with + ```bash + pip install grain + ``` + +* **Anaconda**, install from + 
[conda-forge](https://anaconda.org/channels/conda-forge/packages/grain/overview) + with + ```bash + conda install conda-forge::grain + ``` \ No newline at end of file diff --git a/grain/_src/core/constants.py b/grain/_src/core/constants.py index ef3d3f57d..33f217569 100644 --- a/grain/_src/core/constants.py +++ b/grain/_src/core/constants.py @@ -17,20 +17,26 @@ # on top of it). These features are generated on the fly and help to track # progress over the dataset. Users can read these but shouldn't alter them. They # start with "_" to indicate that they are "private". + # Index into the stream of all records (globally unique). Starts with 0. INDEX = "_index" + # Key of the record. If DATASET_INDEX is present it's the key in the dataset. # Starts with 0. RECORD_KEY = "_record_key" + # Index of the dataset from which to take the record. Only present when mixing. # Starts with 0. DATASET_INDEX = "_dataset_index" + # Epoch for the record. When mixing datasets this is the epoch over the dataset, # not the mixture. Starts with 1. EPOCH = "_epoch" + # Random seed for stateless random operations. This is unique per record # and changes every epoch. SEED = "_seed" + # Serialized record. RECORD = "_record" diff --git a/grain/_src/python/data_loader.py b/grain/_src/python/data_loader.py index a373a0934..fd911aacb 100644 --- a/grain/_src/python/data_loader.py +++ b/grain/_src/python/data_loader.py @@ -533,6 +533,7 @@ class DataLoaderIterator(collections.abc.Iterator[_T]): produced records among worker processes in a round robin fashion. Generally, some workers can process more elements than others at a given training step. Checkpointing logic goes as follows: + 1) With each output batch produced, GrainPool emits the worker_index of The worker that processed the batch. 2) DataLoaderIterator keeps track of the last_seen_index at each worker. 
diff --git a/grain/_src/python/samplers.py b/grain/_src/python/samplers.py index a64b30766..b7deb8d6c 100644 --- a/grain/_src/python/samplers.py +++ b/grain/_src/python/samplers.py @@ -117,6 +117,15 @@ class IndexSampler: This index sampler supports the following operations: - Sharding of the dataset. - Global shuffle of the dataset. + - Repeat the dataset for a fixed number of epochs or infinitely. + + Attributes: + num_records: Number of records in the data source. + shard_options: Sharding options for the dataset. + shuffle: Whether to globally shuffle the dataset. + num_epochs: Number of epochs to repeat the dataset. If None, the dataset + will be repeated infinitely. + seed: Seed for shuffling the dataset. """ def __init__(