diff --git a/.gitignore b/.gitignore
index 7881d3a00..3611cec51 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,3 +148,4 @@ dmypy.json
 # Gemini CLI
 .gemini/
 gha-creds-*.json
+docs/jupyter_execute/
diff --git a/docs/conf.py b/docs/conf.py
index 21f27650a..a095c50b1 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -59,6 +59,9 @@
 ]
 myst_linkify_fuzzy_links = False
 
+# Notebook execution mode
+nb_execution_mode = "off"
+
 # Theme-specific options
 # https://sphinx-book-theme.readthedocs.io/en/stable/reference.html
 html_theme_options = {
@@ -76,4 +79,6 @@
     "run_maxtext/run_maxtext_via_multihost_job.md",
     "run_maxtext/run_maxtext_via_multihost_runner.md",
     "reference/core_concepts/llm_calculator.ipynb",
+    "jupyter_execute",
+    "_build",
 ]
diff --git a/docs/tutorials.md b/docs/tutorials.md
index 090bb00b0..ff2a17f61 100644
--- a/docs/tutorials.md
+++ b/docs/tutorials.md
@@ -16,8 +16,26 @@
 
 # Tutorials
 
+Welcome to the MaxText tutorials! If you are new here, we recommend starting with the **Getting Started** guide below.
+
+## New to MaxText?
+
+If you haven't installed MaxText yet, please check our [Installation Guide](install_maxtext.md) first.
+
+Once installed, follow our **[First Run](tutorials/first_run.md)** tutorial to get your first model training on a TPU. This tutorial will guide you through the process of training a model with MaxText and verifying the results.
+
+---
+
+Below is a list of all available tutorials organized by topic.
+
+### Additional Tutorials
+* [Pre-training](tutorials/pretraining.md) - Learn how to run large-scale pre-training jobs.
+* [Post-Training Index](tutorials/post_training_index.md) - a collection of guides for fine-tuning, RLHF, and other post-training workflows.
+
 ```{toctree}
 :maxdepth: 1
+:hidden:
+:caption: Tutorials
 
 tutorials/first_run.md
 tutorials/pretraining.md
diff --git a/docs/tutorials/demo_decoding.ipynb b/docs/tutorials/demo_decoding.ipynb
new file mode 120000
index 000000000..01d1e9781
--- /dev/null
+++ b/docs/tutorials/demo_decoding.ipynb
@@ -0,0 +1 @@
+../../src/MaxText/examples/demo_decoding.ipynb
\ No newline at end of file
diff --git a/docs/tutorials/first_run.md b/docs/tutorials/first_run.md
index 9e16f034f..262fb6028 100644
--- a/docs/tutorials/first_run.md
+++ b/docs/tutorials/first_run.md
@@ -20,9 +20,41 @@
 This topic provides a basic introduction to get your MaxText workload up and running on single host and multihost environments using Cloud TPUs or NVIDIA GPUs. To help you get familiar with MaxText, we recommend starting with a single host first and then moving to multihost.
 
 ## Prerequisites: Set up storage and configure MaxText
-1. To store logs and checkpoints, [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) in your project. To run MaxText, the TPU or GPU VMs must have read/write permissions for the bucket. These permissions are granted by service account roles, such as the `STORAGE ADMIN` role.
-2. MaxText reads a yaml file for configuration. We also recommend reviewing the configurable options in `configs/base.yml`. This file includes a decoder-only model of ~1B parameters. The configurable options can be overwritten from the command line. For instance, you can change the `steps` or `log_period` by either modifying `configs/base.yml` or by passing in `steps` and `log_period` as additional arguments to the `train.py` call. Set `base_output_directory` to a folder in the bucket you just created.
+### 1. Set up Environment Variables
+To make the commands in this tutorial copy-paste friendly, set your bucket name as an environment variable:
+```bash
+export BUCKET_NAME=your-bucket-name
+# Example: export BUCKET_NAME=maxtext-test-runs
+```
+
+### 2. Create Cloud Storage Bucket
+To store logs and checkpoints, [Create a Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) in your project. To run MaxText, the TPU or GPU VMs must have read/write permissions for the bucket. These permissions are granted by service account roles, such as the `STORAGE ADMIN` role.
+
+```bash
+gcloud storage buckets create gs://${BUCKET_NAME} --location=us-central1
+# Note: Ensure your TPU VM has read/write access to this bucket.
+```
+
+### 3. Configuration
+MaxText reads a yaml file for configuration. The default configuration is in `configs/base.yml`.
+**Note:** This default configuration defines a **Decoder-only** model with **~1B parameters**.
+
+You can review the full [base.yml](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/configs/base.yml) file, but here are the key configurable options:
+
+```yaml
+# configs/base.yml snippet
+model_name: "default"
+weight_dtype: "float32"
+base_emb_dim: 2048
+base_num_query_heads: 16
+base_num_kv_heads: 16
+base_mlp_dim: 7168
+base_num_decoder_layers: 16
+head_dim: 128
+```
+
+The configurable options can be overwritten from the command line. For instance, you can change the `steps` or `log_period` by either modifying `configs/base.yml` or by passing in `steps` and `log_period` as additional arguments to the `train.py` call.
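+
+For example, a minimal override might look like this (illustrative values only; `override_demo` is just a placeholder run name):
+```bash
+# Illustrative only: pass overrides as extra key=value arguments
+python3 -m MaxText.train configs/base.yml \
+  run_name=override_demo \
+  base_output_directory=gs://${BUCKET_NAME} \
+  dataset_type=synthetic \
+  steps=20 \
+  log_period=5
+```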
 
 ## Local development for single host
 This procedure describes how to run MaxText on a single GPU or TPU host.
@@ -33,38 +65,56 @@
 multiple hosts but is a good way to learn about MaxText.
 
 1. [Create and SSH to the single host VM of your choice](https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm). You can use any available single host TPU, such as `v5litepod-8`, `v5p-8`, or `v4-8`.
 2. Clone MaxText onto that TPU VM.
-3. Within the root directory of the cloned repo, install dependencies and pre-commit hook by running:
-```sh
+3. Make sure you are in the `MaxText` root directory (where the repo was cloned):
+```bash
+cd ~/MaxText
+```
+
+4. Within the root directory of the cloned repo, install dependencies and pre-commit hook by running:
+```bash
 python3 -m venv ~/venv-maxtext
 source ~/venv-maxtext/bin/activate
 bash tools/setup/setup.sh
 pre-commit install
 ```
-4. After installation completes, run training on synthetic data with the following command:
-```sh
-python3 -m MaxText.train src/MaxText/configs/base.yml \
-  run_name=$YOUR_JOB_NAME \
-  base_output_directory=gs:// \
+5. After installation completes, run training on synthetic data with the following command:
+```bash
+# Set a unique run name
+export RUN_NAME=run_$(date +%Y%m%d_%H%M%S)
+
+# Run training
+python3 -m MaxText.train configs/base.yml \
+  run_name=$RUN_NAME \
+  base_output_directory=gs://${BUCKET_NAME} \
   dataset_type=synthetic \
   steps=10
 ```
-Optional: If you want to try training on a Hugging Face dataset, see [Data Input Pipeline](../guides/data_input_pipeline.md) for data input options.
+Optional: If you want to try training on a Hugging Face dataset, see [Data Input Pipeline](data-input-pipeline) for data input options.
 
-5. To demonstrate model output, run the following command:
-```sh
-python3 -m MaxText.decode src/MaxText/configs/base.yml \
-  run_name=$YOUR_JOB_NAME \
-  base_output_directory=gs:// \
+6. To demonstrate model output, we can run decoding (inference).
+> **Note:** We use the same `RUN_NAME` and `BUCKET_NAME` to automatically load the checkpoint we just trained.
+
+```bash
+python3 -m MaxText.decode configs/base.yml \
+  run_name=$RUN_NAME \
+  base_output_directory=gs://${BUCKET_NAME} \
   per_device_batch_size=1
 ```
-This command uses a model with randomly initialized weights, so the outputs are also random. To get high quality output you need pass in a checkpoint, typically via the `load_parameters_path` argument.
+This command uses the checkpoint from the training run. If no checkpoint is found (e.g. if you changed the run name), it will initialize with random weights.
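+
+If you instead want to decode from a specific checkpoint (for example, one produced by a different run), you can pass the `load_parameters_path` argument explicitly. The checkpoint path shown here is only a sketch of a typical layout; substitute the actual location your training run wrote to:
+```bash
+# Sketch only: point decoding at an explicitly named checkpoint (adjust the path)
+python3 -m MaxText.decode configs/base.yml \
+  run_name=$RUN_NAME \
+  base_output_directory=gs://${BUCKET_NAME} \
+  load_parameters_path=gs://${BUCKET_NAME}/${RUN_NAME}/checkpoints/0/items \
+  per_device_batch_size=1
+```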
 
 ### Run MaxText via notebook
 In the same TPU VM where you just installed all the dependencies of MaxText, You can also run training and decoding in MaxText via Notebook (for e.g., via Jupyter or Colab).
 
 #### Decoding in MaxText via notebook
-You can use [demo_decoding.ipynb](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/examples/demo_decoding.ipynb) to try out decoding on MaxText's `Llama3.1-8b` model implementation. In this notebook, we give `"I love to"` as the prompt, and the greedily sampled first output token is `" cook"`. Please remember to provide the path to your `Llama3.1-8b` checkpoint for the `load_parameters_path` argument in the config inside the notebook. You can use [to_maxtext.py](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/to_maxtext.py) to create a MaxText/Orbax checkpoint from a Huggingface checkpoint.
+You can use [demo_decoding.ipynb](demo_decoding) to try out decoding on MaxText's `Llama3.1-8b` model implementation. In this notebook, we give `"I love to"` as the prompt, and the greedily sampled first output token is `" cook"`. Please remember to provide the path to your `Llama3.1-8b` checkpoint for the `load_parameters_path` argument in the config inside the notebook. You can use [to_maxtext.py](https://github.com/AI-Hypercomputer/maxtext/blob/main/src/MaxText/utils/ckpt_conversion/to_maxtext.py) to create a MaxText/Orbax checkpoint from a Huggingface checkpoint.
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+demo_decoding.ipynb
+```
 
 ### Run MaxText on NVIDIA GPUs
 1. Use `bash dependencies/scripts/docker_build_dependency_image.sh DEVICE=gpu` to build a container with the required dependencies.
diff --git a/src/MaxText/examples/demo_decoding.ipynb b/src/MaxText/examples/demo_decoding.ipynb
index 9b913318e..965eb7481 100644
--- a/src/MaxText/examples/demo_decoding.ipynb
+++ b/src/MaxText/examples/demo_decoding.ipynb
@@ -435,4 +435,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
\ No newline at end of file