diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index a0a6ab363d9..45009d9dad9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -692,6 +692,8 @@ peps/pep-0814.rst @vstinner @corona10 peps/pep-0815.rst @emmatyping peps/pep-0816.rst @brettcannon # ... +peps/pep-0819.rst @emmatyping +# ... peps/pep-2026.rst @hugovk # ... peps/pep-3000.rst @gvanrossum diff --git a/peps/pep-0819.rst b/peps/pep-0819.rst new file mode 100644 index 00000000000..8599dd1c0f0 --- /dev/null +++ b/peps/pep-0819.rst @@ -0,0 +1,347 @@ +PEP: 819 +Title: JSON Package Metadata +Author: Emma Harper Smith +PEP-Delegate: Paul Moore +Discussions-To: Pending +Status: Draft +Type: Standards Track +Topic: Packaging +Created: 18-Dec-2025 +Post-History: Pending + + +Abstract +======== + +Python package metadata ("core metadata") was first defined in :pep:`241` to +use :rfc:`822` email headers to encode information about packages. This was +reasonable at the time; email messages were the only widely used, standardized +text format that had a parser in the standard library. However, +issues with handling different encodings, differing handling of line breaks, +and other differences between implementations have caused numerous packaging +bugs. To resolve these issues, this PEP proposes introducing +`JavaScript Object Notation (JSON) <https://www.json.org/>`_ +encoded core metadata and wheel file format metadata files in Python packages. + + +Motivation +========== + +The email message format has a number of complexities and limitations which +reduce its utility as a portable textual interchange format for packaging +metadata. Due to the :mod:`email` parser requiring configuration changes to +properly generate valid core metadata, many projects do not use the +:mod:`!email` module and instead generate core metadata in a custom manner. +There are many pitfalls with generating email headers that these custom +generators can hit. First, core metadata fields may contain newlines in the +value of fields.
These newlines must be handled properly to "unfold" multiple +lines per :rfc:`822`. One particularly difficult-to-encode field is the +``Description`` field, which may contain newlines and indentation. To encode +the field in email headers, CRLF line breaks must be followed by seven (7) +spaces and a pipe ("|") character. While ``Description`` may now be encoded in +the message body, similar escaping issues occur for the ``Author`` and +``Maintainer`` fields. Improperly escaped newlines can lead to missing, +partial, or invalid core metadata. Second, as discussed in the +`core metadata specifications <https://packaging.python.org/en/latest/specifications/core-metadata/>`__: + +.. epigraph:: + The standard file format for metadata (including in wheels and installed + projects) is based on the format of email headers. However, email formats + have been revised several times, and exactly which email RFC applies to + packaging metadata is not specified. In the absence of a precise + definition, the practical standard is set by what the standard library + :mod:`email.parser` module can parse using the + :data:`email.policy.compat32` policy. + +Since no specific email RFC is selected, the current core metadata +specification is ambiguous about whether a given core metadata document is valid. +:rfc:`822` is the only email standard to be explicitly listed in a PEP. +However, the core metadata specification also requires that core metadata is +encoded using UTF-8 when written to a file. This de facto makes the core +metadata follow :rfc:`6532`, which specifies internationalization of email +headers. This has practical interoperability concerns. Until a few years ago, +it was unspecified how to handle non-ASCII encoded content in core metadata, +causing confusion about how to properly encode non-ASCII emails in core +metadata. Third, the current format is difficult to properly validate and +parse. Many tools do not check for issues with the output of the :mod:`!email` +parser.
If a document is malformed, it may still parse without error by the +:mod:`!email` module as a valid email message. Furthermore, due to limitations +in the email format, fields like ``Project-Url`` must create custom encodings +of nested key-value items, further complicating parsing. Finally, the lack of +a schema makes it difficult to validate the contents of email message encoded +metadata. While introducing a specification for the current format has been +`discussed previously `_, +no progress had been made, and converting to JSON was a suggested resolution +to the issues raised. + +The ``WHEEL`` file format is currently encoded in a custom key-value format. +While this format is easy to parse and write, it requires manual parsing and +validation to ensure that the contents are valid. Moving to a JSON encoded +format will allow for easier parsing and validation of the contents, and +simplify packaging tools and services. + + +Rationale +========= + +Introducing a new core metadata file with a well-specified format will greatly +ease generating, parsing, and validating metadata. JSON is a natural choice for +storing package core metadata. It is easily machine readable and writable, is +understandable to humans, and is well supported across many languages. +Furthermore, :pep:`566` already specifies a canonicalization of email formatted +core metadata to JSON. JSON is also a frequently used format for data +interchange on the web. For discussion of other formats considered, please +refer to the rejected ideas section. + +To maintain backwards compatibility, the JSON metadata file MUST be generated +alongside the existing email formatted metadata file. This ensures that tools +that do not support the new format can still read package metadata for new +packages. + +The JSON formatted metadata file must be semantically equivalent to the email +encoded file. 
This ensures that the metadata is unambiguous between the two +formats, and tools may read either when both are present. To maintain +performance, this equivalence is not required to be verified by installers, +though other tools may do so. Some tools may choose to make the check dependent +on a configuration flag. + +Package indexes SHOULD check that the metadata files are semantically +equivalent when the package is added to the index. This is a low-cost, one-time +check that ensures users of the index are served valid packages. + + +Specification +============= + +JSON Format Core Metadata File +------------------------------ + +A new optional but recommended file ``METADATA.json`` shall be introduced as a +metadata file for Python packages. If generated, the ``METADATA.json`` file +MUST be placed in the same directory as the current email formatted +``METADATA`` or ``PKG-INFO`` file. + +For wheels, this means that ``METADATA.json`` MUST be located in the +``.dist-info`` directory. The wheel format minor version will be incremented to +indicate the change in the format. + +For source distribution packages, the ``METADATA.json`` file MUST be located +in the root directory of the project sources. Tools that prefer the JSON +formatted metadata file MUST check for the existence of a ``METADATA.json`` +in the source distribution before reading the file. + +The semantic contents of the ``METADATA`` and ``METADATA.json`` files MUST be +equivalent if ``METADATA.json`` is present. Installers MAY verify this +information. Public package indexes SHOULD verify the files are semantically +equivalent. 
+ +Conversion of ``METADATA`` to JSON Encoding +------------------------------------------- + +Conversion from the current email format for core metadata to JSON should +follow the process described in :pep:`566`, with the following modification: +the ``Project-URL`` entries should be converted into an object with keys +containing the labels and values containing the URLs from the original email +value. The overall process thus becomes: + +#. The original key-value format should be read with + ``email.parser.HeaderParser``. +#. All transformed keys should be reduced to lower case. Hyphens should be + replaced with underscores, but otherwise should retain all other characters. +#. The transformed value for any field marked with "(Multiple use)" should be a + single list containing all the original values for the given key. +#. The ``Keywords`` field should be converted to a list by splitting the + original value on commas. +#. The ``Project-URL`` field should be converted into a JSON object with keys + containing the labels and values containing the URLs from the original email + value. +#. The message body, if present, should be stored as the value of the + ``description`` key. +#. The result should be stored as a string-keyed dictionary. + +One edge case in the above conversion is that the ``Project-URL`` label is +"free text, with a maximum length of 32 characters." This presents a problem +when trying to decode the label. Therefore this PEP sets the requirement that +the ``Project-URL`` label be any text *except* the comma (``,``) character. +This allows for unambiguous parsing of the ``Project-URL`` entries by splitting +the text on the left-most comma (``,``) character. + +JSON Schema for Core Metadata +----------------------------- + +To enable verification of JSON encoded core metadata, a +`JSON schema <https://json-schema.org/>`_ for core metadata has been produced. +This schema will be updated with each revision to the core metadata +specification.
The schema is available in +:ref:`0819-core-metadata-json-schema`. + +Serving METADATA.json in the Simple Repository API +-------------------------------------------------- + +:pep:`658` introduced a means of serving package metadata in the Simple +Repository API. The JSON encoded version of the package metadata may also be +served, via the following modifications to the Simple Repository API: + +A new attribute ``data-dist-info-metadata-json`` may be added to anchor tags +in the Simple API. This attribute should have a value containing the hash +information for the ``METADATA.json`` file in the same format as +``data-dist-info-metadata``. If ``data-dist-info-metadata-json`` is present, +the repository MUST serve the JSON encoded metadata file at the +distribution's path with ``.metadata.json`` appended to it. For example, if a +distribution is served at ``/simple/foo-1.0-py3-none-any.whl``, the JSON +encoded core metadata file MUST be served at +``/simple/foo-1.0-py3-none-any.whl.metadata.json``. + +JSON Format Wheel Metadata File +------------------------------- + +A new optional but recommended file ``WHEEL.json`` shall be introduced as a +JSON encoded version of the ``WHEEL`` file. If generated, the ``WHEEL.json`` +file MUST be placed in the same directory as the current key-value formatted +``WHEEL`` file, i.e. the ``.dist-info`` directory. The semantic contents of +the ``WHEEL`` and ``WHEEL.json`` files MUST be equivalent. + +The ``WHEEL.json`` file SHOULD be preferred over the ``WHEEL`` file when both +are present. + +Conversion of ``WHEEL`` to JSON Encoding +---------------------------------------- + +Conversion from the current key-value format for wheel file format metadata to +JSON should proceed as follows: + +#. The original key-value format should be read. +#. All transformed keys should be reduced to lower case. Hyphens should be + replaced with underscores, but otherwise should retain all other characters. +#. 
The ``Tag`` field's entries should be converted to a list containing the + original values. +#. The result should be stored as a string-keyed dictionary. + +This follows a similar process to the conversion of ``METADATA`` to JSON +encoding. + +JSON Schema for Wheel Metadata +------------------------------ + +To enable verification of JSON encoded wheel file format metadata, a +JSON schema for wheel metadata has been produced. +This schema will be updated with each revision to the wheel metadata +specification. The schema is available in :ref:`0819-wheel-json-schema`. + +Deprecation of the ``METADATA``, ``PKG-INFO``, and ``WHEEL`` Files +------------------------------------------------------------------ + +The ``METADATA``, ``PKG-INFO``, and ``WHEEL`` files are now deprecated. This +means that a future PEP may make the ``METADATA``, ``PKG-INFO``, and ``WHEEL`` +files optional and require ``METADATA.json`` and ``WHEEL.json`` to be present. +Please see the next section for more information on backwards compatibility +caveats to that change. + +Despite the ``METADATA`` and ``PKG-INFO`` files being deprecated, new core +metadata revisions should be implemented for both JSON and email to ensure that +they may remain semantically equivalent. Similarly, new ``WHEEL`` metadata keys +should be implemented for both JSON and key-value formats to ensure that they +may remain semantically equivalent. + + +Backwards Compatibility +======================= + +The specification for ``METADATA.json`` and ``WHEEL.json`` is designed such +that the new format is completely backwards compatible. Existing tools may read +metadata from the existing email formatted files, and new tools may take +advantage of the new format. + +A future major revision of the wheel specification may make the ``METADATA``, +``PKG-INFO``, and ``WHEEL`` files optional and make the ``METADATA.json`` and +``WHEEL.json`` files required. 
+ +Note that tools will need to maintain parsing of email metadata and the +key-value formatted ``WHEEL`` file indefinitely to support parsing metadata +for old packages which only have the ``METADATA``, ``PKG-INFO``, +or ``WHEEL`` files. + + +Security Implications +===================== + +One attack vector with JSON encoded core metadata is a JSON payload +designed to consume excessive memory or CPU resources in a denial of service +(DoS) attack. While this attack is not likely to affect users who can cancel +resource-intensive interactive operations, it may be an issue for package +indexes. + +There are several mitigations that can be applied to prevent this: + +#. The length of the JSON payload can be restricted to a reasonable size. +#. The reader may use a :class:`~json.JSONDecoder` to omit parsing :class:`int` + and :class:`float` values to avoid quadratic number parsing time complexity + attacks. +#. I plan to contribute a change to :class:`~json.JSONDecoder` in Python + 3.15+ that will allow it to be configured to restrict the nesting of JSON + payloads to a reasonable depth. Core metadata currently has a maximum depth + of 2 to encode mapping and list fields. + +With these mitigations in place, concerns about denial of service attacks with +JSON encoded core metadata are minimal. + + +Reference Implementation +======================== + +A reference implementation of the JSON schema for JSON core metadata is +available in :ref:`0819-core-metadata-json-schema`. + +Furthermore, a reference implementation in the ``packaging`` library `is +available +`__. + +A reference implementation generating both ``METADATA.json`` and ``WHEEL.json`` +in the ``uv`` build backend `is also available `__. + + +Rejected Ideas +============== + +Using Another File Format (TOML, YAML, etc.) +-------------------------------------------- + +While TOML or another format could be used for the new core metadata file +format, JSON has been chosen for a few reasons: + +#.
Core metadata is mostly meant as a machine interchange format to be used by + tools and services which wish to interoperate. Therefore the + human-readability of TOML is not an important consideration in this + selection. +#. JSON parsers are implemented in many languages' standard libraries and the + :mod:`json` module has been part of Python's standard library for a very + long time. +#. JSON is fast to parse and emit. +#. JSON schemas are JSON native and commonly used. + + +Open Issues +=========== + +Where should the JSON schema be served? +--------------------------------------- + +Where should the standard JSON Schema be served? Some options would be +packaging.python.org, pypi.org, python.org, or pypa.org. + +My first choice would be packaging.python.org, but I am open to other options. + + +Acknowledgements +================ + +Thanks to Konstantin Schütze for implementing the reference implementation of +this PEP in the ``uv`` build backend and for providing valuable feedback on the +specification. + + +Copyright +========= + +This document is placed in the public domain or under the +CC0-1.0-Universal license, whichever is more permissive. diff --git a/peps/pep-0819/appendix-core-metadata-json-schema.rst b/peps/pep-0819/appendix-core-metadata-json-schema.rst new file mode 100644 index 00000000000..2d7788f2acf --- /dev/null +++ b/peps/pep-0819/appendix-core-metadata-json-schema.rst @@ -0,0 +1,21 @@ +:orphan: + +.. _0819-core-metadata-json-schema: + +Appendix: JSON Schema for Core Metadata +======================================= + +.. literalinclude:: core-metadata.schema.json + :language: json + :linenos: + :name: core-metadata-schema + +.. _0819-wheel-json-schema: + +Appendix: JSON Schema for Wheel Metadata +======================================== + +.. 
literalinclude:: wheel.schema.json + :language: json + :linenos: + :name: wheel-schema diff --git a/peps/pep-0819/core-metadata.schema.json b/peps/pep-0819/core-metadata.schema.json new file mode 100644 index 00000000000..303314d15db --- /dev/null +++ b/peps/pep-0819/core-metadata.schema.json @@ -0,0 +1,240 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://peps.python.org/pep-0819/core-metadata.schema.json", + "title": "Python Packaging Core Metadata", + "description": "Core metadata for Python packages", + "type": "object", + "properties": { + "metadata_version": { + "type": "string", + "pattern": "^(\\d+(\\.\\d+)*)$", + "description": "The version of the file format." + }, + "name": { + "type": "string", + "pattern": "^([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9._-]*[A-Za-z0-9])$", + "description": "The name of the distribution." + }, + "version": { + "type": "string", + "pattern": "^v?([0-9]+!)?[0-9]+(\\.[0-9]+)*([-_\\.]?(alpha|a|beta|b|preview|pre|c|rc)[-_\\.]?[0-9]+)?((-[0-9]+)|([-_\\.]?(post|rev|r)[-_\\.]?[0-9]+))?([-_\\.]?dev[-_\\.]?[0-9]+)?$", + "description": "The distribution's version number." + }, + "dynamic": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "platform", + "supported_platform", + "summary", + "description", + "description_content_type", + "keywords", + "author", + "author_email", + "maintainer", + "maintainer_email", + "license", + "license_expression", + "license_file", + "classifier", + "requires_dist", + "requires_python", + "requires_external", + "project_url", + "provides_extra", + "import_name", + "import_namespace", + "provides_dist", + "obsoletes_dist", + "home_page", + "download_url", + "requires", + "provides", + "obsoletes" + ] + }, + "description": "A list of core metadata fields that are dynamically calculated." + }, + "platform": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The platforms supported by the distribution."
+ }, + "supported_platform": { + "type": "array", + "items": { + "type": "string" + }, + "description": "The platforms for which a binary distribution was compiled." + }, + "summary": { + "type": "string", + "description": "A one-line summary of the distribution." + }, + "description": { + "type": "string", + "description": "A longer description of the distribution that can run to several paragraphs." + }, + "description_content_type": { + "type": "string", + "description": "The content type of the description. In the same format as the HTTP Content-Type header field." + }, + "keywords": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Keywords describing the distribution." + }, + "author": { + "type": "string", + "description": "The name of the author of the distribution. Additional contact information may be provided." + }, + "author_email": { + "type": "string", + "description": "The email address of the author. It can contain a name and email address in the legal forms for a RFC 822 ``From:`` header." + }, + "maintainer": { + "type": "string", + "description": "The name of the maintainer. Additional contact information may be provided." + }, + "maintainer_email": { + "type": "string", + "description": "The email address of the maintainer. It can contain a name and email address in the legal forms for a RFC 822 ``From:`` header." + }, + "license": { + "type": "string", + "description": "Text indicating the license covering the distribution where the license is not a selection from the “License” Trove classifiers.", + "deprecated": true + }, + "license_expression": { + "type": "string", + "description": "A valid SPDX license expression indicating the license covering the distribution." + }, + "license_file": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Paths to license files relative to the project root directory."
+ }, + "classifier": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of Trove classifiers that describe the nature of the distribution." + }, + "requires_dist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of projects required by the distribution." + }, + "requires_python": { + "type": "string", + "description": "The Python version for which the distribution is intended." + }, + "requires_external": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of external dependencies required by the distribution." + }, + "project_url": { + "type": "array", + "items": { + "type": "object", + "properties": { + "label": { + "type": "string", + "description": "The label for the project URL.", + "pattern": "^.{1,32}$" + }, + "url": { + "type": "string", + "description": "The URL for the project URL." + } + }, + "additionalProperties": false + }, + "description": "A mapping of arbitrary text labels to additional URLs relevant to the project." + }, + "provides_extra": { + "type": "array", + "items": { + "type": "string", + "pattern": "^[a-z0-9]+(-[a-z0-9]+)*$" + }, + "description": "A list of optional features provided by the distribution." + }, + "import_name": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of exclusive import names provided by the distribution." + }, + "import_namespace": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of exclusive import namespaces provided by the distribution." + }, + "provides_dist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of project names provided by the distribution." + }, + "obsoletes_dist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of project names that are obsoleted by the distribution." 
+ }, + "home_page": { + "type": "string", + "description": "The home page of the project.", + "deprecated": true + }, + "download_url": { + "type": "string", + "description": "The URL for the distribution's download page.", + "deprecated": true + }, + "requires": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of projects required by the distribution.", + "deprecated": true + }, + "provides": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of projects provided by the distribution.", + "deprecated": true + }, + "obsoletes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "A list of projects that are obsoleted by the distribution.", + "deprecated": true + } + } +} diff --git a/peps/pep-0819/wheel.schema.json b/peps/pep-0819/wheel.schema.json new file mode 100644 index 00000000000..ad557717721 --- /dev/null +++ b/peps/pep-0819/wheel.schema.json @@ -0,0 +1,39 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://peps.python.org/pep-0819/wheel.schema.json", + "title": "Wheel Metadata", + "description": "Metadata for the wheel file format.", + "type": "object", + "properties": { + "wheel_version": { + "type": "string", + "pattern": "^(\\d+(\\.\\d+)*)$", + "description": "The version of the wheel file format." + }, + "generator": { + "type": "string", + "description": "The name and version of the tool that generated the wheel." + }, + "root_is_purelib": { + "type": "boolean", + "description": "Whether the root of the archive should be installed into purelib." + }, + "tag": { + "type": "array", + "items": { + "type": "string", + "description": "The wheel's expanded compatibility tags." + } + }, + "build": { + "type": "string", + "description": "The build tag of the wheel." + } + }, + "required": [ + "wheel_version", + "generator", + "root_is_purelib", + "tag" + ] +}