1 change: 1 addition & 0 deletions CHANGES/1047.feature
@@ -0,0 +1 @@
Added exposure of the metadata file to the Simple API (PEP 658)
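
For context: PEP 658 (with the PEP 714 rename) serves each wheel's core metadata at the distribution URL with ".metadata" appended, and advertises its digest on the Simple page via the data-dist-info-metadata / data-core-metadata anchor attribute (or the corresponding key in the PEP 691 JSON response). A minimal, hypothetical client-side sketch follows; the URL and digest are placeholders, not values from this PR:

# pep658_check.py - illustrative sketch only (stdlib), not part of this PR
import hashlib
import urllib.request

# Placeholder values; a Simple page anchor would advertise something like
#   data-core-metadata="sha256=<digest>" on the wheel link.
wheel_url = "https://pulp.example.org/pypi/my-index/simple/files/foo-1.0-py3-none-any.whl"
expected_sha256 = "<digest advertised by the index>"

# PEP 658: the metadata file is served at the wheel URL with ".metadata" appended.
with urllib.request.urlopen(wheel_url + ".metadata") as response:
    metadata = response.read()

print("metadata digest matches:", hashlib.sha256(metadata).hexdigest() == expected_sha256)
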
@@ -24,8 +24,14 @@ def repair_metadata(content):
     set_of_update_fields = set()
     total_repaired = 0
     for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
+        # Get the main artifact
+        main_artifact = (
+            package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+            .first()
+            .artifact
+        )
         new_data = artifact_to_python_content_data(
-            package.filename, package._artifacts.get(), package.pulp_domain
+            package.filename, main_artifact, package.pulp_domain
         )
         changed = False
         for field, value in new_data.items():
211 changes: 211 additions & 0 deletions pulp_python/app/migrations/0019_create_missing_metadata_artifacts.py
@@ -0,0 +1,211 @@
# Generated manually on 2025-12-15 14:00 for creating missing metadata artifacts

from django.db import migrations

BATCH_SIZE = 1000


def pulp_hashlib_new(name, *args, **kwargs):
"""
Copied and updated (to comply with migrations) from pulpcore.
"""
import hashlib as the_real_hashlib
from django.conf import settings

if name not in settings.ALLOWED_CONTENT_CHECKSUMS:
return None

return the_real_hashlib.new(name, *args, **kwargs)


def init_and_validate(file, artifact_model, expected_digests):
"""
Copied and updated (to comply with migrations) from pulpcore.
"""
from django.conf import settings

digest_fields = []
for alg in ("sha512", "sha384", "sha256", "sha224", "sha1", "md5"):
if alg in settings.ALLOWED_CONTENT_CHECKSUMS:
digest_fields.append(alg)

if isinstance(file, str):
with open(file, "rb") as f:
hashers = {
n: hasher for n in digest_fields if (hasher := pulp_hashlib_new(n)) is not None
}
if not hashers:
return None

size = 0
while True:
chunk = f.read(1048576) # 1 megabyte
if not chunk:
break
for algorithm in hashers.values():
algorithm.update(chunk)
size = size + len(chunk)
else:
size = file.size
hashers = file.hashers

mismatched_sha256 = None
for algorithm, expected_digest in expected_digests.items():
if algorithm not in hashers:
return None
actual_digest = hashers[algorithm].hexdigest()
if expected_digest != actual_digest:
# Store the actual value for later fixing if it differs from the package value
mismatched_sha256 = actual_digest

attributes = {"size": size, "file": file}
for algorithm in digest_fields:
attributes[algorithm] = hashers[algorithm].hexdigest()

return artifact_model(**attributes), mismatched_sha256


def extract_wheel_metadata(filename):
"""
Extract the metadata file content from a wheel file.
Return the raw metadata content as bytes or None if metadata cannot be extracted.
"""
import zipfile

try:
with zipfile.ZipFile(filename, "r") as f:
for file_path in f.namelist():
if file_path.endswith(".dist-info/METADATA"):
return f.read(file_path)
except (zipfile.BadZipFile, KeyError, OSError):
pass
return None


def artifact_to_metadata_artifact(filename, artifact, md_digests, tmp_dir, artifact_model):
"""
Create artifact for metadata from the provided wheel artifact.
Return (artifact, mismatched_sha256) on success, "extraction_failed" when metadata extraction
fails, or None on init_and_validate failure.
"""
import shutil
import tempfile

with tempfile.NamedTemporaryFile("wb", dir=tmp_dir, suffix=filename, delete=False) as temp_file:
temp_wheel_path = temp_file.name
artifact.file.seek(0)
shutil.copyfileobj(artifact.file, temp_file)
temp_file.flush()

metadata_content = extract_wheel_metadata(temp_wheel_path)
if not metadata_content:
return "extraction_failed"

with tempfile.NamedTemporaryFile(
"wb", dir=tmp_dir, suffix=".metadata", delete=False
) as temp_md:
temp_metadata_path = temp_md.name
temp_md.write(metadata_content)
temp_md.flush()

return init_and_validate(temp_metadata_path, artifact_model, md_digests)


def create_missing_metadata_artifacts(apps, schema_editor):
"""
Create metadata artifacts for PythonPackageContent instances that have metadata_sha256
but are missing the corresponding metadata artifact.
"""
import tempfile
from django.conf import settings
from django.db import models

PythonPackageContent = apps.get_model("python", "PythonPackageContent")
ContentArtifact = apps.get_model("core", "ContentArtifact")
Artifact = apps.get_model("core", "Artifact")

packages = (
PythonPackageContent.objects.filter(
metadata_sha256__isnull=False,
filename__endswith=".whl",
contentartifact__artifact__isnull=False,
contentartifact__relative_path=models.F("filename"),
)
.exclude(metadata_sha256="")
.prefetch_related("_artifacts")
.only("filename", "metadata_sha256")
)
skipped_pkgs = 0
artifact_batch = []
contentartifact_batch = []
packages_batch = []

with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
for package in packages:
# Get the main artifact for package
main_artifact = package._artifacts.get()

filename = package.filename
metadata_digests = {"sha256": package.metadata_sha256}
result = artifact_to_metadata_artifact(
filename, main_artifact, metadata_digests, temp_dir, Artifact
)
if result == "extraction_failed":
# Unset metadata_sha256 when metadata extraction fails
package.metadata_sha256 = None
packages_batch.append(package)
skipped_pkgs += 1
continue
if result is None:
# Failed to build metadata artifact (init_and_validate failed)
skipped_pkgs += 1
continue
Review comment on lines +153 to +162 (Contributor Author): If we also set package.metadata_sha256 = None when init_and_validate fails, all skipped packages will have the field nullified (None). The migration would then fix everything, because every PythonPackageContent (PPC) instance that still has metadata_sha256 set would also have the artifact. (And we could maybe drop the skipped_pkgs variable?)
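A minimal sketch of the change that comment suggests, shown against the loop above (illustrative only, not part of the diff):

            if result is None:
                # Reviewer's suggestion: also nullify the field on init_and_validate
                # failure, so every remaining metadata_sha256 is backed by an artifact.
                package.metadata_sha256 = None
                packages_batch.append(package)
                continue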

            metadata_artifact, mismatched_sha256 = result
            if mismatched_sha256:
                # Fix the package if its metadata_sha256 differs from the actual value
                package.metadata_sha256 = mismatched_sha256
                packages_batch.append(package)

            contentartifact = ContentArtifact(
                artifact=metadata_artifact,
                content=package,
                relative_path=f"{filename}.metadata",
            )
            artifact_batch.append(metadata_artifact)
            contentartifact_batch.append(contentartifact)

            if len(artifact_batch) == BATCH_SIZE:
                Artifact.objects.bulk_create(artifact_batch, batch_size=BATCH_SIZE)
                ContentArtifact.objects.bulk_create(contentartifact_batch, batch_size=BATCH_SIZE)
                artifact_batch.clear()
                contentartifact_batch.clear()
            if len(packages_batch) == BATCH_SIZE:
                PythonPackageContent.objects.bulk_update(
                    packages_batch, ["metadata_sha256"], batch_size=BATCH_SIZE
                )
                packages_batch.clear()

    if artifact_batch:
        Artifact.objects.bulk_create(artifact_batch, batch_size=BATCH_SIZE)
        ContentArtifact.objects.bulk_create(contentartifact_batch, batch_size=BATCH_SIZE)
    if packages_batch:
        PythonPackageContent.objects.bulk_update(
            packages_batch, ["metadata_sha256"], batch_size=BATCH_SIZE
        )

    if skipped_pkgs > 0:
        print(f"Skipped creation of missing metadata artifacts for {skipped_pkgs} packages")


class Migration(migrations.Migration):

    dependencies = [
        ("python", "0018_packageprovenance"),
    ]

    operations = [
        migrations.RunPython(
            create_missing_metadata_artifacts,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
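
For reviewers who want to sanity-check the extraction logic in this migration locally, here is a small standalone sketch (stdlib only; the wheel path and expected digest are placeholders) that mirrors what extract_wheel_metadata and init_and_validate do: pull .dist-info/METADATA out of a wheel and compare its sha256 with a package's recorded metadata_sha256.

# verify_wheel_metadata.py - illustrative only, not part of this PR
import hashlib
import zipfile

wheel_path = "foo-1.0-py3-none-any.whl"  # placeholder
expected_sha256 = "<metadata_sha256 recorded on the package>"  # placeholder

with zipfile.ZipFile(wheel_path) as zf:
    # Same selection rule as extract_wheel_metadata above.
    metadata = next(
        zf.read(name) for name in zf.namelist() if name.endswith(".dist-info/METADATA")
    )

print("sha256 matches:", hashlib.sha256(metadata).hexdigest() == expected_sha256)
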
2 changes: 0 additions & 2 deletions pulp_python/app/pypi/views.py
@@ -352,8 +352,6 @@ def parse_package(release_package):
     @extend_schema(operation_id="pypi_simple_package_read", summary="Get package simple page")
     def retrieve(self, request, path, package):
         """Retrieves the simple api html/json page for a package."""
-        media_type = request.accepted_renderer.media_type
-
         repo_ver, content = self.get_rvc()
         # Should I redirect if the normalized name is different?
         normalized = canonicalize_name(package)
45 changes: 45 additions & 0 deletions pulp_python/app/serializers.py
@@ -1,5 +1,6 @@
import logging
import os
import tempfile
from gettext import gettext as _
from django.conf import settings
from django.db.utils import IntegrityError
@@ -22,6 +23,7 @@
)
from pulp_python.app.utils import (
    DIST_EXTENSIONS,
    artifact_to_metadata_artifact,
    artifact_to_python_content_data,
    get_project_metadata_from_file,
    parse_project_metadata,
@@ -93,11 +95,31 @@ class Meta:
        model = python_models.PythonDistribution


class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
    """
    Custom field with overridden get_attribute method. Meant to be used only in
    PythonPackageContentSerializer to handle possible existence of metadata artifact.
    """

    def get_attribute(self, instance):
        # When content has multiple artifacts (wheel + metadata), return the main one
        if instance._artifacts.count() > 1:
            for ca in instance.contentartifact_set.all():
                if not ca.relative_path.endswith(".metadata"):
                    return ca.artifact

        return super().get_attribute(instance)


class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
    """
    A Serializer for PythonPackageContent.
    """

    artifact = PythonSingleContentArtifactField(
        help_text=_("Artifact file representing the physical content"),
    )

    # Core metadata
    # Version 1.0
    author = serializers.CharField(
@@ -386,8 +408,21 @@ def deferred_validate(self, data):
        if attestations := data.pop("attestations", None):
            data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)

        # Create metadata artifact for wheel files
        if filename.endswith(".whl"):
            if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
                data["metadata_artifact"] = metadata_artifact
                data["metadata_sha256"] = metadata_artifact.sha256

        return data

    def get_artifacts(self, validated_data):
        artifacts = super().get_artifacts(validated_data)
        if metadata_artifact := validated_data.pop("metadata_artifact", None):
            relative_path = f"{validated_data['filename']}.metadata"
            artifacts[relative_path] = metadata_artifact
        return artifacts

    def retrieve(self, validated_data):
        content = python_models.PythonPackageContent.objects.filter(
            sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +454,7 @@ def create(self, validated_data):

    class Meta:
        fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
            "artifact",
            "author",
            "author_email",
            "description",
@@ -514,6 +550,15 @@ def validate(self, data):
data["provenance"] = self.handle_attestations(
filename, data["sha256"], attestations, offline=True
)
# Create metadata artifact for wheel files
if filename.endswith(".whl"):
with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
if metadata_artifact := artifact_to_metadata_artifact(
filename, artifact, tmp_dir=temp_dir
):
data["metadata_artifact"] = metadata_artifact
data["metadata_sha256"] = metadata_artifact.sha256

return data

class Meta(PythonPackageContentSerializer.Meta):
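
As a reading aid for the serializer changes above, a hypothetical check of the intended outcome (the filename and the assertions are illustrative, not part of the PR): after a wheel is uploaded, the content unit should carry both the wheel artifact and a sibling "<filename>.metadata" artifact, plus a populated metadata_sha256.

# Illustrative only; model and relation names are taken from the diff above.
package = python_models.PythonPackageContent.objects.get(
    filename="foo-1.0-py3-none-any.whl"  # placeholder filename
)
paths = set(package.contentartifact_set.values_list("relative_path", flat=True))
assert paths == {"foo-1.0-py3-none-any.whl", "foo-1.0-py3-none-any.whl.metadata"}
assert package.metadata_sha256 is not None
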
14 changes: 11 additions & 3 deletions pulp_python/app/tasks/repair.py
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
     progress_report.save()
     with progress_report:
         for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
-            new_data = artifact_to_python_content_data(
-                package.filename, package._artifacts.get(), domain
+            # Get the main artifact
+            main_artifact = (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .artifact
             )
+            new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
             total_repaired += update_package_if_needed(
                 package, new_data, batch, set_of_update_fields
             )
@@ -113,7 +117,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
         grouped_by_url = defaultdict(list)

         for package in group_set:
-            for ra in package.contentartifact_set.get().remoteartifact_set.all():
+            for ra in (
+                package.contentartifact_set.exclude(relative_path__endswith=".metadata")
+                .first()
+                .remoteartifact_set.all()
+            ):
                 grouped_by_url[ra.remote.url].append((package, ra))

         # Prioritize the URL that can serve the most packages
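
The rest of this hunk is collapsed in the view; presumably the code that follows implements the prioritization named in the comment. A one-line illustration of the idea, using the grouped_by_url mapping built above (not the actual code):

best_url = max(grouped_by_url, key=lambda url: len(grouped_by_url[url]))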