Source code for multistorageclient.generators.manifest_metadata

# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
15
16import json
17from typing import List
18
19from multistorageclient.types import ObjectMetadata
20
21from .. import StorageClient
22from ..providers.manifest_metadata import ManifestMetadataProvider, DEFAULT_MANIFEST_BASE_DIR
23
24
class ManifestMetadataGenerator:
    """
    Generates a file metadata manifest for use with a :py:class:`multistorageclient.providers.ManifestMetadataProvider`.
    """

    @staticmethod
    def _generate_manifest_part_body(object_metadata: List["ObjectMetadata"]) -> bytes:
        """
        Serializes object metadata into a JSON Lines manifest part body.

        Each entry is converted with ``to_dict()`` and its ``content_length`` key is renamed to
        ``size_bytes`` before the dictionary is dumped as one JSON document per line.

        :param object_metadata: Metadata entries to serialize.
        :return: UTF-8 encoded body with one JSON object per line (empty bytes for no entries).
        :raises KeyError: If an entry's dictionary has no ``content_length`` key.
        """
        lines = []
        for metadata in object_metadata:
            metadata_dict = metadata.to_dict()
            # Rename in place. Building ``{**d, "size_bytes": d.pop("content_length")}`` unpacks the
            # dictionary *before* the pop runs, so the old ``content_length`` key would leak into the output.
            metadata_dict["size_bytes"] = metadata_dict.pop("content_length")
            lines.append(json.dumps(metadata_dict))
        return "\n".join(lines).encode(encoding="utf-8")

    @staticmethod
    def generate_and_write_manifest(
        data_storage_client: "StorageClient",
        manifest_storage_client: "StorageClient",
    ) -> None:
        """
        Generates a file metadata manifest.

        The data storage client's base path should be set to the root path for data objects (e.g. ``my-bucket/my-data-prefix``).

        The manifest storage client's base path should be set to the root path for manifest objects (e.g. ``my-bucket/my-manifest-prefix``).

        The following manifest objects will be written with the manifest storage client (with the total number of manifest parts being variable)::

            .msc_manifests/
            ├── msc_manifest_index.json
            └── parts/
                ├── msc_manifest_part000001.jsonl
                ├── ...
                └── msc_manifest_part999999.jsonl

        :param data_storage_client: Storage client for reading data objects.
        :param manifest_storage_client: Storage client for writing manifest objects.
        """
        # Get respective StorageProviders. A StorageClient will always have a StorageProvider
        # TODO: Cleanup by exposing APIs from the client
        data_storage_provider = data_storage_client._storage_provider
        manifest_storage_provider = manifest_storage_client._storage_provider

        # Create a ManifestMetadataProvider for writing manifest, configure manifest storage provider
        # TODO(NGCDP-3018): Opportunity to split up the responsibilities of MetadataProvider
        manifest_metadata_provider = ManifestMetadataProvider(
            storage_provider=manifest_storage_provider, manifest_path="", writable=True
        )

        # For manifest generation we will always assume direct path for listing objects
        for object_metadata in data_storage_provider.list_objects(prefix=""):
            # Do not track manifest files: skip any key that has the manifest base directory
            # as one of its path segments.
            if DEFAULT_MANIFEST_BASE_DIR not in object_metadata.key.split("/"):
                manifest_metadata_provider.add_file(path=object_metadata.key, metadata=object_metadata)

        # Flush the accumulated entries through the provider (presumably this is what writes the
        # manifest objects — confirm against ManifestMetadataProvider.commit_updates).
        manifest_metadata_provider.commit_updates()