Skip to content

Filecopyutil

extend_files(first, second, buffer_size_b=10 * 1024 * 1024, delete_file2_on_complete=False, offset=0)

Concatenates the contents of second into first using memory-efficient operations.

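Reads second in chunks of at most buffer_size_b bytes and writes each chunk to first. The second file is opened read-only and is not shrunk or otherwise modified during the copy; it is removed only if delete_file2_on_complete is set. This is not multi-processing safe.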

Parameters: - first (str): Path to the first file (will be extended). - second (str): Path to the second file (data will be read from here). - buffer_size_b (int): Size of the buffer to use for reading/writing data. - delete_file2_on_complete (bool): Whether to delete the second file after operation. - offset (int): Byte position in the second file at which reading starts; bytes before it are not copied.

Source code in bionemo/scdl/util/filecopyutil.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def extend_files(
    first: str,
    second: str,
    buffer_size_b: int = 10 * 1024 * 1024,
    delete_file2_on_complete: bool = False,
    offset: int = 0,
) -> None:
    """Appends the contents of `second`, starting at byte `offset`, to the end of `first`.

    Data is streamed in chunks of at most `buffer_size_b` bytes, so at most one chunk is
    held in memory at a time. `second` is opened read-only and is not modified while
    copying; it is removed afterwards only when `delete_file2_on_complete` is true.
    This is not multi-processing safe.

    If `second` has no bytes at or after `offset` (it is empty, or `offset` is at or past
    its end), `first` is left untouched.

    Parameters:
    - first (str): Path to the first file (will be extended). It must already exist.
    - second (str): Path to the second file (data will be read from here).
    - buffer_size_b (int): Size of the buffer to use for reading/writing data. Must be positive.
    - delete_file2_on_complete (bool): Whether to delete the second file after operation.
    - offset (int): Byte position in `second` at which reading starts. Must be non-negative.

    Raises:
    - ValueError: If `buffer_size_b` is not positive or `offset` is negative.
    - OSError: If either file cannot be opened, read, or written.
    """
    # Validate before touching either file so a bad argument can never leave `first`
    # half-modified (a zero buffer size would otherwise loop forever).
    if buffer_size_b <= 0:
        raise ValueError(f"buffer_size_b must be a positive integer, got {buffer_size_b}")
    if offset < 0:
        raise ValueError(f"offset must be non-negative, got {offset}")

    # "r+b" keeps the existing contents of `first` and fails if it does not exist.
    with open(first, "r+b") as f1, open(second, "rb") as f2:
        remaining = os.path.getsize(second) - offset

        if remaining > 0:
            # Writing sequentially from the end grows `first` on its own; no sentinel
            # byte is pre-written, so an empty copy cannot clobber existing data.
            f1.seek(0, os.SEEK_END)
            f2.seek(offset)

            while remaining > 0:
                new_data = f2.read(min(buffer_size_b, remaining))
                if not new_data:
                    # `second` ended earlier than its reported size; stop cleanly.
                    break
                f1.write(new_data)
                # Advance by what was actually read, not by what was requested.
                remaining -= len(new_data)

    if delete_file2_on_complete:
        os.remove(second)