Use sparse files

f-trycua
2025-04-15 10:02:13 -07:00
parent 33b3bf3ef5
commit 18f92c6a85
2 changed files with 278 additions and 200 deletions
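The idea behind this change: instead of writing the reassembled disk image front to back, the output file is created sparse (sized up front via truncate) and each part is written at its offset, so regions that are never written consume no disk blocks. A minimal shell illustration of the effect on macOS (sparse-disk.img is a hypothetical name, not part of this commit):

# Truncating a brand-new file out to its final size allocates no data blocks.
dd if=/dev/null of=sparse-disk.img bs=1m count=0 seek=40960   # 40 GiB logical size
ls -lh sparse-disk.img   # apparent (logical) size: 40G
du -h sparse-disk.img    # allocated size: ~0, only written blocks count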
@@ -83,19 +83,19 @@ done
# Authenticate with GitHub Container Registry
echo "$GITHUB_TOKEN" | oras login ghcr.io -u "$organization" --password-stdin
# Create a temporary directory for processing files
work_dir=$(mktemp -d)
echo "Working directory: $work_dir"
trap 'rm -rf "$work_dir"' EXIT
# Use the source folder path as the working directory and get its absolute path
work_dir=$(cd "$folder_path" && pwd)
echo "Working directory (persistent cache): $work_dir"
# Create a directory for all files
mkdir -p "$work_dir/files"
cd "$work_dir/files"
# Change to the working directory
cd "$work_dir"
files=() # Initialize files array here
# Copy config.json if it exists
if [ -f "$folder_path/config.json" ]; then
echo "Copying config.json..."
cp "$folder_path/config.json" config.json
files+=("config.json:application/vnd.oci.image.config.v1+json")
fi
# Copy nvram.bin if it exists
@@ -103,106 +103,104 @@ nvram_bin="$folder_path/nvram.bin"
if [ -f "$nvram_bin" ]; then
echo "Copying nvram.bin..."
cp "$nvram_bin" nvram.bin
files+=("nvram.bin:application/octet-stream")
fi
# Process disk.img if it exists and needs splitting
disk_img="$folder_path/disk.img"
if [ -f "$disk_img" ]; then
file_size=$(stat -f%z "$disk_img")
if [ $file_size -gt 524288000 ]; then # 500MB in bytes
echo "Splitting large file: disk.img"
echo "Original disk.img size: $(du -h "$disk_img" | cut -f1)"
# Copy and split the file with progress monitoring
echo "Copying disk image..."
pv "$disk_img" > disk.img
echo "Splitting file..."
split -b "$chunk_size" disk.img disk.img.part.
rm disk.img
# Process disk.img if it exists
disk_img_orig="disk.img" # Already in work_dir
if [ -f "$disk_img_orig" ]; then
# --- Compression Step ---
echo "Compressing $disk_img_orig..."
compressed_ext=".gz"
compressor="gzip"
compress_opts="-k -f"
compressed_disk_img="disk.img${compressed_ext}"
pv "$disk_img_orig" | $compressor $compress_opts > "$compressed_disk_img"
compressed_size=$(stat -f%z "$compressed_disk_img")
echo "Compressed disk image size: $(du -h "$compressed_disk_img" | cut -f1)"
# --- End Compression Step ---
# Get original file size for verification
original_size=$(stat -f%z "$disk_img")
echo "Original disk.img size: $(awk -v size=$original_size 'BEGIN {printf "%.2f GB", size/1024/1024/1024}')"
# Check if splitting is needed based on *compressed* size
if [ $compressed_size -gt 524288000 ]; then # 500MB threshold
echo "Splitting compressed file: $compressed_disk_img"
split -b "$chunk_size" "$compressed_disk_img" "$compressed_disk_img.part."
# Keep the compressed file and parts in work_dir
# Verify split parts total size
total_size=0
total_parts=$(ls disk.img.part.* | wc -l | tr -d ' ')
# --- Adjust part processing ---
parts_files=()
total_parts=$(ls "$compressed_disk_img.part."* | wc -l | tr -d ' ')
part_num=0
# Create array for files and their annotations
files=()
for part in disk.img.part.*; do
part_size=$(stat -f%z "$part")
total_size=$((total_size + part_size))
for part in "$compressed_disk_img.part."*; do
part_num=$((part_num + 1))
echo "Part $part: $(awk -v size=$part_size 'BEGIN {printf "%.2f GB", size/1024/1024/1024}')"
files+=("$part:application/vnd.oci.image.layer.v1.tar;part.number=$part_num;part.total=$total_parts")
# *** IMPORTANT: Use the *compressed* OCI media type with part info ***
parts_files+=("$part:${oci_layer_media_type};part.number=$part_num;part.total=$total_parts")
echo "Part $part: $(du -h "$part" | cut -f1)"
done
# Combine non-disk files with disk parts
files+=("${parts_files[@]}")
# --- End Adjust part processing ---
echo "Total size of parts: $(awk -v size=$total_size 'BEGIN {printf "%.2f GB", size/1024/1024/1024}')"
# Verify total size matches original
if [ $total_size -ne $original_size ]; then
echo "ERROR: Size mismatch!"
echo "Original file size: $(awk -v size=$original_size 'BEGIN {printf "%.2f GB", size/1024/1024/1024}')"
echo "Sum of parts size: $(awk -v size=$total_size 'BEGIN {printf "%.2f GB", size/1024/1024/1024}')"
echo "Difference: $(awk -v orig=$original_size -v total=$total_size 'BEGIN {printf "%.2f GB", (orig-total)/1024/1024/1024}')"
exit 1
fi
# Add remaining files
if [ -f "config.json" ]; then
files+=("config.json:application/vnd.oci.image.config.v1+json")
fi
if [ -f "nvram.bin" ]; then
files+=("nvram.bin:application/octet-stream")
fi
# Push versions in parallel
push_pids=()
for version in $image_versions; do
(
echo "Pushing version $version..."
oras push --disable-path-validation \
"ghcr.io/$organization/$image_name:$version" \
"${files[@]}"
echo "Completed push for version $version"
) &
push_pids+=($!)
done
# Wait for all pushes to complete
for pid in "${push_pids[@]}"; do
wait "$pid"
done
else
# Push disk.img directly if it's small enough
echo "Copying disk image..."
pv "$disk_img" > disk.img
# Push all files together
echo "Pushing all files..."
files=("disk.img:application/vnd.oci.image.layer.v1.tar")
if [ -f "config.json" ]; then
files+=("config.json:application/vnd.oci.image.config.v1+json")
fi
if [ -f "nvram.bin" ]; then
files+=("nvram.bin:application/octet-stream")
fi
# Add the single compressed file to the list
# *** IMPORTANT: Use the *compressed* OCI media type ***
files+=("$compressed_disk_img:${oci_layer_media_type}")
fi
for version in $image_versions; do
# Push all files in one command
# --- Push Logic (Remains largely the same, but $files now contains compressed parts/file) ---
push_pids=()
IFS=',' read -ra versions <<< "$image_versions"
for version in "${versions[@]}"; do
# Trim any whitespace left over from splitting the version list
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
echo "Pushing version $version..."
(
# Push all files for this version in a single oras invocation
oras push --disable-path-validation \
"ghcr.io/$organization/$image_name:$version" \
"${files[@]}"
done
echo "Completed push for version $version"
) &
push_pids+=($!)
done
# Wait for all pushes to complete
for pid in "${push_pids[@]}"; do
wait "$pid"
done
# --- Cleanup compressed files after successful push ---
echo "Push successful, cleaning up compressed artifacts..."
# Check if parts exist first
parts_exist=$(ls "$compressed_disk_img.part."* 2>/dev/null)
if [ -n "$parts_exist" ]; then
echo "Removing split parts: $compressed_disk_img.part.* and $compressed_disk_img"
rm -f "$compressed_disk_img.part."*
# Also remove the original compressed file that was split
rm -f "$compressed_disk_img"
elif [ -f "$compressed_disk_img" ]; then
echo "Removing compressed file: $compressed_disk_img"
rm -f "$compressed_disk_img"
fi
# --- End Push Logic ---
else
echo "Warning: $disk_img_orig not found."
# Push only config/nvram if they exist
if [ ${#files[@]} -gt 0 ]; then
# (Add push logic here too if you want to push even without disk.img)
echo "Pushing non-disk files..."
# ... (similar push loop as above) ...
else
echo "No files found to push."
exit 1
fi
fi
for version in $image_versions; do
for version in "${versions[@]}"; do
# Trim any whitespace left over from splitting the version list
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
echo "Upload complete: ghcr.io/$organization/$image_name:$version"
done
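Note that split emits consecutive byte ranges of a single gzip stream, so the parts are only decompressible after being concatenated in order. A quick local sanity check (disk_check.img is a hypothetical name; part names follow the scheme above):

# Shell globs sort part.aa, part.ab, ... lexicographically, matching split's output order.
cat disk.img.gz.part.* | gunzip > disk_check.img
cmp disk.img disk_check.img && echo "parts verified"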
@@ -809,94 +809,118 @@ class ImageContainerRegistry: @unchecked Sendable {
)
// Create sparse file of the required size
FileManager.default.createFile(atPath: outputURL.path, contents: nil)
let outputHandle = try FileHandle(forWritingTo: outputURL)
defer { try? outputHandle.close() }
// Set the file size without writing data (creates a sparse file)
try outputHandle.truncate(atOffset: expectedTotalSize)
var reassemblyProgressLogger = ProgressLogger(threshold: 0.05)
var processedSize: UInt64 = 0
var currentOffset: UInt64 = 0 // Track position in the final *decompressed* file
// Process each part in order
for partNum in 1...totalParts {
guard let (_, partURL) = diskParts.first(where: { $0.0 == partNum }) else {
// Find the original layer info for this part number
guard
let layer = manifest.layers.first(where: { layer in
if let info = extractPartInfo(from: layer.mediaType) {
return info.partNum == partNum
}
return false
}),
let (_, partURL) = diskParts.first(where: { $0.0 == partNum })
else {
throw PullError.missingPart(partNum)
}
let layerMediaType = layer.mediaType // The media type encodes whether (and how) this part is compressed
Logger.info(
"Processing part \(partNum) of \(totalParts): \(partURL.lastPathComponent)")
// Get part file size
let partAttributes = try FileManager.default.attributesOfItem(
atPath: partURL.path)
let partSize = partAttributes[.size] as? UInt64 ?? 0
// Calculate the offset in the final file (parts are sequential)
let partOffset = processedSize
// Open input file
let inputHandle = try FileHandle(forReadingFrom: partURL)
defer {
try? inputHandle.close()
// Clean up the downloaded part afterwards, unless it came from the cache
if !partURL.path.contains(cacheDirectory.path) {
try? FileManager.default.removeItem(at: partURL)
}
}
// Seek to the appropriate offset in output file
try outputHandle.seek(toOffset: partOffset)
// Seek to the correct offset in the output sparse file
try outputHandle.seek(toOffset: currentOffset)
// Copy data in chunks to avoid memory issues
let chunkSize: UInt64 =
determineIfMemoryConstrained() ? 256 * 1024 : 1024 * 1024 // Use smaller chunks (256KB-1MB)
var bytesWritten: UInt64 = 0
if let decompressCmd = getDecompressionCommand(for: layerMediaType) { // Use extracted mediaType
Logger.info("Decompressing part \(partNum)...")
let process = Process()
let pipe = Pipe()
process.executableURL = URL(fileURLWithPath: "/bin/sh")
process.arguments = ["-c", "\(decompressCmd) < \"\(partURL.path)\""] // Feed file via stdin redirection
process.standardOutput = pipe // Capture decompressed output
while bytesWritten < partSize {
// Use Foundation's autoreleasepool for proper memory management
Foundation.autoreleasepool {
let readSize: UInt64 = min(UInt64(chunkSize), partSize - bytesWritten)
if let chunk = try? inputHandle.read(upToCount: Int(readSize)) {
if !chunk.isEmpty {
try? outputHandle.write(contentsOf: chunk)
bytesWritten += UInt64(chunk.count)
try process.run()
// Update progress less frequently to reduce overhead
if bytesWritten % (chunkSize * 4) == 0
|| bytesWritten == partSize
{
let totalProgress =
Double(processedSize + bytesWritten)
/ Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling disk image")
}
}
let reader = pipe.fileHandleForReading
var partDecompressedSize: UInt64 = 0
// Read decompressed data in chunks and write to sparse file
while true {
let data = autoreleasepool { // Help manage memory with large files
reader.readData(ofLength: 1024 * 1024) // Read 1MB chunks
}
if data.isEmpty { break } // End of stream
// Add a small delay every few MB to allow memory cleanup
if partDecompressedSize % (chunkSize * 16) == 0 && partDecompressedSize > 0 {
// Use Thread.sleep for now, but ideally this would use a non-blocking approach
// that is appropriate for the context (sync/async)
Thread.sleep(forTimeInterval: 0.01)
}
try outputHandle.write(contentsOf: data)
partDecompressedSize += UInt64(data.count)
// Update progress based on decompressed size written
let totalProgress =
Double(currentOffset + partDecompressedSize)
/ Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling/Decompressing")
}
process.waitUntilExit()
if process.terminationStatus != 0 {
throw PullError.decompressionFailed("Part \(partNum)")
}
currentOffset += partDecompressedSize // Advance offset by decompressed size
} else {
// --- Handle non-compressed parts (if any, or the single file case) ---
// This part is similar to your original copy logic, writing directly
// from inputHandle to outputHandle at currentOffset
Logger.info("Copying non-compressed part \(partNum)...")
let partSize =
(try? FileManager.default.attributesOfItem(atPath: partURL.path)[.size]
as? UInt64) ?? 0
var bytesWritten: UInt64 = 0
let chunkSize = 1024 * 1024
while bytesWritten < partSize {
let data = autoreleasepool {
(try? inputHandle.read(upToCount: chunkSize)) ?? Data() // treat a read error as end-of-stream
}
if data.isEmpty { break }
try outputHandle.write(contentsOf: data)
bytesWritten += UInt64(data.count)
// Update progress
let totalProgress =
Double(currentOffset + bytesWritten) / Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling")
}
currentOffset += bytesWritten
// --- End non-compressed handling ---
}
// Update processed size
processedSize += partSize
// Ensure data is written before processing next part (optional but safer)
try outputHandle.synchronize()
}
// Finalize progress
reassemblyProgressLogger.logProgress(
current: 1.0, context: "Reassembling disk image")
Logger.info("") // Newline after progress
// Close the output file
try outputHandle.synchronize()
try outputHandle.close()
// Finalize progress, close handle (done by defer)
reassemblyProgressLogger.logProgress(current: 1.0, context: "Reassembly Complete")
Logger.info("") // Newline
// Verify final size
let finalSize =
@@ -1031,86 +1055,112 @@ class ImageContainerRegistry: @unchecked Sendable {
)
// Create sparse file of the required size
FileManager.default.createFile(atPath: outputURL.path, contents: nil)
let outputHandle = try FileHandle(forWritingTo: outputURL)
defer { try? outputHandle.close() }
// Set the file size without writing data (creates a sparse file)
try outputHandle.truncate(atOffset: expectedTotalSize)
var reassemblyProgressLogger = ProgressLogger(threshold: 0.05)
var processedSize: UInt64 = 0
var currentOffset: UInt64 = 0 // Track position in the final *decompressed* file
// Process each part in order
for partNum in 1...totalParts {
guard let (_, sourceURL) = diskPartSources.first(where: { $0.0 == partNum }) else {
// Find the original layer info for this part number
guard
let layer = manifest.layers.first(where: { layer in
if let info = extractPartInfo(from: layer.mediaType) {
return info.partNum == partNum
}
return false
}),
let (_, sourceURL) = diskPartSources.first(where: { $0.0 == partNum })
else {
throw PullError.missingPart(partNum)
}
let layerMediaType = layer.mediaType // The media type encodes whether (and how) this part is compressed
Logger.info(
"Processing part \(partNum) of \(totalParts) from cache: \(sourceURL.lastPathComponent)"
)
// Get part file size
let partAttributes = try FileManager.default.attributesOfItem(
atPath: sourceURL.path)
let partSize = partAttributes[.size] as? UInt64 ?? 0
// Calculate the offset in the final file (parts are sequential)
let partOffset = processedSize
// Open input file
let inputHandle = try FileHandle(forReadingFrom: sourceURL)
defer { try? inputHandle.close() }
// Seek to the appropriate offset in output file
try outputHandle.seek(toOffset: partOffset)
// Seek to the correct offset in the output sparse file
try outputHandle.seek(toOffset: currentOffset)
// Copy data in chunks to avoid memory issues
let chunkSize: UInt64 = determineIfMemoryConstrained() ? 256 * 1024 : 1024 * 1024 // Use smaller chunks (256KB-1MB)
var bytesWritten: UInt64 = 0
if let decompressCmd = getDecompressionCommand(for: layerMediaType) { // Use extracted mediaType
Logger.info("Decompressing part \(partNum)...")
let process = Process()
let pipe = Pipe()
process.executableURL = URL(fileURLWithPath: "/bin/sh")
process.arguments = ["-c", "\(decompressCmd) < \"\(sourceURL.path)\""] // Feed file via stdin redirection
process.standardOutput = pipe // Capture decompressed output
while bytesWritten < partSize {
// Use Foundation's autoreleasepool for proper memory management
Foundation.autoreleasepool {
let readSize: UInt64 = min(UInt64(chunkSize), partSize - bytesWritten)
if let chunk = try? inputHandle.read(upToCount: Int(readSize)) {
if !chunk.isEmpty {
try? outputHandle.write(contentsOf: chunk)
bytesWritten += UInt64(chunk.count)
try process.run()
// Update progress less frequently to reduce overhead
if bytesWritten % (chunkSize * 4) == 0 || bytesWritten == partSize {
let totalProgress =
Double(processedSize + bytesWritten)
/ Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling disk image from cache")
}
}
let reader = pipe.fileHandleForReading
var partDecompressedSize: UInt64 = 0
// Read decompressed data in chunks and write to sparse file
while true {
let data = autoreleasepool { // Help manage memory with large files
reader.readData(ofLength: 1024 * 1024) // Read 1MB chunks
}
if data.isEmpty { break } // End of stream
// Add a small delay every few MB to allow memory cleanup
if partDecompressedSize % (chunkSize * 16) == 0 && partDecompressedSize > 0 {
// Use Thread.sleep for now, but ideally this would use a non-blocking approach
// that is appropriate for the context (sync/async)
Thread.sleep(forTimeInterval: 0.01)
}
try outputHandle.write(contentsOf: data)
partDecompressedSize += UInt64(data.count)
// Update progress based on decompressed size written
let totalProgress =
Double(currentOffset + partDecompressedSize) / Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling")
}
process.waitUntilExit()
if process.terminationStatus != 0 {
throw PullError.decompressionFailed("Part \(partNum)")
}
currentOffset += partDecompressedSize // Advance offset by decompressed size
} else {
// --- Handle non-compressed parts (if any, or the single file case) ---
// This part is similar to your original copy logic, writing directly
// from inputHandle to outputHandle at currentOffset
Logger.info("Copying non-compressed part \(partNum)...")
let partSize =
(try? FileManager.default.attributesOfItem(atPath: sourceURL.path)[.size]
as? UInt64) ?? 0
var bytesWritten: UInt64 = 0
let chunkSize = 1024 * 1024
while bytesWritten < partSize {
let data = autoreleasepool {
(try? inputHandle.read(upToCount: chunkSize)) ?? Data() // treat a read error as end-of-stream
}
if data.isEmpty { break }
try outputHandle.write(contentsOf: data)
bytesWritten += UInt64(data.count)
// Update progress
let totalProgress =
Double(currentOffset + bytesWritten) / Double(expectedTotalSize)
reassemblyProgressLogger.logProgress(
current: totalProgress,
context: "Reassembling")
}
currentOffset += bytesWritten
// --- End non-compressed handling ---
}
// Update processed size
processedSize += partSize
// Ensure data is written before processing next part (optional but safer)
try outputHandle.synchronize()
}
// Finalize progress
reassemblyProgressLogger.logProgress(
current: 1.0, context: "Reassembling disk image from cache")
Logger.info("") // Newline after progress
// Close the output file
try outputHandle.synchronize()
try outputHandle.close()
// Finalize progress, close handle (done by defer)
reassemblyProgressLogger.logProgress(current: 1.0, context: "Reassembly Complete")
Logger.info("") // Newline
// Verify final size
let finalSize =
@@ -1646,4 +1696,34 @@ class ImageContainerRegistry: @unchecked Sendable {
return nil
}
// Add helper to check media type and get decompress command
private func getDecompressionCommand(for mediaType: String) -> String? {
if mediaType.hasSuffix("+gzip") {
return "/usr/bin/gunzip -c" // -c writes to stdout
} else if mediaType.hasSuffix("+zstd") {
// zstd is not bundled with macOS; look it up on PATH and fall back to a common Homebrew/MacPorts location
let zstdPath = findExecutablePath(named: "zstd") ?? "/usr/local/bin/zstd"
return "\(zstdPath) -dc" // -d decompress, -c stdout
}
return nil // Not compressed or unknown compression
}
// Helper to locate an executable by searching PATH (with common fallback directories)
private func findExecutablePath(named executableName: String) -> String? {
let pathEnv =
ProcessInfo.processInfo.environment["PATH"]
?? "/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin:/opt/homebrew/bin"
let paths = pathEnv.split(separator: ":")
for path in paths {
let executablePath = URL(fileURLWithPath: String(path)).appendingPathComponent(
executableName
).path
if FileManager.default.isExecutableFile(atPath: executablePath) {
return executablePath
}
}
return nil
}
}
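For quick experimentation outside Swift, the media-type suffix convention that getDecompressionCommand relies on can be mirrored in the shell. A sketch (decompress_cmd is a hypothetical helper, assuming the +gzip/+zstd suffixes used above):

decompress_cmd() {
  # Map an OCI media-type suffix to a streaming decompressor.
  case "$1" in
    *+gzip) echo "gunzip -c" ;;   # -c writes to stdout
    *+zstd) echo "zstd -dc" ;;    # -d decompress, -c to stdout
    *) return 1 ;;                # not compressed, or unknown scheme
  esac
}
decompress_cmd "application/vnd.oci.image.layer.v1.tar+gzip"   # prints: gunzip -c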