Sparse file optimizations

f-trycua committed 2025-04-19 09:59:32 +02:00
parent 18f92c6a85
commit 353f3cf45d
3 changed files with 1445 additions and 250 deletions

@@ -9,6 +9,8 @@ folder_path=""
image_name=""
image_versions=""
chunk_size="500M" # Default chunk size for splitting large files
# Define the OCI media type for the compressed disk layer
oci_layer_media_type="application/octet-stream+lzfse" # Apple Archive format
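# The custom media type lets a consumer tell the compressed disk layer apart
# from the config/nvram layers in the pushed manifest. Illustrative sketch,
# not part of this script (assumes oras and jq are installed):
#   oras manifest fetch "ghcr.io/$organization/$image_name:$version" \
#     | jq -r --arg mt "$oci_layer_media_type" \
#         '.layers[] | select(.mediaType == $mt) | .digest'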
# Parse the command line arguments
while [[ $# -gt 0 ]]; do
@@ -41,6 +43,7 @@ while [[ $# -gt 0 ]]; do
echo " --image-name <name> : Name of the image to publish (required)"
echo " --image-versions <versions> : Comma separated list of versions of the image to publish (required)"
echo " --chunk-size <size> : Size of chunks for large files (e.g., 500M, default: 500M)"
echo "Note: The script will automatically resume from the last attempt if available"
exit 0
;;
*)
@@ -69,7 +72,7 @@ if [[ ! -d "$folder_path" ]]; then
fi
# Check and install required tools
for tool in "oras" "split" "pv" "gzip"; do
for tool in "oras" "split" "pv" "jq"; do
if ! command -v "$tool" &> /dev/null; then
echo "$tool is not installed. Installing using Homebrew..."
if ! command -v brew &> /dev/null; then
@@ -80,80 +83,252 @@ for tool in "oras" "split" "pv" "gzip"; do
fi
done
# Check if Apple Archive is available
if ! command -v compression_tool &> /dev/null; then
echo "Error: Apple Archive (compression_tool) is required but not found"
echo "This script requires macOS with Apple Archive support"
exit 1
fi
echo "Apple Archive detected - will use for optimal sparse file handling"
compressed_ext=".aa"
# Authenticate with GitHub Container Registry
echo "$GITHUB_TOKEN" | oras login ghcr.io -u "$organization" --password-stdin
# Use the source folder path as the working directory and get its absolute path
work_dir=$(cd "$folder_path" && pwd)
echo "Working directory (persistent cache): $work_dir"
echo "Working directory: $work_dir"
# Change to the working directory
cd "$work_dir"
# Function to find the most recent cache directory
find_latest_cache() {
local latest_cache=$(ls -td "$work_dir"/.ghcr_cache_* 2>/dev/null | head -n1)
if [ -n "$latest_cache" ]; then
echo "$latest_cache"
else
echo ""
fi
}
# Function to check if a cache directory is valid for resuming
is_valid_cache() {
local cache_dir="$1"
# Check if it contains the necessary files
[ -f "$cache_dir/config.json" ] || [ -f "$cache_dir/nvram.bin" ] || \
[ -f "$cache_dir/disk.img.aa" ] || ls "$cache_dir"/disk.img.aa.part.* 1>/dev/null 2>&1
}
# Always try to find and use an existing cache
existing_cache=$(find_latest_cache)
if [ -n "$existing_cache" ] && is_valid_cache "$existing_cache"; then
cache_dir="$existing_cache"
# Check if the cache contains old gzip format
if [ -f "$cache_dir/disk.img.gz" ] || ls "$cache_dir"/disk.img.gz.part.* 1>/dev/null 2>&1; then
echo "Error: Found legacy gzip format in cache. This script only supports Apple Archive format."
echo "Please delete the cache directory and start fresh: $cache_dir"
exit 1
fi
echo "Resuming from existing cache: $cache_dir"
else
echo "No valid cache found. Starting fresh."
cache_dir="$work_dir/.ghcr_cache_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$cache_dir"
fi
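# With the date format above, a fresh run creates a directory named e.g.
# .ghcr_cache_20250419_101530 inside the source folder.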
echo "Using cache directory: $cache_dir"
# Display space information
echo "=== DISK SPACE INFORMATION ==="
df -h "$cache_dir" | head -1
df -h "$cache_dir" | grep -v "Filesystem"
echo
# Change to the cache directory
cd "$cache_dir"
files=() # Initialize files array here
# Copy config.json if it exists
if [ -f "$folder_path/config.json" ]; then
echo "Copying config.json..."
cp "$folder_path/config.json" config.json
# Function to check if a version was already pushed
version_pushed() {
local version="$1"
local version_file="$cache_dir/.pushed_$version"
[ -f "$version_file" ]
}
# Function to mark a version as pushed
mark_version_pushed() {
local version="$1"
touch "$cache_dir/.pushed_$version"
}
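# Together these helpers implement a simple resume protocol: one zero-byte
# marker file per pushed version. Usage sketch (version string illustrative):
#   if ! version_pushed "1.0"; then
#       do_push "1.0" && mark_version_pushed "1.0"   # do_push is hypothetical
#   fi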
# Copy config.json if it exists and not already in cache
config_json_source="$folder_path/config.json"
config_json_dest="$cache_dir/config.json"
if [ -f "$config_json_source" ]; then
if [ ! -f "$config_json_dest" ]; then
echo "Copying config.json..."
# Add the uncompressed disk size annotation if disk.img exists and jq is available
if [ -n "$original_disk_size" ] && command -v jq &> /dev/null; then
echo "Adding uncompressed disk size annotation: $original_disk_size bytes"
jq --arg size "$original_disk_size" '.annotations += {"com.trycua.lume.disk.uncompressed_size": $size}' "$config_json_source" > "$config_json_dest" || \
(echo "jq failed, copying original config.json"; cp "$config_json_source" "$config_json_dest") # Fallback to copy if jq fails
else
cp "$config_json_source" "$config_json_dest"
fi
fi
fi
if [ -f "$config_json_dest" ]; then
files+=("config.json:application/vnd.oci.image.config.v1+json")
fi
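# After a pull, the annotation written above can be read back from the
# pulled config (sketch, assumes jq):
#   jq -r '.annotations["com.trycua.lume.disk.uncompressed_size"]' config.json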
# Copy nvram.bin if it exists
nvram_bin="$folder_path/nvram.bin"
if [ -f "$nvram_bin" ]; then
# Copy nvram.bin if it exists and not already in cache
if [ -f "$folder_path/nvram.bin" ] && [ ! -f "$cache_dir/nvram.bin" ]; then
echo "Copying nvram.bin..."
cp "$nvram_bin" nvram.bin
cp "$folder_path/nvram.bin" nvram.bin
fi
if [ -f "$cache_dir/nvram.bin" ]; then
files+=("nvram.bin:application/octet-stream")
fi
# Process disk.img if it exists
disk_img_orig="disk.img" # Already in work_dir
disk_img_orig="$folder_path/disk.img"
original_disk_size=""
if [ -f "$disk_img_orig" ]; then
# --- Compression Step ---
echo "Compressing $disk_img_orig..."
compressed_ext=".gz"
compressor="gzip"
compress_opts="-k -f"
# Get original size *before* compression
original_disk_size=$(stat -f%z "$disk_img_orig")
# Get real (non-sparse) size
real_size=$(du -k "$disk_img_orig" | cut -f1)
real_size_bytes=$((real_size * 1024))
sparseness_ratio=$(echo "scale=2; $original_disk_size / $real_size_bytes" | bc)
echo "Disk image: $disk_img_orig"
echo " Logical size: $original_disk_size bytes ($(du -h "$disk_img_orig" | cut -f1))"
echo " Actual disk usage: $((real_size_bytes / 1073741824)) GB"
echo " Sparseness ratio: ${sparseness_ratio}:1"
# Check if we already have compressed files in the cache
compressed_disk_img="disk.img${compressed_ext}"
pv "$disk_img_orig" | $compressor $compress_opts > "$compressed_disk_img"
compressed_size=$(stat -f%z "$compressed_disk_img")
echo "Compressed disk image size: $(du -h "$compressed_disk_img" | cut -f1)"
# --- End Compression Step ---
# Check if splitting is needed based on *compressed* size
if [ $compressed_size -gt 524288000 ]; then # 500MB threshold
echo "Splitting compressed file: $compressed_disk_img"
split -b "$chunk_size" "$compressed_disk_img" "$compressed_disk_img.part."
# Keep the compressed file and parts in work_dir
# --- Adjust part processing ---
parts_files=()
total_parts=$(ls "$compressed_disk_img.part."* | wc -l | tr -d ' ')
part_num=0
for part in "$compressed_disk_img.part."*; do
part_num=$((part_num + 1))
# *** IMPORTANT: Use the *compressed* OCI media type with part info ***
parts_files+=("$part:${oci_layer_media_type};part.number=$part_num;part.total=$total_parts")
echo "Part $part: $(du -h "$part" | cut -f1)"
done
# Combine non-disk files with disk parts
files+=("${parts_files[@]}")
# --- End Adjust part processing ---
else
# Add the single compressed file to the list
# *** IMPORTANT: Use the *compressed* OCI media type ***
files+=("$compressed_disk_img:${oci_layer_media_type}")
already_compressed=false
if [ -f "$cache_dir/$compressed_disk_img" ]; then
already_compressed=true
echo "Using existing compressed file from cache: $compressed_disk_img"
elif ls "$cache_dir"/disk.img${compressed_ext}.part.* 1>/dev/null 2>&1; then
already_compressed=true
echo "Using existing compressed parts from cache"
fi
# --- Push Logic (Remains largely the same, but $files now contains compressed parts/file) ---
# Only compress if not already compressed in cache
if [ "$already_compressed" = false ]; then
# Check for free disk space before compression
avail_space=$(df -k "$cache_dir" | tail -1 | awk '{print $4}')
avail_space_bytes=$((avail_space * 1024))
# Assume compressed size is roughly 30% of real size as a safe estimate
estimated_compressed=$((real_size_bytes * 30 / 100))
if [ "$avail_space_bytes" -lt "$estimated_compressed" ]; then
echo "WARNING: Possibly insufficient disk space for compression!"
echo "Available: $((avail_space_bytes / 1073741824)) GB, Estimated required: $((estimated_compressed / 1073741824)) GB"
read -p "Continue anyway? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
echo "Exiting. Free up some space and try again."
exit 1
fi
fi
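# Worked example for the 30% heuristic: 50 GiB of allocated (non-sparse) data
# yields estimated_compressed = 50 GiB * 30 / 100 = 15 GiB of free space
# required before compression is attempted.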
# --- Compression Step ---
echo "Compressing $disk_img_orig with Apple Archive..."
# Apple Archive compression
echo "Starting compression with Apple Archive (showing output file growth)..."
compression_tool -encode -i "$disk_img_orig" -o "$compressed_disk_img" -a lzfse &
COMP_PID=$!
sleep 1 # Give compression a moment to start
# Display progress based on output file growth
while kill -0 $COMP_PID 2>/dev/null; do
if [ -f "$compressed_disk_img" ]; then
current_size=$(stat -f%z "$compressed_disk_img" 2>/dev/null || echo 0)
percent=$(echo "scale=2; 100 * $current_size / $original_disk_size" | bc)
echo -ne "Progress: $percent% ($(du -h "$compressed_disk_img" 2>/dev/null | cut -f1 || echo "0"))\r"
else
echo -ne "Preparing compression...\r"
fi
sleep 2
done
wait $COMP_PID || { echo -e "\nError: Apple Archive compression failed"; exit 1; }
echo -e "\nCompression complete!"
compressed_size=$(stat -f%z "$compressed_disk_img")
echo "Compressed disk image size: $(du -h "$compressed_disk_img" | cut -f1)"
echo "Compression ratio: $(echo "scale=2; $compressed_size * 100 / $original_disk_size" | bc)%"
# --- End Compression Step ---
# Check if splitting is needed based on *compressed* size
if [ $compressed_size -gt 524288000 ]; then # 500MB threshold
echo "Splitting compressed file into chunks of $chunk_size..."
pv "$compressed_disk_img" | split -b "$chunk_size" - "$compressed_disk_img.part."
rm -f "$compressed_disk_img" # Remove the unsplit compressed file
# Verify that parts were created
echo "Verifying split parts..."
ls -la "$cache_dir"/disk.img${compressed_ext}.part.*
fi
else
echo "Using existing compressed/split files from cache"
fi
# --- Adjust part processing ---
echo "Looking for compressed files in $cache_dir..."
# List all files in the cache directory for debugging
ls -la "$cache_dir"
if [ -f "$cache_dir/$compressed_disk_img" ]; then
echo "Found single compressed file: $compressed_disk_img"
# Add the single compressed file to the list
files+=("$compressed_disk_img:${oci_layer_media_type}")
else
# Look for split parts
part_files=($(ls "$cache_dir"/disk.img${compressed_ext}.part.* 2>/dev/null || echo ""))
if [ ${#part_files[@]} -gt 0 ]; then
echo "Found ${#part_files[@]} split parts"
parts_files=()
part_num=0
for part in "${part_files[@]}"; do
part_num=$((part_num + 1))
part_basename=$(basename "$part")
parts_files+=("$part_basename:${oci_layer_media_type};part.number=$part_num;part.total=${#part_files[@]}")
echo "Part $part_num: $(du -h "$part" | cut -f1)"
done
files+=("${parts_files[@]}")
else
echo "ERROR: No compressed files found in cache directory: $cache_dir"
echo "Contents of cache directory:"
find "$cache_dir" -type f | sort
exit 1
fi
fi
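# Because split(1) names chunks in lexicographic order (.part.aa, .part.ab, ...),
# a puller can reassemble the archive with a simple glob even without the
# part.number/part.total annotations (sketch):
#   cat disk.img.aa.part.* > disk.img.aa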
# --- Push Logic ---
push_pids=()
IFS=',' read -ra versions <<< "$image_versions"
for version in "${versions[@]}"; do
# Trim whitespace if any from version splitting
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
# Skip if version was already pushed
if version_pushed "$version"; then
echo "Version $version was already pushed, skipping..."
continue
fi
echo "Pushing version $version..."
(
# Use process substitution to feed file list safely if it gets long
@@ -161,6 +336,7 @@ if [ -f "$disk_img_orig" ]; then
"ghcr.io/$organization/$image_name:$version" \
"${files[@]}"
echo "Completed push for version $version"
mark_version_pushed "$version"
) &
push_pids+=($!)
done
@@ -170,37 +346,108 @@ if [ -f "$disk_img_orig" ]; then
wait "$pid"
done
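# Note: wait "$pid" returns each subshell's exit status; a stricter variant
# would record failures directly rather than relying on the marker files
# alone (sketch):
#   for pid in "${push_pids[@]}"; do wait "$pid" || push_failed=true; done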
# --- Cleanup compressed files after successful push ---
echo "Push successful, cleaning up compressed artifacts..."
# Check if parts exist first
parts_exist=$(ls "$compressed_disk_img.part."* 2>/dev/null)
if [ -n "$parts_exist" ]; then
echo "Removing split parts: $compressed_disk_img.part.* and $compressed_disk_img"
rm -f "$compressed_disk_img.part."*
# Also remove the original compressed file that was split
rm -f "$compressed_disk_img"
elif [ -f "$compressed_disk_img" ]; then
echo "Removing compressed file: $compressed_disk_img"
rm -f "$compressed_disk_img"
# --- Cleanup only if all versions were pushed successfully ---
all_versions_pushed=true
for version in "${versions[@]}"; do
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
if ! version_pushed "$version"; then
all_versions_pushed=false
break
fi
done
if [ "$all_versions_pushed" = true ]; then
echo "All versions pushed successfully, cleaning up cache directory..."
cd "$work_dir"
rm -rf "$cache_dir"
else
echo "Some versions failed to push. Cache directory preserved at: $cache_dir"
echo "Run again to resume from this point"
fi
# --- End Push Logic ---
else
echo "Warning: $disk_img_orig not found."
# Push only config/nvram if they exist
if [ ${#files[@]} -gt 0 ]; then
# (Add push logic here too if you want to push even without disk.img)
echo "Pushing non-disk files..."
# ... (similar push loop as above) ...
echo "Pushing non-disk files..."
push_pids=()
IFS=',' read -ra versions <<< "$image_versions"
for version in "${versions[@]}"; do
# Trim whitespace if any from version splitting
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
# Skip if version was already pushed
if version_pushed "$version"; then
echo "Version $version was already pushed, skipping..."
continue
fi
echo "Pushing version $version (config/nvram only)..."
(
oras push --disable-path-validation \
"ghcr.io/$organization/$image_name:$version" \
"${files[@]}"
echo "Completed push for version $version"
mark_version_pushed "$version"
) &
push_pids+=($!)
done
# Wait for all pushes to complete
for pid in "${push_pids[@]}"; do
wait "$pid"
done
# --- Cleanup only if all versions were pushed successfully ---
all_versions_pushed=true
for version in "${versions[@]}"; do
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
if ! version_pushed "$version"; then
all_versions_pushed=false
break
fi
done
if [ "$all_versions_pushed" = true ]; then
echo "All non-disk versions pushed successfully, cleaning up cache directory..."
cd "$work_dir"
rm -rf "$cache_dir"
else
echo "Some non-disk versions failed to push. Cache directory preserved at: $cache_dir"
echo "Run again to resume from this point"
fi
else
echo "No files found to push."
cd "$work_dir"
rm -rf "$cache_dir"
exit 1
fi
fi
for version in "${versions[@]}"; do
# Trim whitespace if any from version splitting
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
echo "Upload complete: ghcr.io/$organization/$image_name:$version"
done
# Determine final status based on the success check *before* potential cleanup
echo # Add a newline for better readability
if [ "$all_versions_pushed" = true ]; then
echo "All versions pushed successfully:"
for version in "${versions[@]}"; do
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
echo " Upload complete: ghcr.io/$organization/$image_name:$version"
done
else
echo "Final upload status:"
for version in "${versions[@]}"; do
version=$(echo "$version" | xargs)
if [[ -z "$version" ]]; then continue; fi
# Check the marker file only if the overall process failed (cache preserved)
if version_pushed "$version"; then
echo " Upload complete: ghcr.io/$organization/$image_name:$version"
else
echo " Upload failed: ghcr.io/$organization/$image_name:$version"
fi
done
# Exit with error code if any version failed
exit 1
fi
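A typical invocation might look like the following (sketch: the script filename, token, and paths are illustrative; --folder-path is inferred from the folder_path variable, and the organization value is assumed to be supplied elsewhere in the script):

GITHUB_TOKEN=<token> ./publish_image.sh \
    --folder-path ~/vms/macos-image \
    --image-name macos-image \
    --image-versions "1.0,latest" \
    --chunk-size 500M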