FF-1507: Comic PDF Extractor now tries to read the image type from the PDF and falls back to JPG if it cannot

2025-12-20 14:59:31 -06:00 · 2024-04-24 09:06:37 +12:00
parent 90be3d7530
commit 478cf768c6
4 changed files with 64 additions and 5 deletions
--- a/ComicNodes/Helpers/PdfHelper.cs
+++ b/ComicNodes/Helpers/PdfHelper.cs
@@ -2,8 +2,14 @@
 using Docnet.Core.Editors;
 using Docnet.Core.Models;
 using SixLabors.ImageSharp;
+using SixLabors.ImageSharp.Formats;
+using SixLabors.ImageSharp.Formats.Bmp;
+using SixLabors.ImageSharp.Formats.Gif;
+using SixLabors.ImageSharp.Formats.Jpeg;
+using SixLabors.ImageSharp.Formats.Png;
+using SixLabors.ImageSharp.Formats.Tiff;
+using SixLabors.ImageSharp.Formats.Webp;
 using SixLabors.ImageSharp.PixelFormats;
-using System.Text.RegularExpressions;

 namespace FileFlows.ComicNodes.Helpers;

@@ -27,8 +33,26 @@ internal class PdfHelper
            var height = pageReader.GetPageHeight();

            using var image = Image.LoadPixelData<Bgra32>(rawBytes, width, height);
-            string file = Path.Combine(destinationDirectory, filePrefix + "-" + i.ToString(new String('0', pageCount.ToString().Length)) + ".png");
-            image.SaveAsPng(file);
+            
+            // Infer the image format
+            (IImageFormat? imageFormat, string? fileExtension) = InferImageFormat(rawBytes);
+            if (imageFormat == null)
+            {
+                args?.Logger?.WLog("Failed to inter image type from PDF, failing back to JPG");
+                imageFormat = JpegFormat.Instance;
+                fileExtension = "jpg";
+            }
+            else
+            {
+                args?.Logger?.ILog("File Extension of image: " + fileExtension);
+            }
+
+            var file = Path.Combine(destinationDirectory, filePrefix + "-" + i.ToString(new string('0', pageCount.ToString().Length))) + "." + fileExtension;
+
+            using (var outputStream = File.Create(file + "." + fileExtension))
+            {
+                image.Save(outputStream, imageFormat);
+            }

            if (args?.PartPercentageUpdate != null)
            {
@@ -44,6 +68,41 @@ internal class PdfHelper
            args?.PartPercentageUpdate(halfProgress ? 50 : 0);
    }

+    /// <summary>
+    /// Infers the image format from the leading signature ("magic number") bytes of the image data.
+    /// </summary>
+    /// <param name="bytes">The image data bytes.</param>
+    /// <returns>The inferred image format and file extension; both are null when nothing is detected.</returns>
+    private static (IImageFormat? Format, string? Extension) InferImageFormat(byte[] bytes)
+    {
+        // NOTE(review): the caller also passes this buffer to Image.LoadPixelData, so it may hold raw pixels - confirm
+        // Signatures are compared byte by byte: BitConverter follows host byte order and would not match them
+        bool StartsWith(int offset, params byte[] magic) =>
+            bytes.Length >= offset + magic.Length && bytes.AsSpan(offset, magic.Length).SequenceEqual(magic);
+        if (StartsWith(0, 0xFF, 0xD8)) // JPEG
+            return (JpegFormat.Instance, "jpg");
+        if (StartsWith(0, 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A)) // PNG
+            return (PngFormat.Instance, "png");
+        if (StartsWith(0, 0x47, 0x49, 0x46, 0x38)) // GIF ("GIF8")
+            return (GifFormat.Instance, "gif");
+        if (StartsWith(0, 0x52, 0x49, 0x46, 0x46) && StartsWith(8, 0x57, 0x45, 0x42, 0x50)) // WebP ("RIFF"...."WEBP")
+            return (WebpFormat.Instance, "webp");
+        if (StartsWith(0, 0x49, 0x49, 0x2A, 0x00) || StartsWith(0, 0x4D, 0x4D, 0x00, 0x2A)) // TIFF ("II" or "MM" order)
+            return (TiffFormat.Instance, "tiff");
+        if (StartsWith(0, 0x42, 0x4D)) // BMP
+            return (BmpFormat.Instance, "bmp");
+        // Unknown signature: let ImageSharp try; a null result or any exception is reported as "not detected"
+        try
+        {
+            IImageFormat? format = Image.DetectFormat(bytes);
+            return (format, format == null ? null : format.DefaultMimeType?.Split('/')[1] ?? "png");
+        }
+        catch (Exception)
+        {
+            return (null, null);
+        }
+    }
+

    /// <summary>
    /// Creates a PDF from images
--- a/ComicNodes/Tests/ExtractTests.cs
+++ b/ComicNodes/Tests/ExtractTests.cs
@@ -12,10 +12,10 @@ public class ExtractTests
    public void Extract_Pdf()
    {
        var logger = new TestLogger();
-        var args = new NodeParameters(@"D:\comics\testfiles\fp1.pdf", logger, false, string.Empty, null);
+        var args = new NodeParameters(@"/home/john/Comics/unprocessed/Ghosts Book 1 Excerpt.pdf", logger, false, string.Empty, null);

        var node = new ComicExtractor();
-        node.DestinationPath = @"D:\comics\converted\pdf";
+        node.DestinationPath = @"/home/john/Comics/extracted";
        if (Directory.Exists(node.DestinationPath))
            Directory.Delete(node.DestinationPath, true);   
        Directory.CreateDirectory(node.DestinationPath);
--- a/FileFlows.Plugin.dll
+++ b/FileFlows.Plugin.dll
--- a/FileFlows.Plugin.pdb
+++ b/FileFlows.Plugin.pdb