FF-1507: Comic PDF Extractor now tries to read the image type from the PDF and falls back to JPG if it cannot

This commit is contained in:
John Andrews
2024-04-24 09:06:37 +12:00
parent 90be3d7530
commit 478cf768c6
4 changed files with 64 additions and 5 deletions

View File

@@ -2,8 +2,14 @@
using Docnet.Core.Editors;
using Docnet.Core.Models;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.Formats;
using SixLabors.ImageSharp.Formats.Bmp;
using SixLabors.ImageSharp.Formats.Gif;
using SixLabors.ImageSharp.Formats.Jpeg;
using SixLabors.ImageSharp.Formats.Png;
using SixLabors.ImageSharp.Formats.Tiff;
using SixLabors.ImageSharp.Formats.Webp;
using SixLabors.ImageSharp.PixelFormats;
using System.Text.RegularExpressions;
namespace FileFlows.ComicNodes.Helpers;
@@ -27,8 +33,26 @@ internal class PdfHelper
var height = pageReader.GetPageHeight();
using var image = Image.LoadPixelData<Bgra32>(rawBytes, width, height);
string file = Path.Combine(destinationDirectory, filePrefix + "-" + i.ToString(new String('0', pageCount.ToString().Length)) + ".png");
image.SaveAsPng(file);
// Infer the image format
(IImageFormat? imageFormat, string? fileExtension) = InferImageFormat(rawBytes);
if (imageFormat == null)
{
args?.Logger?.WLog("Failed to inter image type from PDF, failing back to JPG");
imageFormat = JpegFormat.Instance;
fileExtension = "jpg";
}
else
{
args?.Logger?.ILog("File Extension of image: " + fileExtension);
}
var file = Path.Combine(destinationDirectory, filePrefix + "-" + i.ToString(new string('0', pageCount.ToString().Length))) + "." + fileExtension;
using (var outputStream = File.Create(file + "." + fileExtension))
{
image.Save(outputStream, imageFormat);
}
if (args?.PartPercentageUpdate != null)
{
@@ -44,6 +68,41 @@ internal class PdfHelper
args?.PartPercentageUpdate(halfProgress ? 50 : 0);
}
/// <summary>
/// Infers the image format from the leading signature ("magic number") bytes of the image data.
/// </summary>
/// <remarks>
/// Signatures are compared byte-by-byte so the result does not depend on the
/// endianness of the host. If no known signature matches, detection is delegated
/// to <see cref="Image.DetectFormat(ReadOnlySpan{byte})"/>.
/// </remarks>
/// <param name="bytes">The image data bytes.</param>
/// <returns>
/// The inferred image format and its file extension (without a leading dot),
/// or <c>(null, null)</c> if the format could not be determined.
/// </returns>
private static (IImageFormat? Format, string? Extension) InferImageFormat(byte[] bytes)
{
    if (bytes == null || bytes.Length == 0)
        return (null, null);

    // Try to infer image format based on magic numbers
    if (HasSignature(bytes, 0, 0xFF, 0xD8)) // JPEG (SOI marker)
        return (JpegFormat.Instance, "jpg");

    if (HasSignature(bytes, 0, 0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A)) // PNG
        return (PngFormat.Instance, "png");

    if (HasSignature(bytes, 0, 0x47, 0x49, 0x46, 0x38)) // GIF ("GIF8")
        return (GifFormat.Instance, "gif");

    // WebP is a RIFF container: "RIFF" at offset 0 and "WEBP" at offset 8.
    // HasSignature bounds-checks both reads, so short RIFF-prefixed buffers are safe.
    if (HasSignature(bytes, 0, 0x52, 0x49, 0x46, 0x46) &&
        HasSignature(bytes, 8, 0x57, 0x45, 0x42, 0x50))
        return (WebpFormat.Instance, "webp");

    // TIFF comes in little-endian ("II*\0") and big-endian ("MM\0*") flavours
    if (HasSignature(bytes, 0, 0x49, 0x49, 0x2A, 0x00) ||
        HasSignature(bytes, 0, 0x4D, 0x4D, 0x00, 0x2A))
        return (TiffFormat.Instance, "tiff");

    if (HasSignature(bytes, 0, 0x42, 0x4D)) // BMP ("BM")
        return (BmpFormat.Instance, "bmp");

    // If none of the known formats are detected, fall back to Image.DetectFormat()
    try
    {
        IImageFormat? format = Image.DetectFormat(bytes);
        if (format == null)
            return (null, null);

        // Use the MIME subtype (e.g. "image/png" -> "png") as the extension
        string extension = format.DefaultMimeType?.Split('/')[1] ?? "png";
        return (format, extension);
    }
    catch (Exception)
    {
        // Deliberately best-effort: an undetectable format is reported as
        // "unknown" so the caller can apply its own fallback.
        return (null, null);
    }
}

/// <summary>
/// Checks whether <paramref name="data"/> contains <paramref name="signature"/> starting at <paramref name="offset"/>.
/// </summary>
/// <param name="data">The buffer to inspect.</param>
/// <param name="offset">The zero-based index in <paramref name="data"/> where the signature is expected.</param>
/// <param name="signature">The expected bytes.</param>
/// <returns>true if every signature byte matches; false if the buffer is too short or any byte differs.</returns>
private static bool HasSignature(byte[] data, int offset, params byte[] signature)
{
    if (data.Length < offset + signature.Length)
        return false;

    for (int i = 0; i < signature.Length; i++)
    {
        if (data[offset + i] != signature[i])
            return false;
    }

    return true;
}
/// <summary>
/// Creates a PDF from images

View File

@@ -12,10 +12,10 @@ public class ExtractTests
public void Extract_Pdf()
{
var logger = new TestLogger();
var args = new NodeParameters(@"D:\comics\testfiles\fp1.pdf", logger, false, string.Empty, null);
var args = new NodeParameters(@"/home/john/Comics/unprocessed/Ghosts Book 1 Excerpt.pdf", logger, false, string.Empty, null);
var node = new ComicExtractor();
node.DestinationPath = @"D:\comics\converted\pdf";
node.DestinationPath = @"/home/john/Comics/extracted";
if (Directory.Exists(node.DestinationPath))
Directory.Delete(node.DestinationPath, true);
Directory.CreateDirectory(node.DestinationPath);

Binary file not shown.

Binary file not shown.