fix(epub): fix and improve cover image extraction from EPUB files (specifically, add support for the EPUB 3 specification) (#1773)

* fix(epub): improve cover image extraction from EPUB files

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

* fix(epub): enhance cover image extraction and path resolution

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>

---------

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs
2025-12-07 04:17:51 +01:00
committed by GitHub
parent d336cde261
commit 7b5128a511
3 changed files with 164 additions and 37 deletions

View File

@@ -14,14 +14,13 @@ import com.adityachandel.booklore.service.metadata.MetadataMatchService;
import com.adityachandel.booklore.service.metadata.extractor.EpubMetadataExtractor;
import com.adityachandel.booklore.util.FileService;
import com.adityachandel.booklore.util.FileUtils;
import io.documentnode.epub4j.domain.Resource;
import io.documentnode.epub4j.epub.EpubReader;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.time.Instant;
import java.util.List;
import java.util.Set;
@@ -63,34 +62,28 @@ public class EpubProcessor extends AbstractFileProcessor implements BookFileProc
public boolean generateCover(BookEntity bookEntity) {
try {
File epubFile = new File(FileUtils.getBookFullPath(bookEntity));
io.documentnode.epub4j.domain.Book epub;
try (FileInputStream fis = new FileInputStream(epubFile)) {
epub = new EpubReader().readEpub(fis);
}
Resource coverImage = epub.getCoverImage();
byte[] coverData = epubMetadataExtractor.extractCover(epubFile);
if (coverImage == null) {
for (Resource res : epub.getResources().getAll()) {
String id = res.getId();
String href = res.getHref();
if ((id != null && id.toLowerCase().contains("cover")) ||
(href != null && href.toLowerCase().contains("cover"))) {
if (res.getMediaType() != null && res.getMediaType().getName().startsWith("image")) {
coverImage = res;
break;
}
}
}
}
if (coverImage == null) {
if (coverData == null) {
log.warn("No cover image found in EPUB '{}'", bookEntity.getFileName());
return false;
}
boolean saved = saveCoverImage(coverImage, bookEntity.getId());
bookEntity.getMetadata().setCoverUpdatedOn(Instant.now());
bookMetadataRepository.save(bookEntity.getMetadata());
boolean saved;
try (ByteArrayInputStream bais = new ByteArrayInputStream(coverData)) {
BufferedImage originalImage = ImageIO.read(bais);
if (originalImage == null) {
log.warn("Cover image found but could not be decoded (possibly SVG or unsupported format) in EPUB '{}'", bookEntity.getFileName());
return false;
}
saved = fileService.saveCoverImages(originalImage, bookEntity.getId());
originalImage.flush();
}
if (saved) {
bookEntity.getMetadata().setCoverUpdatedOn(Instant.now());
bookMetadataRepository.save(bookEntity.getMetadata());
}
return saved;
} catch (Exception e) {
@@ -148,15 +141,4 @@ public class EpubProcessor extends AbstractFileProcessor implements BookFileProc
bookCreatorService.addCategoriesToBook(validSubjects, bookEntity);
}
}
/**
 * Decodes the given EPUB cover resource and hands the image to the file service.
 *
 * @param coverImage EPUB resource holding the raw cover bytes
 * @param bookId     id of the book the cover belongs to
 * @return {@code false} when the bytes cannot be decoded into an image; otherwise the
 *         result of {@code fileService.saveCoverImages}
 * @throws IOException if the resource data cannot be read or decoding fails
 */
private boolean saveCoverImage(Resource coverImage, long bookId) throws IOException {
    BufferedImage originalImage = ImageIO.read(new ByteArrayInputStream(coverImage.getData()));
    // ImageIO.read returns null when no registered reader understands the data
    // (for example an SVG cover); do not pass that null on to the file service.
    if (originalImage == null) {
        log.warn("Cover image for book {} could not be decoded (possibly SVG or unsupported format)", bookId);
        return false;
    }
    try {
        return fileService.saveCoverImages(originalImage, bookId);
    } finally {
        originalImage.flush(); // Release resources after processing
    }
}
}

View File

@@ -41,6 +41,14 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
Book epub = new EpubReader().readEpub(fis);
io.documentnode.epub4j.domain.Resource coverImage = epub.getCoverImage();
if (coverImage == null) {
String coverHref = findCoverImageHrefInOpf(epubFile);
if (coverHref != null) {
byte[] data = extractFileFromZip(epubFile, coverHref);
if (data != null) return data;
}
}
if (coverImage == null) {
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
String id = res.getId();
@@ -297,4 +305,82 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
log.warn("Failed to parse date from string: {}", value);
return null;
}
/**
 * Locates the cover image declared in the package document (OPF) through a manifest
 * item whose {@code properties} attribute carries the {@code cover-image} token.
 *
 * <p>The OPF location is taken from the first {@code rootfile} element of
 * {@code META-INF/container.xml}; the item's {@code href} is then resolved against
 * the OPF location via {@code resolvePath}.
 *
 * @param epubFile the EPUB archive to inspect
 * @return the resolved path of the cover image inside the archive, or {@code null}
 *         when no such item is declared or the archive cannot be read/parsed
 */
private String findCoverImageHrefInOpf(File epubFile) {
    try (ZipFile zip = new ZipFile(epubFile)) {
        DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
        // EPUB content is untrusted: explicitly switch off external entity and DTD
        // loading. Best-effort, so a parser that lacks a feature does not disable
        // cover detection altogether.
        String[] externalAccessFeatures = {
                "http://xml.org/sax/features/external-general-entities",
                "http://xml.org/sax/features/external-parameter-entities",
                "http://apache.org/xml/features/nonvalidating/load-external-dtd"
        };
        for (String feature : externalAccessFeatures) {
            try {
                dbf.setFeature(feature, false);
            } catch (javax.xml.parsers.ParserConfigurationException unsupported) {
                log.debug("XML parser does not support feature {}", feature);
            }
        }
        DocumentBuilder builder = dbf.newDocumentBuilder();

        FileHeader containerHdr = zip.getFileHeader("META-INF/container.xml");
        if (containerHdr == null) return null;

        String opfPath;
        try (InputStream cis = zip.getInputStream(containerHdr)) {
            Document containerDoc = builder.parse(cis);
            // Match on local name so prefixed elements are found as well.
            NodeList roots = containerDoc.getElementsByTagNameNS("*", "rootfile");
            if (roots.getLength() == 0) return null;
            opfPath = ((Element) roots.item(0)).getAttribute("full-path");
        }
        if (StringUtils.isBlank(opfPath)) return null;

        FileHeader opfHdr = zip.getFileHeader(opfPath);
        if (opfHdr == null) return null;

        try (InputStream in = zip.getInputStream(opfHdr)) {
            Document doc = builder.parse(in);
            NodeList manifestItems = doc.getElementsByTagNameNS("*", "item");
            for (int i = 0; i < manifestItems.getLength(); i++) {
                Element item = (Element) manifestItems.item(i);
                // getAttribute never returns null; "properties" is a whitespace-separated
                // token list, so compare whole tokens rather than substrings.
                String properties = item.getAttribute("properties").trim();
                if (properties.isEmpty()) continue;
                if (java.util.Arrays.asList(properties.split("\\s+")).contains("cover-image")) {
                    String resolved = resolvePath(opfPath, item.getAttribute("href"));
                    // An item without a usable href cannot be the cover; keep looking.
                    if (resolved != null) return resolved;
                }
            }
        }
    } catch (Exception e) {
        // Deliberately best-effort: the caller falls back to other cover heuristics.
        log.debug("Failed to find cover image in OPF: {}", e.getMessage());
    }
    return null;
}
/**
 * Resolves a manifest {@code href} against the directory of the OPF file and returns
 * the corresponding entry path inside the archive.
 *
 * <p>An href starting with {@code /} is treated as relative to the archive root.
 * Empty, {@code .} and {@code ..} segments are normalised; a {@code ..} that would
 * climb above the root is dropped. Percent-encoded href segments are decoded because
 * archive entry names are stored unescaped.
 *
 * @param opfPath path of the OPF file inside the archive
 * @param href    the {@code href} attribute of a manifest item
 * @return the normalised entry path, or {@code null} if {@code href} is null or empty
 */
private String resolvePath(String opfPath, String href) {
    if (href == null || href.isEmpty()) return null;

    int lastSlash = opfPath.lastIndexOf('/');
    // A root-relative href ignores the OPF directory entirely.
    String baseDir = (href.startsWith("/") || lastSlash == -1) ? "" : opfPath.substring(0, lastSlash);

    java.util.Deque<String> parts = new java.util.ArrayDeque<>();
    for (String segment : baseDir.split("/")) {
        applyPathSegment(parts, segment);
    }
    for (String segment : href.split("/")) {
        applyPathSegment(parts, decodeHrefSegment(segment));
    }
    return String.join("/", parts);
}

/** Pushes one path segment, interpreting {@code .}, {@code ..} and empty segments. */
private static void applyPathSegment(java.util.Deque<String> parts, String segment) {
    if (segment.equals("..")) {
        parts.pollLast(); // no-op when already at the root
    } else if (!segment.isEmpty() && !segment.equals(".")) {
        parts.addLast(segment);
    }
}

/** Percent-decodes one href segment as UTF-8, returning it unchanged if it is malformed. */
private static String decodeHrefSegment(String segment) {
    if (segment.indexOf('%') < 0) return segment;
    try {
        // URLDecoder performs form decoding and would turn '+' into a space; shield it.
        return java.net.URLDecoder.decode(segment.replace("+", "%2B"), java.nio.charset.StandardCharsets.UTF_8);
    } catch (IllegalArgumentException malformedEscape) {
        // A literal '%' in a file name is not a valid escape; use the raw text.
        return segment;
    }
}
/**
 * Reads a single entry of the EPUB archive fully into memory.
 *
 * @param epubFile the EPUB archive
 * @param path     entry name inside the archive
 * @return the entry's bytes, or {@code null} if the entry does not exist or reading fails
 */
private byte[] extractFileFromZip(File epubFile, String path) {
    try (ZipFile zip = new ZipFile(epubFile)) {
        FileHeader header = zip.getFileHeader(path);
        if (header == null) return null;
        try (InputStream is = zip.getInputStream(header)) {
            return is.readAllBytes();
        }
    } catch (Exception e) {
        // Best-effort: callers treat null as "not found". Pass the exception to the
        // logger so the cause is not lost.
        log.warn("Failed to extract file {} from zip", path, e);
        return null;
    }
}
}

View File

@@ -412,6 +412,20 @@ class EpubMetadataExtractorTest {
assertNull(cover);
}
/**
 * The manifest item is flagged as the cover only through {@code properties="cover-image"};
 * neither its id nor its href mentions "cover".
 */
@Test
@DisplayName("Should extract cover declared with properties='cover-image' even if ID/href doesn't contain 'cover'")
void extractCover_propertiesCoverImage_returnsCoverBytes() throws IOException {
    // Identifiers chosen so that name-based cover heuristics cannot match.
    String manifestId = "image123";
    String manifestHref = "images/img001.png";
    byte[] expectedBytes = createMinimalPngImage();
    File epub = createEpubWithPropertiesCover(expectedBytes, manifestId, manifestHref);

    byte[] actualBytes = extractor.extractCover(epub);

    assertNotNull(actualBytes, "Cover should be extracted");
    assertTrue(actualBytes.length > 0);
    assertEquals(expectedBytes.length, actualBytes.length);
}
}
@Nested
@@ -662,5 +676,50 @@ class EpubMetadataExtractorTest {
return epubFile;
}
/**
 * Writes a minimal EPUB 3 archive into the temp directory whose only manifest item is
 * an image marked with {@code properties="cover-image"}.
 *
 * @param coverImageData bytes stored as the image entry
 * @param id             manifest item id
 * @param href           manifest item href, relative to the {@code OEBPS/} directory
 * @return the generated EPUB file
 * @throws IOException if the archive cannot be written
 */
private File createEpubWithPropertiesCover(byte[] coverImageData, String id, String href) throws IOException {
    String containerXml = """
        <?xml version="1.0" encoding="UTF-8"?>
        <container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
        <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
        </rootfiles>
        </container>
        """;
    String opfContent = """
        <?xml version="1.0" encoding="UTF-8"?>
        <package xmlns="http://www.idpf.org/2007/opf" version="3.0">
        <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:title>Book with Properties Cover</dc:title>
        </metadata>
        <manifest>
        <item id="%s" href="%s" media-type="image/png" properties="cover-image"/>
        </manifest>
        </package>
        """.formatted(id, href);

    File epubFile = tempDir.resolve("test-prop-cover-" + System.nanoTime() + ".epub").toFile();

    // Entries in the order they are written; the image sits next to the OPF under OEBPS/.
    String[] entryNames = {
        "mimetype",
        "META-INF/container.xml",
        "OEBPS/content.opf",
        "OEBPS/" + href
    };
    byte[][] entryPayloads = {
        "application/epub+zip".getBytes(StandardCharsets.UTF_8),
        containerXml.getBytes(StandardCharsets.UTF_8),
        opfContent.getBytes(StandardCharsets.UTF_8),
        coverImageData
    };

    try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(epubFile))) {
        for (int i = 0; i < entryNames.length; i++) {
            zos.putNextEntry(new ZipEntry(entryNames[i]));
            zos.write(entryPayloads[i]);
            zos.closeEntry();
        }
    }
    return epubFile;
}
}