fix(metadata): cover no longer fails to extract with invalid zip files (#2012)

Previously, if any file in the epub was invalidly packed into the zip
container, cover extraction would fail because the EpubReader tried to
load everything in the file into memory.

This patch fixes this by making the EpubReader lazy-load all of the
files in the epub, so we only actually read the strictly necessary
files to find the cover. This should also improve loading times ever so
slightly by not reading everything in for no reason.
This commit is contained in:
MightyHalfGinger
2025-12-30 10:49:50 -08:00
committed by GitHub
parent 11e64904a5
commit bfb77ecee0

View File

@@ -2,6 +2,9 @@ package com.adityachandel.booklore.service.metadata.extractor;
import com.adityachandel.booklore.model.dto.BookMetadata;
import io.documentnode.epub4j.domain.Book;
import io.documentnode.epub4j.domain.MediaType;
import io.documentnode.epub4j.domain.MediaTypes;
import io.documentnode.epub4j.domain.Resource;
import io.documentnode.epub4j.epub.EpubReader;
import lombok.extern.slf4j.Slf4j;
import net.lingala.zip4j.ZipFile;
@@ -20,13 +23,16 @@ import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@@ -38,39 +44,54 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
private static final Pattern YEAR_ONLY_PATTERN = Pattern.compile("^\\d{4}$");
private static final String OPF_NS = "http://www.idpf.org/2007/opf";
// Every media type that epub4j knows about, handed to the reader so resources can be lazy-loaded.
// A null entry is added as well to handle files without extensions, such as mimetype.
// The reference is final so it cannot be reassigned after class initialization.
private static final List<MediaType> MEDIA_TYPES = new ArrayList<>();
static {
    for (MediaType mediaType : MediaTypes.mediaTypes) {
        MEDIA_TYPES.add(mediaType);
    }
    MEDIA_TYPES.add(null);
}
/**
 * Extracts the cover image bytes from an EPUB file.
 *
 * <p>The lookups are attempted in order: the cover image reported by the epub4j reader
 * (accepted only when it is an image resource), the href returned by
 * {@code findCoverImageHrefInOpf}, and finally any resource whose id or href contains
 * "cover" (case-insensitive) and is an image.
 *
 * @param epubFile the EPUB file to read
 * @return the cover image bytes, or {@code null} when no cover is found or an exception
 *         is thrown while reading (the exception is logged at warn level)
 */
// NOTE(review): this hunk is rendered without +/- markers, so lines from the previous
// implementation (FileInputStream/readEpub and the `coverImage` local) appear interleaved
// with the new lazy-loading lines; confirm against the actual file before relying on it.
@Override
public byte[] extractCover(File epubFile) {
try (FileInputStream fis = new FileInputStream(epubFile)) {
Book epub = new EpubReader().readEpub(fis);
io.documentnode.epub4j.domain.Resource coverImage = epub.getCoverImage();
try (ZipFile zip = new ZipFile(epubFile)) {
Book epub = new EpubReader().readEpubLazy(zip, "UTF-8", MEDIA_TYPES);
if (coverImage == null) {
String coverHref = findCoverImageHrefInOpf(epubFile);
if (coverHref != null) {
byte[] data = extractFileFromZip(epubFile, coverHref);
if (data != null) return data;
// First we read the cover image from the epub4j reader.
// We filter to only images since it will default to the first page.
byte[] image = getImageFromEpubResource(epub.getCoverImage());
if (image != null) {
return image;
}
// We fall back to reading the image based on the cover-image property.
String coverHref = findCoverImageHrefInOpf(epubFile);
if (coverHref != null) {
image = extractFileFromZip(epubFile, coverHref);
if (image != null) {
return image;
}
}
if (coverImage == null) {
for (io.documentnode.epub4j.domain.Resource res : epub.getResources().getAll()) {
String id = res.getId();
String href = res.getHref();
if ((id != null && id.toLowerCase().contains("cover")) ||
(href != null && href.toLowerCase().contains("cover"))) {
if (res.getMediaType() != null && res.getMediaType().getName().startsWith("image")) {
coverImage = res;
break;
}
// As a last resort we look at all of the files in the epub for something cover related.
for (Resource res : epub.getResources().getAll()) {
String id = res.getId();
String href = res.getHref();
if ((id != null && id.toLowerCase().contains("cover")) ||
(href != null && href.toLowerCase().contains("cover"))) {
image = getImageFromEpubResource(res);
if (image != null) {
return image;
}
}
}
return (coverImage != null) ? coverImage.getData() : null;
} catch (Exception e) {
log.warn("Failed to extract cover from EPUB: {}", epubFile.getName(), e);
return null;
}
return null;
}
@Override
@@ -332,6 +353,24 @@ public class EpubMetadataExtractor implements FileMetadataExtractor {
return null;
}
/**
 * Returns the raw bytes of an EPUB resource, but only when that resource is an image.
 *
 * @param res the resource to read; may be {@code null}
 * @return the resource data, or {@code null} when {@code res} is {@code null}, has no
 *         media type, has a media type whose name does not start with "image", or
 *         cannot be read (the {@link IOException} is logged at warn level)
 */
private byte[] getImageFromEpubResource(Resource res) {
    MediaType mediaType = (res == null) ? null : res.getMediaType();
    boolean isImage = mediaType != null && mediaType.getName().startsWith("image");
    if (!isImage) {
        return null;
    }
    try {
        return res.getData();
    } catch (IOException readFailure) {
        log.warn("Failed to read data for resource", readFailure);
        return null;
    }
}
private String findCoverImageHrefInOpf(File epubFile) {
try (ZipFile zip = new ZipFile(epubFile)) {
DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();