fix(metadata): improve ComicInfo.xml detection and normalize fallback titles (#2080)

Signed-off-by: Balázs Szücs <bszucs1209@gmail.com>
This commit is contained in:
Balázs Szücs 2026-01-01 08:51:39 +01:00 committed by GitHub
parent f3299915f5
commit 52329e659c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 270 additions and 17 deletions

View File

@ -44,11 +44,12 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
@Override
public BookMetadata extractMetadata(File file) {
String baseName = FilenameUtils.getBaseName(file.getName());
String processedBaseName = processFilename(baseName);
String lowerName = file.getName().toLowerCase();
// Non-archive (fallback)
if (!lowerName.endsWith(".cbz") && !lowerName.endsWith(".cbr") && !lowerName.endsWith(".cb7")) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
// CBZ path (ZIP)
@ -56,15 +57,15 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
try (ZipFile zipFile = new ZipFile(file)) {
ZipEntry entry = findComicInfoEntry(zipFile);
if (entry == null) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
try (InputStream is = zipFile.getInputStream(entry)) {
Document document = buildSecureDocument(is);
return mapDocumentToMetadata(document, baseName);
return mapDocumentToMetadata(document, processedBaseName);
}
} catch (Exception e) {
log.warn("Failed to extract metadata from CBZ", e);
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
}
@ -73,19 +74,19 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
try (SevenZFile sevenZ = SevenZFile.builder().setFile(file).get()) {
SevenZArchiveEntry entry = findSevenZComicInfoEntry(sevenZ);
if (entry == null) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
byte[] xmlBytes = readSevenZEntryBytes(sevenZ, entry);
if (xmlBytes == null) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
try (InputStream is = new ByteArrayInputStream(xmlBytes)) {
Document document = buildSecureDocument(is);
return mapDocumentToMetadata(document, baseName);
return mapDocumentToMetadata(document, processedBaseName);
}
} catch (Exception e) {
log.warn("Failed to extract metadata from CB7", e);
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
}
@ -94,23 +95,23 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
try {
FileHeader header = findComicInfoHeader(archive);
if (header == null) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
byte[] xmlBytes = readRarEntryBytes(archive, header);
if (xmlBytes == null) {
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
try (InputStream is = new ByteArrayInputStream(xmlBytes)) {
Document document = buildSecureDocument(is);
return mapDocumentToMetadata(document, baseName);
return mapDocumentToMetadata(document, processedBaseName);
}
} catch (Exception e) {
log.warn("Failed to extract metadata from CBR", e);
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
} catch (Exception ignore) {
}
return BookMetadata.builder().title(baseName).build();
return BookMetadata.builder().title(processedBaseName).build();
}
private ZipEntry findComicInfoEntry(ZipFile zipFile) {
@ -118,7 +119,7 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
while (entries.hasMoreElements()) {
ZipEntry entry = entries.nextElement();
String name = entry.getName();
if ("comicinfo.xml".equalsIgnoreCase(name)) {
if (isComicInfoName(name)) {
return entry;
}
}
@ -583,8 +584,7 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
for (FileHeader fh : archive.getFileHeaders()) {
String name = fh.getFileName();
if (name == null) continue;
String base = baseName(name);
if ("comicinfo.xml".equalsIgnoreCase(base)) {
if (isComicInfoName(name)) {
return fh;
}
}
@ -674,7 +674,7 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
for (SevenZArchiveEntry e : sevenZ.getEntries()) {
if (e == null || e.isDirectory()) continue;
String name = e.getName();
if (name != null && "ComicInfo.xml".equalsIgnoreCase(name)) {
if (name != null && isComicInfoName(name)) {
return e;
}
}
@ -797,4 +797,17 @@ public class CbxMetadataExtractor implements FileMetadataExtractor {
}
return null;
}
/**
 * Builds a display-friendly fallback title from a file's base name.
 *
 * <p>Every underscore and hyphen is replaced by a single space, and leading and
 * trailing whitespace is then trimmed. Runs of separators are not collapsed, so
 * {@code "a_-b"} becomes {@code "a  b"}.
 *
 * @param baseName file name without its extension; must not be {@code null}
 * @return the base name with {@code '_'} and {@code '-'} turned into spaces, trimmed
 */
private String processFilename(String baseName) {
    // Plain character replacement: same result as the "[_\\-]" regex, without
    // compiling a Pattern on every call.
    return baseName.replace('_', ' ').replace('-', ' ').trim();
}
/**
 * Tells whether an archive entry name denotes a {@code ComicInfo.xml} file.
 *
 * <p>The match is case-insensitive and accepts the file at the archive root or
 * inside any directory. Backslashes are treated as path separators. Names that
 * end with a separator are rejected.
 *
 * @param name entry name as stored in the archive; may be {@code null}
 * @return {@code true} if the last path segment is {@code comicinfo.xml}
 *         (ignoring case), {@code false} otherwise or when {@code name} is {@code null}
 */
private static boolean isComicInfoName(String name) {
    if (name == null) return false;
    // Normalise Windows-style separators so a single '/' check covers both forms.
    String n = name.replace('\\', '/');
    if (n.endsWith("/")) return false;
    // Locale.ROOT keeps the comparison independent of the JVM default locale;
    // under e.g. a Turkish locale, "I" would otherwise lower-case to a dotless "ı"
    // and "COMICINFO.XML" would not be recognised.
    String lower = n.toLowerCase(java.util.Locale.ROOT);
    return "comicinfo.xml".equals(lower) || lower.endsWith("/comicinfo.xml");
}
}

View File

@ -0,0 +1,240 @@
package com.adityachandel.booklore.service.metadata.extractor;
import com.adityachandel.booklore.model.dto.BookMetadata;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDate;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import java.awt.Color;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Exercises {@link CbxMetadataExtractor} against small CBZ archives built on the fly.
 *
 * <p>Covers locating {@code ComicInfo.xml} by exact name, by differently-cased name and
 * inside a subdirectory, mapping of its fields onto {@link BookMetadata}, and the
 * file-name-derived title used when the archive carries no {@code ComicInfo.xml}.
 */
class ComicInfoParsingIssuesTest {

    private CbxMetadataExtractor extractor;
    private Path tempDir;

    @BeforeEach
    void setUp() throws IOException {
        extractor = new CbxMetadataExtractor();
        tempDir = Files.createTempDirectory("comicinfo_test_");
    }

    /**
     * Removes the per-test temporary directory and everything in it.
     *
     * @throws IOException if the directory tree cannot be walked
     */
    @AfterEach
    void tearDown() throws IOException {
        if (tempDir == null) {
            return;
        }
        // Files.walk holds an open directory handle; close it deterministically.
        try (java.util.stream.Stream<Path> paths = Files.walk(tempDir)) {
            // Reverse order visits children before their parent directories.
            paths.sorted(Comparator.reverseOrder()).forEach(path -> {
                try {
                    Files.deleteIfExists(path);
                } catch (IOException | SecurityException ignored) {
                    // Best-effort cleanup: a leftover temp file must not fail the test.
                }
            });
        }
    }

    @Test
    void testComicInfoExtractionFromEmbeddedXml() throws Exception {
        String xml = "<ComicInfo>" +
                "  <Title>Daredevil #1</Title>" +
                "  <Series>Daredevil</Series>" +
                "  <Number>1</Number>" +
                "  <Year>1964</Year>" +
                "  <Publisher>Marvel</Publisher>" +
                "  <Writer>Stan Lee</Writer>" +
                "  <Penciller>Joe Orlando</Penciller>" +
                "</ComicInfo>";

        File cbz = createComicCbz("daredevil_1964.cbz", "ComicInfo.xml", xml, Color.RED);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        // Verify that the metadata was properly extracted from ComicInfo.xml
        assertEquals("Daredevil #1", metadata.getTitle());
        assertEquals("Daredevil", metadata.getSeriesName());
        assertEquals(1.0f, metadata.getSeriesNumber());
        assertEquals(Integer.valueOf(1964), metadata.getPublishedDate().getYear());
        assertEquals("Marvel", metadata.getPublisher());
        assertTrue(metadata.getAuthors().contains("Stan Lee"));
        assertTrue(metadata.getAuthors().contains("Joe Orlando"));
    }

    @Test
    void testComicInfoExtractionWithDifferentCase() throws Exception {
        String xml = "<ComicInfo>" +
                "  <Title>Daredevil #12</Title>" +
                "  <Series>Daredevil</Series>" +
                "  <Number>12</Number>" +
                "  <Year>1966</Year>" +
                "</ComicInfo>";

        // Entry name is all lowercase on purpose.
        File cbz = createComicCbz("daredevil_1966.cbz", "comicinfo.xml", xml, Color.BLUE);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        assertEquals("Daredevil #12", metadata.getTitle());
        assertEquals("Daredevil", metadata.getSeriesName());
        assertEquals(12.0f, metadata.getSeriesNumber());
    }

    @Test
    void testComicInfoExtractionWithPathInName() throws Exception {
        String xml = "<ComicInfo>" +
                "  <Title>Daredevil #200</Title>" +
                "  <Series>Daredevil</Series>" +
                "  <Number>200</Number>" +
                "  <Year>1985</Year>" +
                "</ComicInfo>";

        // ComicInfo.xml lives in a subdirectory of the archive.
        File cbz = createComicCbz("daredevil_1985.cbz", "metadata/ComicInfo.xml", xml, Color.GREEN);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        assertEquals("Daredevil #200", metadata.getTitle());
        assertEquals("Daredevil", metadata.getSeriesName());
        assertEquals(200.0f, metadata.getSeriesNumber());
    }

    @Test
    void testComicInfoExtractionFallbackWhenNoComicInfo() throws Exception {
        Map<String, byte[]> entries = new LinkedHashMap<>();
        entries.put("page1.jpg", createTestImage(Color.YELLOW));
        entries.put("page2.jpg", createTestImage(Color.CYAN));
        File cbz = createCbz("Daredevil_v1964_c001.cbz", entries);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        // Without ComicInfo.xml the title comes from the file name, underscores as spaces.
        assertEquals("Daredevil v1964 c001", metadata.getTitle());
    }

    @Test
    void testComicInfoWithExtendedFields() throws Exception {
        String xml = "<ComicInfo>" +
                "  <Title>Daredevil #500</Title>" +
                "  <Series>Daredevil</Series>" +
                "  <Number>500</Number>" +
                "  <Count>800</Count>" +
                "  <Year>2004</Year>" +
                "  <Month>10</Month>" +
                "  <Day>15</Day>" +
                "  <Publisher>Marvel Comics</Publisher>" +
                "  <Genre>Superhero</Genre>" +
                "  <Tags>Marvel;Daredevil;Superhero;Frank Miller</Tags>" +
                "  <Summary>Special anniversary issue</Summary>" +
                "  <PageCount>32</PageCount>" +
                "  <LanguageISO>en</LanguageISO>" +
                "  <Writer>Frank Miller</Writer>" +
                "  <Penciller>John Romita Jr.</Penciller>" +
                "  <Inker>Scott Hanna</Inker>" +
                "  <Colorist>Steve Oliff</Colorist>" +
                "  <Letterer>Joe Rosen</Letterer>" +
                "  <CoverArtist>Frank Miller</CoverArtist>" +
                "</ComicInfo>";

        File cbz = createComicCbz("daredevil_anniversary.cbz", "ComicInfo.xml", xml, Color.MAGENTA);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        assertEquals("Daredevil #500", metadata.getTitle());
        assertEquals("Daredevil", metadata.getSeriesName());
        assertEquals(500.0f, metadata.getSeriesNumber());
        assertEquals(Integer.valueOf(800), metadata.getSeriesTotal());
        assertEquals(LocalDate.of(2004, 10, 15), metadata.getPublishedDate());
        assertEquals("Marvel Comics", metadata.getPublisher());
        assertEquals("en", metadata.getLanguage());
        assertEquals(Integer.valueOf(32), metadata.getPageCount());
        assertEquals("Special anniversary issue", metadata.getDescription());

        assertTrue(metadata.getAuthors().contains("Frank Miller"));
        assertTrue(metadata.getAuthors().contains("John Romita Jr."));
        assertTrue(metadata.getAuthors().contains("Scott Hanna"));
        assertTrue(metadata.getAuthors().contains("Steve Oliff"));
        assertTrue(metadata.getAuthors().contains("Joe Rosen"));

        assertTrue(metadata.getCategories().contains("Marvel"));
        assertTrue(metadata.getCategories().contains("Daredevil"));
        assertTrue(metadata.getCategories().contains("Superhero"));
        assertTrue(metadata.getCategories().contains("Frank Miller"));
    }

    @Test
    void testComicInfoWithSpecialCharacters() throws Exception {
        String xml = "<ComicInfo>" +
                "  <Title>Daredevil: The Man Without Fear #1</Title>" +
                "  <Series>Daredevil: The Man Without Fear</Series>" +
                "  <Number>1</Number>" +
                "  <Year>1993</Year>" +
                "  <Summary>Daredevil's origin story reimagined</Summary>" +
                "</ComicInfo>";

        File cbz = createComicCbz("daredevil_origin.cbz", "ComicInfo.xml", xml, Color.ORANGE);

        BookMetadata metadata = extractor.extractMetadata(cbz);

        assertEquals("Daredevil: The Man Without Fear #1", metadata.getTitle());
        assertEquals("Daredevil: The Man Without Fear", metadata.getSeriesName());
        assertEquals(1.0f, metadata.getSeriesNumber());
        assertEquals("Daredevil's origin story reimagined", metadata.getDescription());
    }

    /**
     * Creates a CBZ holding one ComicInfo document followed by a single page image.
     *
     * @param archiveName        file name of the archive inside the temp directory
     * @param comicInfoEntryName entry name under which the XML is stored
     * @param comicInfoXml       XML content, written as UTF-8
     * @param pageColor          fill colour of the generated {@code page1.jpg}
     * @return the created archive
     * @throws IOException if the image or the archive cannot be written
     */
    private File createComicCbz(String archiveName, String comicInfoEntryName,
                                String comicInfoXml, Color pageColor) throws IOException {
        // LinkedHashMap keeps the ComicInfo entry ahead of the page in the archive.
        Map<String, byte[]> entries = new LinkedHashMap<>();
        entries.put(comicInfoEntryName, comicInfoXml.getBytes(StandardCharsets.UTF_8));
        entries.put("page1.jpg", createTestImage(pageColor));
        return createCbz(archiveName, entries);
    }

    /**
     * Writes a ZIP archive into the temp directory.
     *
     * @param name    file name of the archive
     * @param entries entry name to content, written in the map's iteration order
     * @return the created archive
     * @throws IOException if the archive cannot be written
     */
    private File createCbz(String name, Map<String, byte[]> entries) throws IOException {
        Path out = tempDir.resolve(name);
        try (ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(out.toFile()))) {
            for (Map.Entry<String, byte[]> e : entries.entrySet()) {
                ZipEntry ze = new ZipEntry(e.getKey());
                ze.setTime(0L);
                zos.putNextEntry(ze);
                try (InputStream is = new ByteArrayInputStream(e.getValue())) {
                    is.transferTo(zos);
                }
                zos.closeEntry();
            }
        }
        return out.toFile();
    }

    /**
     * Encodes a 10x10 single-colour image as JPEG.
     *
     * @param color fill colour
     * @return the encoded image bytes
     * @throws IOException if encoding fails or no writer for the format is available
     */
    private byte[] createTestImage(Color color) throws IOException {
        BufferedImage image = new BufferedImage(10, 10, BufferedImage.TYPE_INT_RGB);
        int rgb = color.getRGB();
        for (int x = 0; x < 10; x++) {
            for (int y = 0; y < 10; y++) {
                image.setRGB(x, y, rgb);
            }
        }
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        // ImageIO.write reports a missing writer via its return value, not an exception.
        if (!ImageIO.write(image, "jpg", baos)) {
            throw new IOException("No ImageIO writer available for format: jpg");
        }
        return baos.toByteArray();
    }
}