FF-1524: tweaked comic parsing

This commit is contained in:
John Andrews
2024-05-09 16:24:39 +12:00
parent 705606fc3e
commit 5fc95f0510
2 changed files with 171 additions and 63 deletions
+143 -61
View File
@@ -267,73 +267,155 @@ public class CreateComicInfo : Node
string shortname = FileHelper.GetShortFileName(libraryFile);
info.Tags = GetTags(ref shortname);
shortname = shortname[..shortname.LastIndexOf('.')];
(shortname, int? year2) = ExtractYear(shortname);
year ??= year2;
//(shortname, int? year2) = ExtractYear(shortname);
// year ??= year2;
// Title - #number (of #) - Issue Title
var ofMatch = Regex.Match(shortname, @"\(of\s+#?(\d+)\)");
if (ofMatch.Success)
{
// Extract the issue number
var ofNumber = int.Parse(ofMatch.Groups[1].Value);
// Remove the issue number from the string
shortname = ofMatch.Groups[0].Success
? shortname.Replace(ofMatch.Value, "").Trim()
: shortname;
// Use the extracted issue number as needed
info.Count = ofNumber;
}
//var issueNumberMatch2 = Regex.Match(shortname, @"(?<!['])[#]?\b(\-?\d{1,3})\b(?!\w)");
ExtractIssueNumber(info, ref shortname);
if (info.Number != null)
ExtractYear(ref shortname);
else
ExtractAnnualInfo(info, ref shortname);
var volMatch = Regex.Match(shortname, @"\b[Vv](?:olume|ol)?\s*(\d+)\b", RegexOptions.IgnoreCase);
if (volMatch.Success)
{
info.Volume = "Volume " + int.Parse(volMatch.Groups[1].Value);
// Remove the issue number from the string
shortname = volMatch.Groups[0].Success
? shortname.Replace(volMatch.Value, "").Trim()
: shortname;
}
// remove any junk
shortname = Regex.Replace(shortname, @"\(([\-]?\d+)\)", "$1").Trim();
shortname = Regex.Replace(shortname, @"\s*\([^)]*\)\s*", " ").Trim();
var parts = shortname.Split(" - ");
if (parts.Length < 2)
{
parts = shortname.Split('#');
if (parts.Length < 2)
{
// remove any junk
shortname = Regex.Replace(shortname, @"\(([\-]?\d+)\)", "$1").Trim();
shortname = Regex.Replace(shortname, @"\s*\([^)]*\)\s*", " ").Trim();
var lastChanceMatch = Regex.Match(shortname, @"([\-]?\d)+$");
if(lastChanceMatch.Success)
{
info.Number = int.Parse(lastChanceMatch.Value);
return info;
}
if (shortname.ToLowerInvariant().Contains("annual") && year != null)
{
info.Volume = "Annual";
info.Number = year.Value;
return info;
}
if (year != null && yearInFolder == false)
{
info.Number = year;
return info;
}
return Result<ComicInfo>.Fail("Invalid filename: " + shortname);
}
parts[1] = '#' + parts[1];
}
var issueNumberMatch = Regex.Match(parts[1], @"(^|#)(?<first>\d+)(?:\s+of\s+[#]?(?<second>\d+))?");
if (issueNumberMatch.Success)
{
info.Number = int.Parse(issueNumberMatch.Groups["first"].Value);
info.Count = issueNumberMatch.Groups["second"].Success
? int.Parse(issueNumberMatch.Groups["second"].Value)
: null;
}
else
{
var volMatch = Regex.Match(parts[1], @"\b[Vv](?:olume|ol)?\s*(\d+)\b", RegexOptions.IgnoreCase);
if (volMatch.Success)
{
info.Volume = "Volume " + int.Parse(volMatch.Groups[1].Value);
}
else
{
logger?.WLog("Issue number not found in: " + parts[1]);
}
}
if (parts.Length > 2)
{
info.Title = Regex.Replace(parts[2], @"\s*\([^)]*\)\s*", " ").Trim();
}
if (Regex.IsMatch(info.Series, "^(other|misc|miscellaneous|assorted)$",
RegexOptions.CultureInvariant | RegexOptions.IgnoreCase))
info.Series = parts[0].Trim();
if (parts.Length > 1)
info.Title = parts.Last();
return info;
}
private static void ExtractIssueNumber(ComicInfo info, ref string shortname)
{
// Define the regex pattern to match numbers up to 3 characters long
string pattern = @"\b(\d{1,3})\b";
// Match against the shortname using the regex pattern
MatchCollection matches = Regex.Matches(shortname, pattern);
// Iterate through the matches
foreach (Match match in matches)
{
int issueNumber;
if (int.TryParse(match.Groups[1].Value, out issueNumber))
{
// Check if the match is preceded by a hyphen
int matchIndex = match.Index;
if (matchIndex > 0 && shortname[matchIndex - 1] == '-')
{
// Check if the hyphen is not part of a word and is preceded by specific characters
if (matchIndex == 1 || shortname[matchIndex - 2] is '#' or ' ' or '(' or '[' or '.' or '_' or '-')
{
// Remove the issue number from the string
shortname = shortname.Remove(matchIndex - 1, match.Length + 1).Trim();
// Use the extracted issue number as needed
info.Number = -issueNumber;
return; // Exit the loop after finding the issue number
}
}
else if (matchIndex == 0 || shortname[matchIndex - 1] is '#' or ' ' or '(' or '[' or '.' or '_' or '-')
{
// Remove the issue number from the string
shortname = shortname.Remove(matchIndex, match.Length).Trim();
// Use the extracted issue number as needed
info.Number = issueNumber;
return; // Exit the loop after finding the issue number
}
}
}
}
private static void ExtractYear(ref string shortname)
{
var yearMatches = Regex.Matches(shortname, @"\b(?:\((19|20)\d{2}\)|\b(19|20)\d{2}\b)\b");
// Iterate over the matches in reverse order
for (int i = yearMatches.Count - 1; i >= 0; i--)
{
var match = yearMatches[i];
// Remove the year from the shortname string
shortname = shortname.Remove(match.Index, match.Length).Trim();
}
}
private static void ExtractAnnualInfo(ComicInfo info, ref string shortname)
{
int year;
// Match the "Annual" pattern followed by a year in various formats
var annualMatch = Regex.Match(shortname, @"\bAnnual\s*(?:['#]?\s*-?\s*)?(\d{2}|\d{4})\b");
if (annualMatch.Success == false)
{
var yearMatches = Regex.Matches(shortname, @"\b(?:\((19|20)\d{2}\)|\b(19|20)\d{2}\b)\b");
if (yearMatches.Count > 0)
{
// Extract the matched year
var lastMatch = yearMatches.Cast<Match>().Last();
year = int.Parse(lastMatch.Value.Trim('(', ')'));
if (info.Series?.Contains(year.ToString()) == true)
return;
info.Number = year;
info.Volume = "Annual";
// Remove the year from the shortname string
shortname = shortname.Remove(lastMatch.Index, lastMatch.Length).Trim().Replace("()", string.Empty);
}
return;
}
string yearString = annualMatch.Groups[1].Value;
if (yearString.Length == 2) // Two-digit year
{
year = int.Parse(yearString);
year += year >= 40 ? 1900 : 2000;
info.Number = year;
}
else if (yearString.Length == 4) // Four-digit year
{
year = int.Parse(yearString);
info.Number = year;
}
info.Volume = "Annual";
// Remove "Annual + year" from the shortname string
shortname = shortname.Replace(annualMatch.Value, "").Trim();
}
/// <summary>
/// Extracts the year from a file
/// </summary>
+28 -2
View File
@@ -145,6 +145,32 @@ public class ComicInfoTests : TestBase
Assert.AreEqual("Cable (1993) - #-01.cbz", name.Value);
}
[TestMethod]
public void Other()
{
var result = CreateComicInfo.GetInfo(Logger,
"/home/john/Comics/Other/Something Random 12 - Random title.cbz",
"/home/john/Comics",
true);
TestContext.WriteLine(Logger.ToString());
Assert.IsFalse(result.IsFailed);
var info = result.Value;
Assert.IsNotNull(info);
Assert.AreEqual("Other", info.Publisher);
Assert.AreEqual("Something Random", info.Series);
Assert.AreEqual(12, info.Number);
Assert.AreEqual("Random title", info.Title);
var xml = CreateComicInfo.SerializeToXml(info);
Assert.IsFalse(string.IsNullOrWhiteSpace(xml));
TestContext.WriteLine(new string('-', 70));
TestContext.WriteLine(xml);
var name = CreateComicInfo.GetNewName(info, "cbz", 3);
Assert.AreEqual("Something Random - #012 - Random title.cbz", name.Value);
}
[TestMethod]
public void NameAndNumber2()
{
@@ -277,7 +303,7 @@ public class ComicInfoTests : TestBase
TestContext.WriteLine(xml);
var name = CreateComicInfo.GetNewName(info, "cbz", 3);
Assert.AreEqual("X-Man - 2004.cbz", name.Value);
Assert.AreEqual("X-Man - Annual 2004.cbz", name.Value);
}
[TestMethod]
@@ -356,7 +382,7 @@ public class ComicInfoTests : TestBase
TestContext.WriteLine(xml);
var name = CreateComicInfo.GetNewName(info, "cbz", 3);
Assert.AreEqual("Grimm Fairy Tales Specials - 2016.cbz", name.Value);
Assert.AreEqual("Grimm Fairy Tales Specials - Annual 2016.cbz", name.Value);
}
// [TestMethod]
public void PhysicalFileTest()