Optimize and refine how book similarity is calculated (#1356)

Aditya Chandel
2025-10-15 14:45:56 -06:00
committed by GitHub
parent a8424e8592
commit 675bed1a65
4 changed files with 235 additions and 9 deletions

View File

@@ -196,6 +196,12 @@ public class BookMetadataEntity {
@Column(name = "reviews_locked")
private Boolean reviewsLocked = Boolean.FALSE;
@Column(name = "embedding_vector", columnDefinition = "TEXT")
private String embeddingVector;
@Column(name = "embedding_updated_at")
private Instant embeddingUpdatedAt;
@OneToOne(fetch = FetchType.LAZY)
@MapsId
@JoinColumn(name = "book_id")
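
The scheduler below writes the serialized vector into embedding_vector as a JSON array and touches embedding_updated_at only when the vector actually changes, so unchanged books keep their original timestamp.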

View File

@@ -2,6 +2,7 @@ package com.adityachandel.booklore.service.recommender;
import com.adityachandel.booklore.model.dto.BookRecommendationLite;
import com.adityachandel.booklore.model.entity.BookEntity;
import com.adityachandel.booklore.model.entity.BookMetadataEntity;
import com.adityachandel.booklore.service.appsettings.AppSettingService;
import com.adityachandel.booklore.service.BookQueryService;
import jakarta.transaction.Transactional;
@@ -10,8 +11,9 @@ import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.util.List;
import java.util.Set;
import java.time.Instant;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Component
@@ -19,8 +21,8 @@ import java.util.Set;
public class BookRecommendationScheduler {
private final BookQueryService bookQueryService;
private final BookRecommendationService recommendationService;
private final AppSettingService appSettingService;
private final BookVectorService vectorService;
private static final int RECOMMENDATION_LIMIT = 25;
@@ -36,15 +38,68 @@ public class BookRecommendationScheduler {
         List<BookEntity> allBooks = bookQueryService.getAllFullBookEntities();
+        Map<Long, double[]> embeddings = new HashMap<>();
+        List<BookEntity> booksToUpdate = new ArrayList<>();
         for (BookEntity book : allBooks) {
-            try {
-                Set<BookRecommendationLite> recommendations = recommendationService.findSimilarBookIds(book.getId(), RECOMMENDATION_LIMIT);
-                book.setSimilarBooksJson(recommendations);
-            } catch (Exception e) {
-                log.error("Error updating similar books for book ID {}: {}", book.getId(), e.getMessage(), e);
-            }
+            double[] embedding = vectorService.generateEmbedding(book);
+            embeddings.put(book.getId(), embedding);
+            if (book.getMetadata() != null) {
+                String embeddingJson = vectorService.serializeVector(embedding);
+                if (!Objects.equals(book.getMetadata().getEmbeddingVector(), embeddingJson)) {
+                    book.getMetadata().setEmbeddingVector(embeddingJson);
+                    book.getMetadata().setEmbeddingUpdatedAt(Instant.now());
+                }
+            }
         }
         bookQueryService.saveAll(allBooks);
+        for (BookEntity targetBook : allBooks) {
+            try {
+                double[] targetVector = embeddings.get(targetBook.getId());
+                if (targetVector == null) continue;
+                String targetSeries = Optional.ofNullable(targetBook.getMetadata())
+                        .map(BookMetadataEntity::getSeriesName)
+                        .map(String::toLowerCase)
+                        .orElse(null);
+                List<BookVectorService.ScoredBook> candidates = allBooks.stream()
+                        .filter(candidate -> !candidate.getId().equals(targetBook.getId()))
+                        .filter(candidate -> {
+                            String candidateSeries = Optional.ofNullable(candidate.getMetadata())
+                                    .map(BookMetadataEntity::getSeriesName)
+                                    .map(String::toLowerCase)
+                                    .orElse(null);
+                            return targetSeries == null || !targetSeries.equals(candidateSeries);
+                        })
+                        .map(candidate -> {
+                            double[] candidateVector = embeddings.get(candidate.getId());
+                            double similarity = vectorService.cosineSimilarity(targetVector, candidateVector);
+                            return new BookVectorService.ScoredBook(candidate.getId(), similarity);
+                        })
+                        .filter(scored -> scored.getScore() > 0.1)
+                        .collect(Collectors.toList());
+                List<BookVectorService.ScoredBook> topSimilar = vectorService.findTopKSimilar(
+                        targetVector,
+                        candidates,
+                        RECOMMENDATION_LIMIT
+                );
+                Set<BookRecommendationLite> recommendations = topSimilar.stream()
+                        .map(scored -> new BookRecommendationLite(scored.getBookId(), scored.getScore()))
+                        .collect(Collectors.toSet());
+                targetBook.setSimilarBooksJson(recommendations);
+                booksToUpdate.add(targetBook);
+            } catch (Exception e) {
+                log.error("Error updating similar books for book ID {}: {}", targetBook.getId(), e.getMessage(), e);
+            }
+        }
+        bookQueryService.saveAll(booksToUpdate);
         long endTime = System.currentTimeMillis();
         log.info("Completed scheduled task 'updateAllSimilarBooks' at: {}. Duration: {} ms", endTime, endTime - startTime);

View File

@@ -0,0 +1,163 @@
package com.adityachandel.booklore.service.recommender;
import com.adityachandel.booklore.model.entity.AuthorEntity;
import com.adityachandel.booklore.model.entity.BookEntity;
import com.adityachandel.booklore.model.entity.BookMetadataEntity;
import com.adityachandel.booklore.model.entity.CategoryEntity;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@Service
@RequiredArgsConstructor
public class BookVectorService {
private static final ObjectMapper objectMapper = new ObjectMapper();
private static final int VECTOR_DIMENSION = 128;
public double[] generateEmbedding(BookEntity book) {
if (book.getMetadata() == null) {
return new double[VECTOR_DIMENSION];
}
BookMetadataEntity metadata = book.getMetadata();
Map<String, Double> features = new HashMap<>();
if (metadata.getTitle() != null) {
addTextFeatures(features, "title", metadata.getTitle(), 3.0);
}
if (metadata.getAuthors() != null) {
metadata.getAuthors().stream()
.map(AuthorEntity::getName)
.filter(Objects::nonNull)
.forEach(author -> features.put("author_" + author.toLowerCase(), 5.0));
}
if (metadata.getCategories() != null) {
metadata.getCategories().stream()
.map(CategoryEntity::getName)
.filter(Objects::nonNull)
.forEach(cat -> features.put("category_" + cat.toLowerCase(), 4.0));
}
if (metadata.getSeriesName() != null) {
features.put("series_" + metadata.getSeriesName().toLowerCase(), 6.0);
}
if (metadata.getPublisher() != null) {
features.put("publisher_" + metadata.getPublisher().toLowerCase(), 2.0);
}
if (metadata.getDescription() != null) {
addTextFeatures(features, "desc", metadata.getDescription(), 1.0);
}
return featuresToVector(features);
}
private void addTextFeatures(Map<String, Double> features, String prefix, String text, double weight) {
String[] words = text.toLowerCase()
.replaceAll("[^a-z0-9\\s]", " ")
.split("\\s+");
Arrays.stream(words)
.filter(w -> w.length() > 3)
.limit(20)
.forEach(word -> features.merge(prefix + "_" + word, weight, Double::sum));
}
private double[] featuresToVector(Map<String, Double> features) {
double[] vector = new double[VECTOR_DIMENSION];
for (Map.Entry<String, Double> entry : features.entrySet()) {
int index = Math.floorMod(entry.getKey().hashCode(), VECTOR_DIMENSION); // floorMod, unlike Math.abs, cannot yield a negative index when hashCode() is Integer.MIN_VALUE
vector[index] += entry.getValue();
}
double norm = 0.0;
for (double v : vector) {
norm += v * v;
}
norm = Math.sqrt(norm);
if (norm > 0) {
for (int i = 0; i < vector.length; i++) {
vector[i] /= norm;
}
}
return vector;
}
public String serializeVector(double[] vector) {
try {
return objectMapper.writeValueAsString(vector);
} catch (JsonProcessingException e) {
log.error("Error serializing vector", e);
return null;
}
}
public double[] deserializeVector(String vectorJson) {
if (vectorJson == null || vectorJson.isEmpty()) {
return null;
}
try {
return objectMapper.readValue(vectorJson, double[].class);
} catch (JsonProcessingException e) {
log.error("Error deserializing vector", e);
return null;
}
}
public double cosineSimilarity(double[] v1, double[] v2) {
if (v1 == null || v2 == null || v1.length != v2.length) {
return 0.0;
}
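// Vectors produced by featuresToVector() are already L2-normalized, so the plain dot product below equals cosine similarity.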
double dotProduct = 0.0;
for (int i = 0; i < v1.length; i++) {
dotProduct += v1[i] * v2[i];
}
return dotProduct;
}
public List<ScoredBook> findTopKSimilar(double[] targetVector, List<ScoredBook> candidates, int k) {
if (targetVector == null) {
return Collections.emptyList();
}
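// Candidates arrive pre-scored against targetVector, so this only sorts by score and truncates to k.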
return candidates.stream()
.sorted(Comparator.comparingDouble(ScoredBook::getScore).reversed())
.limit(k)
.collect(Collectors.toList());
}
public static class ScoredBook {
private final Long bookId;
private final double score;
public ScoredBook(Long bookId, double score) {
this.bookId = bookId;
this.score = score;
}
public Long getBookId() {
return bookId;
}
public double getScore() {
return score;
}
}
}
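
BookVectorService builds each embedding as a weighted bag of features (series 6.0, authors 5.0, categories 4.0, title words 3.0, publisher 2.0, description words 1.0), folds those features into a fixed 128-slot array with the hashing trick, and L2-normalizes the result, which is what lets cosineSimilarity get away with a plain dot product. A minimal stand-alone sketch of that hash-and-normalize step follows; the feature keys are made up for illustration, the weights mirror the ones above, and the helper class is not part of the commit.

import java.util.HashMap;
import java.util.Map;

// Stand-alone illustration of the featuresToVector() idea: hash each weighted feature
// into one of VECTOR_DIMENSION buckets, accumulate the weights, then L2-normalize.
public class HashedEmbeddingSketch {

    static final int VECTOR_DIMENSION = 128;

    public static void main(String[] args) {
        Map<String, Double> features = new HashMap<>();
        features.put("author_frank herbert", 5.0);        // hypothetical feature keys
        features.put("category_science fiction", 4.0);
        features.put("title_dune", 3.0);

        double[] v = toVector(features);
        System.out.printf("norm = %.3f%n", Math.sqrt(dot(v, v)));   // 1.000 after normalization
        System.out.printf("self-similarity = %.3f%n", dot(v, v));   // a unit vector's cosine with itself is 1
    }

    static double[] toVector(Map<String, Double> features) {
        double[] vector = new double[VECTOR_DIMENSION];
        for (Map.Entry<String, Double> entry : features.entrySet()) {
            int index = Math.floorMod(entry.getKey().hashCode(), VECTOR_DIMENSION); // hashing trick: key -> bucket
            vector[index] += entry.getValue();
        }
        double norm = Math.sqrt(dot(vector, vector));
        if (norm > 0) {
            for (int i = 0; i < vector.length; i++) {
                vector[i] /= norm;
            }
        }
        return vector;
    }

    static double dot(double[] a, double[] b) {
        double sum = 0.0;
        for (int i = 0; i < a.length; i++) {
            sum += a[i] * b[i];
        }
        return sum;
    }
}

Collisions between features in the 128 buckets are possible by design; the hashing trick trades a little accuracy for a fixed-size vector that is cheap to store and compare.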

View File

@@ -0,0 +1,2 @@
ALTER TABLE book_metadata ADD COLUMN IF NOT EXISTS embedding_vector TEXT;
ALTER TABLE book_metadata ADD COLUMN IF NOT EXISTS embedding_updated_at DATETIME;
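
With the migration applied, each book's embedding survives restarts as a JSON array in book_metadata.embedding_vector, with embedding_updated_at recording when it last changed. The sketch below shows just the Jackson round-trip that serializeVector and deserializeVector perform on that column's value; the class name and the three-value vector are illustrative.

import com.fasterxml.jackson.databind.ObjectMapper;

// Stand-alone illustration of what the embedding_vector TEXT column holds: the embedding as a JSON array.
public class VectorRoundTripSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();

        double[] embedding = {0.6, 0.8, 0.0};                 // toy vector; the real ones have 128 values
        String json = mapper.writeValueAsString(embedding);   // "[0.6,0.8,0.0]" -- what gets persisted
        double[] restored = mapper.readValue(json, double[].class);

        System.out.println(json);
        System.out.println(restored.length == embedding.length);  // true: the column round-trips losslessly
    }
}

Both columns are nullable, so existing rows simply pick up a vector and a timestamp the first time the scheduled task runs after the upgrade.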