mirror of
https://github.com/adityachandelgit/BookLore.git
synced 2026-01-05 15:59:50 -06:00
Optimize and refine how book similarity is calculated (#1356)
This commit is contained in:
@@ -196,6 +196,12 @@ public class BookMetadataEntity {
|
||||
@Column(name = "reviews_locked")
|
||||
private Boolean reviewsLocked = Boolean.FALSE;
|
||||
|
||||
@Column(name = "embedding_vector", columnDefinition = "TEXT")
|
||||
private String embeddingVector;
|
||||
|
||||
@Column(name = "embedding_updated_at")
|
||||
private Instant embeddingUpdatedAt;
|
||||
|
||||
@OneToOne(fetch = FetchType.LAZY)
|
||||
@MapsId
|
||||
@JoinColumn(name = "book_id")
|
||||
|
||||
@@ -2,6 +2,7 @@ package com.adityachandel.booklore.service.recommender;
|
||||
|
||||
import com.adityachandel.booklore.model.dto.BookRecommendationLite;
|
||||
import com.adityachandel.booklore.model.entity.BookEntity;
|
||||
import com.adityachandel.booklore.model.entity.BookMetadataEntity;
|
||||
import com.adityachandel.booklore.service.appsettings.AppSettingService;
|
||||
import com.adityachandel.booklore.service.BookQueryService;
|
||||
import jakarta.transaction.Transactional;
|
||||
@@ -10,8 +11,9 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.scheduling.annotation.Scheduled;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.time.Instant;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Component
|
||||
@@ -19,8 +21,8 @@ import java.util.Set;
|
||||
public class BookRecommendationScheduler {
|
||||
|
||||
private final BookQueryService bookQueryService;
|
||||
private final BookRecommendationService recommendationService;
|
||||
private final AppSettingService appSettingService;
|
||||
private final BookVectorService vectorService;
|
||||
|
||||
private static final int RECOMMENDATION_LIMIT = 25;
|
||||
|
||||
@@ -36,15 +38,68 @@ public class BookRecommendationScheduler {
|
||||
|
||||
List<BookEntity> allBooks = bookQueryService.getAllFullBookEntities();
|
||||
|
||||
Map<Long, double[]> embeddings = new HashMap<>();
|
||||
List<BookEntity> booksToUpdate = new ArrayList<>();
|
||||
|
||||
for (BookEntity book : allBooks) {
|
||||
try {
|
||||
Set<BookRecommendationLite> recommendations = recommendationService.findSimilarBookIds(book.getId(), RECOMMENDATION_LIMIT);
|
||||
book.setSimilarBooksJson(recommendations);
|
||||
} catch (Exception e) {
|
||||
log.error("Error updating similar books for book ID {}: {}", book.getId(), e.getMessage(), e);
|
||||
double[] embedding = vectorService.generateEmbedding(book);
|
||||
embeddings.put(book.getId(), embedding);
|
||||
|
||||
if (book.getMetadata() != null) {
|
||||
String embeddingJson = vectorService.serializeVector(embedding);
|
||||
if (!Objects.equals(book.getMetadata().getEmbeddingVector(), embeddingJson)) {
|
||||
book.getMetadata().setEmbeddingVector(embeddingJson);
|
||||
book.getMetadata().setEmbeddingUpdatedAt(Instant.now());
|
||||
}
|
||||
}
|
||||
}
|
||||
bookQueryService.saveAll(allBooks);
|
||||
|
||||
for (BookEntity targetBook : allBooks) {
|
||||
try {
|
||||
double[] targetVector = embeddings.get(targetBook.getId());
|
||||
if (targetVector == null) continue;
|
||||
|
||||
String targetSeries = Optional.ofNullable(targetBook.getMetadata())
|
||||
.map(BookMetadataEntity::getSeriesName)
|
||||
.map(String::toLowerCase)
|
||||
.orElse(null);
|
||||
|
||||
List<BookVectorService.ScoredBook> candidates = allBooks.stream()
|
||||
.filter(candidate -> !candidate.getId().equals(targetBook.getId()))
|
||||
.filter(candidate -> {
|
||||
String candidateSeries = Optional.ofNullable(candidate.getMetadata())
|
||||
.map(BookMetadataEntity::getSeriesName)
|
||||
.map(String::toLowerCase)
|
||||
.orElse(null);
|
||||
return targetSeries == null || !targetSeries.equals(candidateSeries);
|
||||
})
|
||||
.map(candidate -> {
|
||||
double[] candidateVector = embeddings.get(candidate.getId());
|
||||
double similarity = vectorService.cosineSimilarity(targetVector, candidateVector);
|
||||
return new BookVectorService.ScoredBook(candidate.getId(), similarity);
|
||||
})
|
||||
.filter(scored -> scored.getScore() > 0.1)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
List<BookVectorService.ScoredBook> topSimilar = vectorService.findTopKSimilar(
|
||||
targetVector,
|
||||
candidates,
|
||||
RECOMMENDATION_LIMIT
|
||||
);
|
||||
|
||||
Set<BookRecommendationLite> recommendations = topSimilar.stream()
|
||||
.map(scored -> new BookRecommendationLite(scored.getBookId(), scored.getScore()))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
targetBook.setSimilarBooksJson(recommendations);
|
||||
booksToUpdate.add(targetBook);
|
||||
|
||||
} catch (Exception e) {
|
||||
log.error("Error updating similar books for book ID {}: {}", targetBook.getId(), e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
bookQueryService.saveAll(booksToUpdate);
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
log.info("Completed scheduled task 'updateAllSimilarBooks' at: {}. Duration: {} ms", endTime, endTime - startTime);
|
||||
|
||||
@@ -0,0 +1,163 @@
|
||||
package com.adityachandel.booklore.service.recommender;
|
||||
|
||||
import com.adityachandel.booklore.model.entity.AuthorEntity;
|
||||
import com.adityachandel.booklore.model.entity.BookEntity;
|
||||
import com.adityachandel.booklore.model.entity.BookMetadataEntity;
|
||||
import com.adityachandel.booklore.model.entity.CategoryEntity;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class BookVectorService {
|
||||
|
||||
private static final ObjectMapper objectMapper = new ObjectMapper();
|
||||
private static final int VECTOR_DIMENSION = 128;
|
||||
|
||||
public double[] generateEmbedding(BookEntity book) {
|
||||
if (book.getMetadata() == null) {
|
||||
return new double[VECTOR_DIMENSION];
|
||||
}
|
||||
|
||||
BookMetadataEntity metadata = book.getMetadata();
|
||||
Map<String, Double> features = new HashMap<>();
|
||||
|
||||
if (metadata.getTitle() != null) {
|
||||
addTextFeatures(features, "title", metadata.getTitle(), 3.0);
|
||||
}
|
||||
|
||||
if (metadata.getAuthors() != null) {
|
||||
metadata.getAuthors().stream()
|
||||
.map(AuthorEntity::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(author -> features.put("author_" + author.toLowerCase(), 5.0));
|
||||
}
|
||||
|
||||
if (metadata.getCategories() != null) {
|
||||
metadata.getCategories().stream()
|
||||
.map(CategoryEntity::getName)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(cat -> features.put("category_" + cat.toLowerCase(), 4.0));
|
||||
}
|
||||
|
||||
if (metadata.getSeriesName() != null) {
|
||||
features.put("series_" + metadata.getSeriesName().toLowerCase(), 6.0);
|
||||
}
|
||||
|
||||
if (metadata.getPublisher() != null) {
|
||||
features.put("publisher_" + metadata.getPublisher().toLowerCase(), 2.0);
|
||||
}
|
||||
|
||||
if (metadata.getDescription() != null) {
|
||||
addTextFeatures(features, "desc", metadata.getDescription(), 1.0);
|
||||
}
|
||||
|
||||
return featuresToVector(features);
|
||||
}
|
||||
|
||||
private void addTextFeatures(Map<String, Double> features, String prefix, String text, double weight) {
|
||||
String[] words = text.toLowerCase()
|
||||
.replaceAll("[^a-z0-9\\s]", " ")
|
||||
.split("\\s+");
|
||||
|
||||
Arrays.stream(words)
|
||||
.filter(w -> w.length() > 3)
|
||||
.limit(20)
|
||||
.forEach(word -> features.merge(prefix + "_" + word, weight, Double::sum));
|
||||
}
|
||||
|
||||
private double[] featuresToVector(Map<String, Double> features) {
|
||||
double[] vector = new double[VECTOR_DIMENSION];
|
||||
|
||||
for (Map.Entry<String, Double> entry : features.entrySet()) {
|
||||
int hash = Math.abs(entry.getKey().hashCode());
|
||||
int index = hash % VECTOR_DIMENSION;
|
||||
vector[index] += entry.getValue();
|
||||
}
|
||||
|
||||
double norm = 0.0;
|
||||
for (double v : vector) {
|
||||
norm += v * v;
|
||||
}
|
||||
norm = Math.sqrt(norm);
|
||||
|
||||
if (norm > 0) {
|
||||
for (int i = 0; i < vector.length; i++) {
|
||||
vector[i] /= norm;
|
||||
}
|
||||
}
|
||||
|
||||
return vector;
|
||||
}
|
||||
|
||||
public String serializeVector(double[] vector) {
|
||||
try {
|
||||
return objectMapper.writeValueAsString(vector);
|
||||
} catch (JsonProcessingException e) {
|
||||
log.error("Error serializing vector", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public double[] deserializeVector(String vectorJson) {
|
||||
if (vectorJson == null || vectorJson.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return objectMapper.readValue(vectorJson, double[].class);
|
||||
} catch (JsonProcessingException e) {
|
||||
log.error("Error deserializing vector", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public double cosineSimilarity(double[] v1, double[] v2) {
|
||||
if (v1 == null || v2 == null || v1.length != v2.length) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
double dotProduct = 0.0;
|
||||
for (int i = 0; i < v1.length; i++) {
|
||||
dotProduct += v1[i] * v2[i];
|
||||
}
|
||||
|
||||
return dotProduct;
|
||||
}
|
||||
|
||||
public List<ScoredBook> findTopKSimilar(double[] targetVector, List<ScoredBook> candidates, int k) {
|
||||
if (targetVector == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return candidates.stream()
|
||||
.sorted(Comparator.comparingDouble(ScoredBook::getScore).reversed())
|
||||
.limit(k)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static class ScoredBook {
|
||||
private final Long bookId;
|
||||
private final double score;
|
||||
|
||||
public ScoredBook(Long bookId, double score) {
|
||||
this.bookId = bookId;
|
||||
this.score = score;
|
||||
}
|
||||
|
||||
public Long getBookId() {
|
||||
return bookId;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return score;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,2 @@
|
||||
-- Adds the TEXT column that the embedding_vector entity field is mapped to.
-- IF NOT EXISTS makes the statement a no-op when the column is already present.
ALTER TABLE book_metadata ADD COLUMN IF NOT EXISTS embedding_vector TEXT;
|
||||
-- Adds the DATETIME column that the embedding_updated_at entity field is mapped to.
-- IF NOT EXISTS makes the statement a no-op when the column is already present.
ALTER TABLE book_metadata ADD COLUMN IF NOT EXISTS embedding_updated_at DATETIME;
|
||||
Reference in New Issue
Block a user