// PrivateCaptcha/pkg/difficulty/levels.go

package difficulty
import (
"context"
"errors"
"log/slog"
"math"
"runtime/debug"
"time"
"github.com/PrivateCaptcha/PrivateCaptcha/pkg/common"
dbgen "github.com/PrivateCaptcha/PrivateCaptcha/pkg/db/generated"
"github.com/PrivateCaptcha/PrivateCaptcha/pkg/leakybucket"
"github.com/PrivateCaptcha/PrivateCaptcha/pkg/puzzle"
)
var (
errBackfillPanic = errors.New("panic during backfill")
)
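// Levels derives puzzle difficulty from two leaky-bucket signals: per-property
// request volume and per-user (fingerprint) request volume. Access records are
// forwarded through accessChan and written to the time-series store in batches.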
type Levels struct {
timeSeries common.TimeSeriesStore
propertyBuckets *leakybucket.Manager[int32, leakybucket.VarLeakyBucket[int32], *leakybucket.VarLeakyBucket[int32]]
userBuckets *leakybucket.Manager[common.TFingerprint, leakybucket.ConstLeakyBucket[common.TFingerprint], *leakybucket.ConstLeakyBucket[common.TFingerprint]]
accessChan chan *common.AccessRecord
backfillChan chan *common.BackfillRequest
batchSize int
accessLogCancel context.CancelFunc
}
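// NewLevels wires up the bucket managers and the buffered channels. Init must
// be called to start the background routines and Shutdown to stop them; a
// minimal usage sketch (store stands for any common.TimeSeriesStore
// implementation, the other values are illustrative only):
//
//	levels := NewLevels(store, 100, 5*time.Minute)
//	levels.Init(time.Second, time.Minute)
//	defer levels.Shutdown()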
func NewLevels(timeSeries common.TimeSeriesStore, batchSize int, bucketSize time.Duration) *Levels {
const (
propertyBucketCap = math.MaxUint32
// the numbers below are rather arbitrary, since we can support "many";
// for users we only want to keep the "most active" ones, as activity that is
// "not active enough" barely affects difficulty, if at all
maxUserBuckets = 1_000_000
maxPropertyBuckets = 100_000
userBucketCap = math.MaxUint32
// user worst case: everybody on a private network (VPN, BigCorp internal) accesses a single resource (survey, login)
// estimate: 12 "free" requests per minute should be "enough for everybody" (tm); after that, difficulty grows
userLeakRatePerMinute = 12
userBucketSize = time.Minute / userLeakRatePerMinute
)
levels := &Levels{
timeSeries: timeSeries,
propertyBuckets: leakybucket.NewManager[int32, leakybucket.VarLeakyBucket[int32]](maxPropertyBuckets, propertyBucketCap, bucketSize),
userBuckets: leakybucket.NewManager[common.TFingerprint, leakybucket.ConstLeakyBucket[common.TFingerprint]](maxUserBuckets, userBucketCap, userBucketSize),
accessChan: make(chan *common.AccessRecord, 10*batchSize),
backfillChan: make(chan *common.BackfillRequest, batchSize),
batchSize: batchSize,
accessLogCancel: func() {},
}
return levels
}
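// requestsToDifficulty maps a bucket level ("requests") onto a difficulty in
// [minDifficulty, 255]. Growth is doubly logarithmic, so difficulty climbs
// slowly: e.g. with medium growth and minDifficulty = 100, a level of 100
// maps to roughly 106, 10,000 to roughly 116 and 1,000,000 to roughly 137
// (approximate values derived from the formula below).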
func requestsToDifficulty(requests float64, minDifficulty float64, level dbgen.DifficultyGrowth) uint8 {
if (requests < 1.0) || (level == dbgen.DifficultyGrowthConstant) || (minDifficulty >= 255.0) {
return uint8(min(minDifficulty, 255.0))
}
// The full formula is
//   y = log2(log2(x^a)) * x^b
// where parameter "a" controls sensitivity to growth
a := 1.0
switch level {
case dbgen.DifficultyGrowthSlow:
a = 0.9
case dbgen.DifficultyGrowthMedium:
a = 1.0
case dbgen.DifficultyGrowthFast:
a = 1.1
}
log2A := math.Log2(a)
m := log2A
if requests > 2.0 {
m += math.Log2(math.Log2(requests))
}
m = math.Max(m, 0.0)
b := math.Log2((256.0-minDifficulty)/(5.0+log2A)) / 32.0
fx := m * math.Pow(requests, b)
difficulty := minDifficulty + math.Round(fx)
if difficulty >= 255.0 {
return 255
}
return uint8(difficulty)
}
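// Init starts the background routines: one batches access records into the
// time-series store, the other backfills property buckets from recent stats.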
func (l *Levels) Init(accessLogInterval, backfillInterval time.Duration) {
const (
maxPendingBatchSize = 100_000
levelsService = "levels"
)
var accessCtx context.Context
accessBaseCtx := context.WithValue(context.Background(), common.ServiceContextKey, levelsService)
accessCtx, l.accessLogCancel = context.WithCancel(
context.WithValue(accessBaseCtx, common.TraceIDContextKey, "access_log"))
go common.ProcessBatchArray(accessCtx, l.accessChan, accessLogInterval, l.batchSize, maxPendingBatchSize, l.timeSeries.WriteAccessLogBatch)
difficultyBaseCtx := context.WithValue(context.Background(), common.ServiceContextKey, levelsService)
go l.backfillDifficulty(context.WithValue(difficultyBaseCtx, common.TraceIDContextKey, "backfill_difficulty"),
backfillInterval)
}
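// Shutdown cancels the access-log batching routine and closes both ingestion
// channels; closing backfillChan also ends the backfill routine.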
func (l *Levels) Shutdown() {
slog.Debug("Shutting down levels routines")
l.accessLogCancel()
close(l.accessChan)
close(l.backfillChan)
}
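// DifficultyEx logs the access, adds it to the property and user buckets and
// converts their combined level into a difficulty, flooring it at the larger
// of the property's configured level and baseDifficulty. The property bucket's
// current level is returned alongside the difficulty.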
func (l *Levels) DifficultyEx(fingerprint common.TFingerprint, p *dbgen.Property, baseDifficulty uint8, tnow time.Time) (uint8, leakybucket.TLevel) {
l.recordAccess(fingerprint, p, tnow)
minDifficulty := float64(max(p.Level.Int16, int16(baseDifficulty)))
propertyAddResult := l.propertyBuckets.Add(p.ID, 1, tnow)
if !propertyAddResult.Found {
l.backfillProperty(p)
}
userAddResult := l.userBuckets.Add(fingerprint, 1, tnow)
level := int64(userAddResult.CurrLevel)
level += int64(propertyAddResult.CurrLevel)
// just as the bucket's level measures the deviation in request volume,
// difficulty is the scaled deviation from minDifficulty
return requestsToDifficulty(float64(level), minDifficulty, p.Growth), propertyAddResult.CurrLevel
}
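// Difficulty is DifficultyEx with a zero base difficulty, keeping only the
// difficulty value.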
func (l *Levels) Difficulty(fingerprint common.TFingerprint, p *dbgen.Property, tnow time.Time) uint8 {
diff, _ := l.DifficultyEx(fingerprint, p, 0, tnow)
return diff
}
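// backfillProperty queues a backfill request so the property's bucket can be
// seeded from recent stats; called when the property is not yet tracked in
// memory.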
func (l *Levels) backfillProperty(p *dbgen.Property) {
br := &common.BackfillRequest{
OrgID: p.OrgID.Int32,
UserID: p.OrgOwnerID.Int32,
PropertyID: p.ID,
}
l.backfillChan <- br
}
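// BackfillAccess records an access reconstructed from a verified puzzle
// result; only the totals survive since the fingerprint is no longer known.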
func (l *Levels) BackfillAccess(result *puzzle.VerifyResult) {
ar := &common.AccessRecord{
Fingerprint: 0, // we lose information about the user, but having totals still helps the difficulty calculation
UserID: result.UserID,
OrgID: result.OrgID,
PropertyID: result.PropertyID,
Timestamp: result.CreatedAt,
}
l.accessChan <- ar
}
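// recordAccess queues an access record for the batch writer; nil properties
// and properties without a valid external ID are ignored.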
func (l *Levels) recordAccess(fingerprint common.TFingerprint, p *dbgen.Property, tnow time.Time) {
if (p == nil) || !p.ExternalID.Valid {
return
}
ar := &common.AccessRecord{
Fingerprint: fingerprint,
// we record events for the user that owns the org the property belongs to
// (effectively, whoever is billed for the org), rather than for whoever created it
UserID: p.OrgOwnerID.Int32,
OrgID: p.OrgID.Int32,
PropertyID: p.ID,
Timestamp: tnow,
}
l.accessChan <- ar
}
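// Reset clears all accumulated property and user bucket state.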
func (l *Levels) Reset() {
l.propertyBuckets.Clear()
l.userBuckets.Clear()
}
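// retrievePropertyStatsSafe queries the time-series store for a property's
// recent request counts, turning a panic in the query into errBackfillPanic
// so the backfill routine keeps running.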
func (l *Levels) retrievePropertyStatsSafe(ctx context.Context, r *common.BackfillRequest) (data []*common.TimeCount, err error) {
defer func() {
if rvr := recover(); rvr != nil {
slog.ErrorContext(ctx, "Recovered from ClickHouse query panic", "panic", rvr, "stack", string(debug.Stack()))
data = []*common.TimeCount{}
err = errBackfillPanic
}
}()
// 12 because we keep the last hour of 5-minute intervals in ClickHouse, so we grab all of them
timeFrom := time.Now().UTC().Add(-12 * l.propertyBuckets.LeakInterval())
return l.timeSeries.RetrievePropertyStatsSince(ctx, r, timeFrom)
}
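// backfillDifficulty serves requests from backfillChan, replaying recent
// request counts into the property's bucket. A small cache deduplicates
// requests per property for cacheDuration; the loop exits when the channel
// is closed.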
func (l *Levels) backfillDifficulty(ctx context.Context, cacheDuration time.Duration) {
slog.DebugContext(ctx, "Backfilling difficulty", "cacheDuration", cacheDuration)
const maxCacheSize = 250
cache := make(map[string]time.Time, maxCacheSize/3)
lastCleanupTime := time.Now()
for r := range l.backfillChan {
blog := slog.With("pid", r.PropertyID)
cacheKey := r.Key()
tnow := time.Now()
if t, ok := cache[cacheKey]; ok && tnow.Sub(t) <= cacheDuration {
blog.WarnContext(ctx, "Skipping duplicate backfill request", "time", t)
continue
}
counts, err := l.retrievePropertyStatsSafe(ctx, r)
if err != nil {
blog.ErrorContext(ctx, "Failed to backfill stats", common.ErrAttr(err))
continue
}
cache[cacheKey] = tnow
if len(counts) > 0 {
var addResult leakybucket.AddResult
for _, count := range counts {
addResult = l.propertyBuckets.Add(r.PropertyID, count.Count, count.Timestamp)
}
blog.InfoContext(ctx, "Backfilled request counts", "counts", len(counts), "level", addResult.CurrLevel)
}
if (len(cache) > maxCacheSize) || (time.Since(lastCleanupTime) >= cacheDuration) {
slog.DebugContext(ctx, "Cleaning up backfill cache", "size", len(cache))
for key, value := range cache {
if tnow.Sub(value) > cacheDuration {
delete(cache, key)
}
}
lastCleanupTime = time.Now()
}
}
slog.DebugContext(ctx, "Finished backfilling difficulty")
}