[statspro] Avoid copying histograms, perf improvement (#7666)

* [statspro] Avoid unnecessary histogram copies

* bump

* dropped fds and colset

* fix more tests

* merge main

* [ga-format-pr] Run go/utils/repofmt/format_repo.sh and go/Godeps/update.sh

* bump

---------

Co-authored-by: max-hoffman <max-hoffman@users.noreply.github.com>
Author: Maximilian Hoffman
Committed by: GitHub
Date: 2024-04-02 09:38:31 -07:00
Parent: 56e261abe7
Commit: d6aa1e6af0
14 changed files with 337 additions and 265 deletions
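The change is easiest to see in the DoltStats diff further down: previously every stats read went through toSql(), which rebuilt a stats.Statistic and copied each histogram bucket; now DoltStats wraps a *stats.Statistic and DoltBucket embeds a *stats.Bucket, both satisfying the engine interfaces (sql.Statistic, sql.HistogramBucket) directly, so GetStats can return the stored value as-is. A minimal sketch of that embedding pattern, using stand-in types rather than the real Dolt/go-mysql-server definitions:

package statssketch

import "fmt"

// Stand-in for sql.Statistic; the real interface carries many more methods.
type Statistic interface {
	RowCount() uint64
}

// Stand-in for stats.Statistic, the engine's concrete statistic type.
type baseStatistic struct {
	RowCnt uint64
}

func (s *baseStatistic) RowCount() uint64 { return s.RowCnt }

// Before: a parallel struct converted (copied) into a baseStatistic on
// every read, the way the removed toSql() rebuilt each histogram bucket.
type oldDoltStats struct {
	RowCount uint64
}

func (s *oldDoltStats) toSql() Statistic {
	return &baseStatistic{RowCnt: s.RowCount} // fresh copy per read
}

// After: embed the shared statistic and satisfy the interface directly,
// so reads return the stored value with no conversion.
type newDoltStats struct {
	*baseStatistic
}

func Demo() {
	old := &oldDoltStats{RowCount: 15}
	fmt.Println(old.toSql().RowCount()) // allocates a copy

	cur := &newDoltStats{baseStatistic: &baseStatistic{RowCnt: 15}}
	var st Statistic = cur // method promoted from the embedded pointer
	fmt.Println(st.RowCount()) // no copy
}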

View File

@@ -57,7 +57,7 @@ require (
github.com/cespare/xxhash v1.1.0
github.com/creasty/defaults v1.6.0
github.com/dolthub/flatbuffers/v23 v23.3.3-dh.2
github.com/dolthub/go-mysql-server v0.18.1-0.20240401223252-947e7c377fd3
github.com/dolthub/go-mysql-server v0.18.1-0.20240402153908-f98252471387
github.com/dolthub/swiss v0.1.0
github.com/goccy/go-json v0.10.2
github.com/google/go-github/v57 v57.0.0

View File

@@ -183,8 +183,8 @@ github.com/dolthub/fslock v0.0.3 h1:iLMpUIvJKMKm92+N1fmHVdxJP5NdyDK5bK7z7Ba2s2U=
github.com/dolthub/fslock v0.0.3/go.mod h1:QWql+P17oAAMLnL4HGB5tiovtDuAjdDTPbuqx7bYfa0=
github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e h1:kPsT4a47cw1+y/N5SSCkma7FhAPw7KeGmD6c9PBZW9Y=
github.com/dolthub/go-icu-regex v0.0.0-20230524105445-af7e7991c97e/go.mod h1:KPUcpx070QOfJK1gNe0zx4pA5sicIK1GMikIGLKC168=
github.com/dolthub/go-mysql-server v0.18.1-0.20240401223252-947e7c377fd3 h1:W+E0m/aPEiBFwW7teLHusek2sjTdrpyqWZyiIihH6ik=
github.com/dolthub/go-mysql-server v0.18.1-0.20240401223252-947e7c377fd3/go.mod h1:SJleIOwC74u9tdUoGgVgM/eLlwVj3sJEFfx0sdStvW0=
github.com/dolthub/go-mysql-server v0.18.1-0.20240402153908-f98252471387 h1:/611tSrBfDRH38MbrSgdvWZiX++d5txRThBwX0e+l2s=
github.com/dolthub/go-mysql-server v0.18.1-0.20240402153908-f98252471387/go.mod h1:SJleIOwC74u9tdUoGgVgM/eLlwVj3sJEFfx0sdStvW0=
github.com/dolthub/ishell v0.0.0-20221214210346-d7db0b066488 h1:0HHu0GWJH0N6a6keStrHhUAK5/o9LVfkh44pvsV4514=
github.com/dolthub/ishell v0.0.0-20221214210346-d7db0b066488/go.mod h1:ehexgi1mPxRTk0Mok/pADALuHbvATulTh6gzr7NzZto=
github.com/dolthub/jsonpath v0.0.2-0.20240227200619-19675ab05c71 h1:bMGS25NWAGTEtT5tOBsCuCrlYnLRKpbJVJkDbrTRhwQ=

View File

@@ -244,7 +244,7 @@ func (n *NomsStatsDatabase) DeleteBranchStats(ctx context.Context, branch string
return nil
}
func (n *NomsStatsDatabase) ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, _, newChunks []statspro.DoltBucket) error {
func (n *NomsStatsDatabase) ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []sql.HistogramBucket) error {
var dbStat dbStats
for i, b := range n.branches {
if strings.EqualFold(b, branch) {
@@ -261,12 +261,12 @@ func (n *NomsStatsDatabase) ReplaceChunks(ctx context.Context, branch string, qu
}
if _, ok := dbStat[qual]; ok {
oldChunks := dbStat[qual].Histogram
oldChunks := dbStat[qual].Hist
targetBuckets, err := statspro.MergeNewChunks(targetHashes, oldChunks, newChunks)
if err != nil {
return err
}
dbStat[qual].Histogram = targetBuckets
dbStat[qual].Hist = targetBuckets
} else {
dbStat[qual] = statspro.NewDoltStats()
}

View File

@@ -109,68 +109,70 @@ func loadStats(ctx *sql.Context, db dsess.SqlDatabase, m prolly.Map) (map[sql.St
}
qual := sql.NewStatQualifier(dbName, tableName, indexName)
if currentStat.Qual.String() != qual.String() {
if !currentStat.Qual.Empty() {
currentStat.LowerBound, err = loadLowerBound(ctx, currentStat.Qual)
if currentStat.Statistic.Qual.String() != qual.String() {
if !currentStat.Statistic.Qual.Empty() {
currentStat.Statistic.LowerBnd, err = loadLowerBound(ctx, currentStat.Statistic.Qual)
if err != nil {
return nil, err
}
fds, colSet, err := loadFuncDeps(ctx, db, currentStat.Qual)
fds, colSet, err := loadFuncDeps(ctx, db, currentStat.Statistic.Qual)
if err != nil {
return nil, err
}
currentStat.Fds = fds
currentStat.ColSet = colSet
currentStat.Statistic.Fds = fds
currentStat.Statistic.Colset = colSet
currentStat.UpdateActive()
qualToStats[currentStat.Qual] = currentStat
qualToStats[currentStat.Statistic.Qual] = currentStat
}
currentStat = statspro.NewDoltStats()
currentStat.Qual = qual
currentStat.Columns = columns
currentStat.LowerBound = lowerBound
currentStat.Statistic.Qual = qual
currentStat.Statistic.Cols = columns
currentStat.Statistic.LowerBnd = lowerBound
}
if currentStat.Histogram == nil {
currentStat.Types, err = stats.ParseTypeStrings(typs)
if currentStat.Statistic.Hist == nil {
currentStat.Statistic.Typs, err = stats.ParseTypeStrings(typs)
if err != nil {
return nil, err
}
currentStat.Qual = qual
currentStat.Statistic.Qual = qual
}
bucket := statspro.DoltBucket{
Chunk: commit,
RowCount: uint64(rowCount),
DistinctCount: uint64(distinctCount),
NullCount: uint64(nullCount),
CreatedAt: createdAt,
Mcvs: mcvs,
McvCount: mcvCnts,
BoundCount: upperBoundCnt,
UpperBound: boundRow,
Chunk: commit,
Created: createdAt,
Bucket: &stats.Bucket{
RowCnt: uint64(rowCount),
DistinctCnt: uint64(distinctCount),
NullCnt: uint64(nullCount),
McvVals: mcvs,
McvsCnt: mcvCnts,
BoundCnt: upperBoundCnt,
BoundVal: boundRow,
},
}
currentStat.Histogram = append(currentStat.Histogram, bucket)
currentStat.RowCount += uint64(rowCount)
currentStat.DistinctCount += uint64(distinctCount)
currentStat.NullCount += uint64(rowCount)
if currentStat.CreatedAt.Before(createdAt) {
currentStat.CreatedAt = createdAt
currentStat.Hist = append(currentStat.Hist, bucket)
currentStat.Statistic.RowCnt += uint64(rowCount)
currentStat.Statistic.DistinctCnt += uint64(distinctCount)
currentStat.Statistic.NullCnt += uint64(rowCount)
if currentStat.Statistic.Created.Before(createdAt) {
currentStat.Statistic.Created = createdAt
}
}
currentStat.LowerBound, err = loadLowerBound(ctx, currentStat.Qual)
currentStat.Statistic.LowerBnd, err = loadLowerBound(ctx, currentStat.Statistic.Qual)
if err != nil {
return nil, err
}
fds, colSet, err := loadFuncDeps(ctx, db, currentStat.Qual)
fds, colSet, err := loadFuncDeps(ctx, db, currentStat.Statistic.Qual)
if err != nil {
return nil, err
}
currentStat.Fds = fds
currentStat.ColSet = colSet
currentStat.Statistic.Fds = fds
currentStat.Statistic.Colset = colSet
currentStat.UpdateActive()
qualToStats[currentStat.Qual] = currentStat
qualToStats[currentStat.Statistic.Qual] = currentStat
return qualToStats, nil
}

View File

@@ -48,7 +48,7 @@ func deleteIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *s
keyBuilder := val.NewTupleBuilder(kd)
qual := dStats.Qual
qual := dStats.Qualifier()
pool := statsMap.NodeStore().Pool()
// delete previous entries for this index -> (db, table, index, pos)
@@ -92,22 +92,22 @@ func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *stat
keyBuilder := val.NewTupleBuilder(kd)
valueBuilder := val.NewTupleBuilder(vd)
qual := dStats.Qual
qual := dStats.Qualifier()
pool := statsMap.NodeStore().Pool()
// now add new buckets
typesB := strings.Builder{}
sep := ""
for _, t := range dStats.Types {
for _, t := range dStats.Statistic.Typs {
typesB.WriteString(sep + t.String())
sep = ","
}
typesStr := typesB.String()
var pos int64
for _, h := range dStats.Histogram {
for _, h := range dStats.Hist {
var upperBoundElems []string
for _, v := range h.UpperBound {
for _, v := range h.UpperBound() {
upperBoundElems = append(upperBoundElems, fmt.Sprintf("%v", v))
}
@@ -117,23 +117,23 @@ func putIndexRows(ctx context.Context, statsMap *prolly.MutableMap, dStats *stat
keyBuilder.PutInt64(3, pos)
valueBuilder.PutInt64(0, schema.StatsVersion)
valueBuilder.PutString(1, h.Chunk.String())
valueBuilder.PutInt64(2, int64(h.RowCount))
valueBuilder.PutInt64(3, int64(h.DistinctCount))
valueBuilder.PutInt64(4, int64(h.NullCount))
valueBuilder.PutString(5, strings.Join(dStats.Columns, ","))
valueBuilder.PutString(1, statspro.DoltBucketChunk(h).String())
valueBuilder.PutInt64(2, int64(h.RowCount()))
valueBuilder.PutInt64(3, int64(h.DistinctCount()))
valueBuilder.PutInt64(4, int64(h.NullCount()))
valueBuilder.PutString(5, strings.Join(dStats.Columns(), ","))
valueBuilder.PutString(6, typesStr)
valueBuilder.PutString(7, stats.StringifyKey(h.UpperBound, dStats.Types))
valueBuilder.PutInt64(8, int64(h.BoundCount))
valueBuilder.PutDatetime(9, h.CreatedAt)
for i, r := range h.Mcvs {
valueBuilder.PutString(10+i, stats.StringifyKey(r, dStats.Types))
valueBuilder.PutString(7, stats.StringifyKey(h.UpperBound(), dStats.Statistic.Typs))
valueBuilder.PutInt64(8, int64(h.BoundCount()))
valueBuilder.PutDatetime(9, statspro.DoltBucketCreated(h))
for i, r := range h.Mcvs() {
valueBuilder.PutString(10+i, stats.StringifyKey(r, dStats.Statistic.Typs))
}
var mcvCntsRow sql.Row
for _, v := range h.McvCount {
for _, v := range h.McvCounts() {
mcvCntsRow = append(mcvCntsRow, int(v))
}
valueBuilder.PutString(14, stats.StringifyKey(mcvCntsRow, dStats.Types))
valueBuilder.PutString(14, stats.StringifyKey(mcvCntsRow, dStats.Statistic.Typs))
key := keyBuilder.Build(pool)
value := valueBuilder.Build(pool)

View File

@@ -94,7 +94,7 @@ func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db strin
curStat, ok := statDb.GetStat(branch, qual)
if !ok {
curStat = NewDoltStats()
curStat.Qual = qual
curStat.Statistic.Qual = qual
}
idxMeta, err := newIdxMeta(ctx, curStat, dTab, idx, cols)
if err != nil {
@@ -111,7 +111,7 @@ func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db strin
// merge new chunks with preexisting chunks
for _, idxMeta := range idxMetas {
stat := newTableStats[idxMeta.qual]
targetChunks, err := MergeNewChunks(idxMeta.allAddrs, idxMeta.keepChunks, stat.Histogram)
targetChunks, err := MergeNewChunks(idxMeta.allAddrs, idxMeta.keepChunks, stat.Hist)
if err != nil {
return err
}
@@ -120,7 +120,7 @@ func (p *Provider) RefreshTableStats(ctx *sql.Context, table sql.Table, db strin
continue
}
stat.Chunks = idxMeta.allAddrs
stat.Histogram = targetChunks
stat.Hist = targetChunks
stat.UpdateActive()
if err := statDb.SetStat(ctx, branch, idxMeta.qual, stat); err != nil {
return err
@@ -176,7 +176,7 @@ func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table,
return indexMeta{}, err
} else if cnt == 0 {
return indexMeta{
qual: curStats.Qual,
qual: curStats.Statistic.Qual,
cols: cols,
}, nil
}
@@ -188,7 +188,7 @@ func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table,
}
var addrs []hash.Hash
var keepChunks []DoltBucket
var keepChunks []sql.HistogramBucket
var missingAddrs float64
var missingChunks []tree.Node
var missingOffsets []updateOrdinal
@@ -210,27 +210,27 @@ func newIdxMeta(ctx *sql.Context, curStats *DoltStats, doltTable *doltdb.Table,
missingOffsets = append(missingOffsets, updateOrdinal{offset, offset + uint64(treeCnt)})
missingAddrs++
} else {
keepChunks = append(keepChunks, curStats.Histogram[bucketIdx])
keepChunks = append(keepChunks, curStats.Hist[bucketIdx])
}
offset += uint64(treeCnt)
}
var dropChunks []DoltBucket
var dropChunks []sql.HistogramBucket
for _, h := range curStats.Chunks {
var match bool
for _, b := range keepChunks {
if b.Chunk == h {
if DoltBucketChunk(b) == h {
match = true
break
}
}
if !match {
dropChunks = append(dropChunks, curStats.Histogram[curStats.Active[h]])
dropChunks = append(dropChunks, curStats.Hist[curStats.Active[h]])
}
}
return indexMeta{
qual: curStats.Qual,
qual: curStats.Statistic.Qual,
cols: cols,
newNodes: missingChunks,
updateOrdinals: missingOffsets,

View File

@@ -168,18 +168,18 @@ func (p *Provider) checkRefresh(ctx *sql.Context, sqlDb sql.Database, dbName, br
curStat, ok := statDb.GetStat(branch, qual)
if !ok {
curStat = NewDoltStats()
curStat.Qual = qual
curStat.Statistic.Qual = qual
cols := make([]string, len(index.Expressions()))
tablePrefix := fmt.Sprintf("%s.", table)
for i, c := range index.Expressions() {
cols[i] = strings.TrimPrefix(strings.ToLower(c), tablePrefix)
}
curStat.Columns = cols
curStat.Statistic.Cols = cols
}
ctx.GetLogger().Debugf("statistics refresh index: %s", qual.String())
updateMeta, err := newIdxMeta(ctx, curStat, dTab, index, curStat.Columns)
updateMeta, err := newIdxMeta(ctx, curStat, dTab, index, curStat.Columns())
if err != nil {
ctx.GetLogger().Debugf("statistics refresh error: %s", err.Error())
continue
@@ -215,7 +215,7 @@ func (p *Provider) checkRefresh(ctx *sql.Context, sqlDb sql.Database, dbName, br
if _, ok := statDb.GetStat(branch, updateMeta.qual); !ok {
err = statDb.SetStat(ctx, branch, updateMeta.qual, stat)
} else {
err = statDb.ReplaceChunks(ctx, branch, updateMeta.qual, updateMeta.allAddrs, updateMeta.dropChunks, stat.Histogram)
err = statDb.ReplaceChunks(ctx, branch, updateMeta.qual, updateMeta.allAddrs, updateMeta.dropChunks, stat.Hist)
}
if err != nil {
return err

View File

@@ -26,31 +26,142 @@ import (
)
type DoltStats struct {
mu *sync.Mutex
Statistic *stats.Statistic
mu *sync.Mutex
// Chunks is a list of addresses for the histogram fanout level
Chunks []hash.Hash
// Active maps a chunk/bucket address to its position in
// the histogram. 1-indexed to differentiate from an empty
// field on disk
Active map[hash.Hash]int
Hist sql.Histogram
}
RowCount uint64
DistinctCount uint64
NullCount uint64
AvgSize uint64
Qual sql.StatQualifier
CreatedAt time.Time
Histogram DoltHistogram
Columns []string
Types []sql.Type
IdxClass uint8
LowerBound sql.Row
Fds *sql.FuncDepSet
ColSet sql.ColSet
var _ sql.Statistic = (*DoltStats)(nil)
func (s *DoltStats) WithColSet(set sql.ColSet) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithColSet(set).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithFuncDeps(set *sql.FuncDepSet) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithFuncDeps(set).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithDistinctCount(u uint64) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithDistinctCount(u).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithRowCount(u uint64) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithRowCount(u).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithNullCount(u uint64) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithNullCount(u).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithAvgSize(u uint64) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithAvgSize(u).(*stats.Statistic)
return &ret
}
func (s *DoltStats) WithLowerBound(row sql.Row) sql.Statistic {
ret := *s
ret.Statistic = ret.Statistic.WithLowerBound(row).(*stats.Statistic)
return &ret
}
func (s *DoltStats) RowCount() uint64 {
return s.Statistic.RowCount()
}
func (s *DoltStats) DistinctCount() uint64 {
return s.Statistic.DistinctCount()
}
func (s *DoltStats) NullCount() uint64 {
return s.Statistic.NullCount()
}
func (s *DoltStats) AvgSize() uint64 {
return s.Statistic.AvgSize()
}
func (s *DoltStats) CreatedAt() time.Time {
return s.Statistic.CreatedAt()
}
func (s *DoltStats) Columns() []string {
return s.Statistic.Columns()
}
func (s *DoltStats) Types() []sql.Type {
return s.Statistic.Types()
}
func (s *DoltStats) Qualifier() sql.StatQualifier {
return s.Statistic.Qualifier()
}
func (s *DoltStats) IndexClass() sql.IndexClass {
return s.Statistic.IndexClass()
}
func (s *DoltStats) FuncDeps() *sql.FuncDepSet {
return s.Statistic.FuncDeps()
}
func (s *DoltStats) ColSet() sql.ColSet {
return s.Statistic.ColSet()
}
func (s *DoltStats) LowerBound() sql.Row {
return s.Statistic.LowerBound()
}
func NewDoltStats() *DoltStats {
return &DoltStats{mu: &sync.Mutex{}, Active: make(map[hash.Hash]int)}
return &DoltStats{mu: &sync.Mutex{}, Active: make(map[hash.Hash]int), Statistic: &stats.Statistic{}}
}
func (s *DoltStats) ToInterface() interface{} {
ret := s.Statistic.ToInterface().(map[string]interface{})
var hist sql.Histogram
for _, b := range s.Hist {
hist = append(hist, b)
}
ret["statistic"].(map[string]interface{})["buckets"] = hist.ToInterface()
return ret
}
func (s *DoltStats) WithHistogram(h sql.Histogram) (sql.Statistic, error) {
ret := *s
ret.Hist = nil
for _, b := range h {
doltB, ok := b.(DoltBucket)
if !ok {
return nil, fmt.Errorf("invalid bucket type: %T", b)
}
ret.Hist = append(ret.Hist, doltB)
}
return &ret, nil
}
func (s *DoltStats) Histogram() sql.Histogram {
return s.Hist
}
func DoltStatsFromSql(stat sql.Statistic) (*DoltStats, error) {
@@ -58,22 +169,15 @@ func DoltStatsFromSql(stat sql.Statistic) (*DoltStats, error) {
if err != nil {
return nil, err
}
return &DoltStats{
mu: &sync.Mutex{},
Qual: stat.Qualifier(),
RowCount: stat.RowCount(),
DistinctCount: stat.DistinctCount(),
NullCount: stat.NullCount(),
AvgSize: stat.AvgSize(),
CreatedAt: stat.CreatedAt(),
Histogram: hist,
Columns: stat.Columns(),
Types: stat.Types(),
IdxClass: uint8(stat.IndexClass()),
LowerBound: stat.LowerBound(),
Fds: stat.FuncDeps(),
ColSet: stat.ColSet(),
}, nil
ret := &DoltStats{
mu: &sync.Mutex{},
Hist: hist,
Statistic: stats.NewStatistic(stat.RowCount(), stat.DistinctCount(), stat.NullCount(), stat.AvgSize(), stat.CreatedAt(), stat.Qualifier(), stat.Columns(), stat.Types(), nil, stat.IndexClass(), stat.LowerBound()),
Active: make(map[hash.Hash]int),
}
ret.Statistic.Fds = stat.FuncDeps()
ret.Statistic.Colset = stat.ColSet()
return ret, nil
}
func (s *DoltStats) UpdateActive() {
@@ -86,49 +190,26 @@ func (s *DoltStats) UpdateActive() {
s.Active = newActive
}
func (s *DoltStats) updateCounts() {
s.mu.Lock()
defer s.mu.Unlock()
var newDistinct uint64
var newRows uint64
var newNulls uint64
for _, b := range s.Histogram {
newDistinct += b.DistinctCount
newRows += b.RowCount
newNulls += b.NullCount
}
s.RowCount = newRows
s.DistinctCount = newDistinct
s.NullCount = newNulls
}
func (s *DoltStats) toSql() sql.Statistic {
s.mu.Lock()
defer s.mu.Unlock()
typStrs := make([]string, len(s.Types))
for i, typ := range s.Types {
typStrs[i] = typ.String()
}
stat := stats.NewStatistic(s.RowCount, s.DistinctCount, s.NullCount, s.AvgSize, s.CreatedAt, s.Qual, s.Columns, s.Types, s.Histogram.toSql(), sql.IndexClass(s.IdxClass), s.LowerBound)
return stat.WithColSet(s.ColSet).WithFuncDeps(s.Fds)
}
type DoltHistogram []DoltBucket
type DoltBucket struct {
Chunk hash.Hash
RowCount uint64
DistinctCount uint64
NullCount uint64
CreatedAt time.Time
Mcvs []sql.Row
McvCount []uint64
BoundCount uint64
UpperBound sql.Row
*stats.Bucket
Chunk hash.Hash
Created time.Time
}
func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (DoltHistogram, error) {
ret := make([]DoltBucket, len(hist))
func DoltBucketChunk(b sql.HistogramBucket) hash.Hash {
return b.(DoltBucket).Chunk
}
func DoltBucketCreated(b sql.HistogramBucket) time.Time {
return b.(DoltBucket).Created
}
var _ sql.HistogramBucket = (*DoltBucket)(nil)
func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (sql.Histogram, error) {
ret := make(sql.Histogram, len(hist))
var err error
for i, b := range hist {
upperBound := make(sql.Row, len(b.UpperBound()))
@@ -149,24 +230,8 @@ func DoltHistFromSql(hist sql.Histogram, types []sql.Type) (DoltHistogram, error
}
}
ret[i] = DoltBucket{
RowCount: b.RowCount(),
DistinctCount: b.DistinctCount(),
NullCount: b.NullCount(),
Mcvs: mcvs,
McvCount: b.McvCounts(),
BoundCount: b.BoundCount(),
UpperBound: upperBound,
Bucket: stats.NewHistogramBucket(b.RowCount(), b.DistinctCount(), b.NullCount(), b.BoundCount(), upperBound, b.McvCounts(), mcvs),
}
}
return ret, nil
}
func (s DoltHistogram) toSql() []*stats.Bucket {
ret := make([]*stats.Bucket, len(s))
for i, b := range s {
upperBound := make([]interface{}, len(b.UpperBound))
copy(upperBound, b.UpperBound)
ret[i] = stats.NewHistogramBucket(b.RowCount, b.DistinctCount, b.NullCount, b.BoundCount, upperBound, b.McvCount, b.Mcvs)
}
return ret
}
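The DoltBucketChunk and DoltBucketCreated helpers above exist because sql.HistogramBucket is the engine's generic interface and does not expose the Dolt-specific chunk address or creation time; code that knows its buckets came from Dolt recovers those fields by type assertion. A simplified, self-contained sketch of that pattern (stand-in types, not the real definitions):

package statssketch

import (
	"fmt"
	"time"
)

// Stand-ins for sql.HistogramBucket and stats.Bucket, trimmed down from
// the diff above.
type HistogramBucket interface {
	RowCount() uint64
}

type Bucket struct {
	RowCnt uint64
}

func (b *Bucket) RowCount() uint64 { return b.RowCnt }

// DoltBucket embeds the generic bucket and adds Dolt-only metadata that
// the HistogramBucket interface does not expose.
type DoltBucket struct {
	*Bucket
	Chunk   string // stands in for hash.Hash
	Created time.Time
}

// Callers that know the buckets came from Dolt assert back to the
// concrete type, mirroring DoltBucketChunk/DoltBucketCreated above.
func DoltBucketChunk(b HistogramBucket) string {
	return b.(DoltBucket).Chunk
}

func Demo2() {
	var b HistogramBucket = DoltBucket{
		Bucket:  &Bucket{RowCnt: 15},
		Chunk:   "0123abcd",
		Created: time.Now(),
	}
	fmt.Println(b.RowCount(), DoltBucketChunk(b))
}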

View File

@@ -45,7 +45,7 @@ type Database interface {
DeleteStats(branch string, quals ...sql.StatQualifier)
// ReplaceChunks is an update interface that lets a stats implementation
// decide how to edit stats for a stats refresh.
ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []DoltBucket) error
ReplaceChunks(ctx context.Context, branch string, qual sql.StatQualifier, targetHashes []hash.Hash, dropChunks, newChunks []sql.HistogramBucket) error
// Flush instructs the database to sync any partial state to disk
Flush(ctx context.Context, branch string) error
// Close finalizes any file references.
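With ReplaceChunks now trafficking in []sql.HistogramBucket, the refresh path in checkRefresh above can hand the stored stat.Hist straight through, where the old signature forced a []DoltBucket conversion. A hedged sketch of that set-or-replace shape; GetStat and SetStat are taken from the call sites in checkRefresh, and the stub types are simplifications, not the actual Dolt contract:

package statssketch

import "context"

// Stub types standing in for the Dolt/GMS originals in the diffs above.
type StatQualifier struct{ Db, Tab, Idx string }
type Hash [20]byte
type HistogramBucket interface{ RowCount() uint64 }

type DoltStats struct {
	Hist []HistogramBucket
}

type indexMeta struct {
	qual       StatQualifier
	allAddrs   []Hash
	dropChunks []HistogramBucket
}

// Database mirrors the interface above, trimmed to the methods used here.
type Database interface {
	GetStat(branch string, qual StatQualifier) (*DoltStats, bool)
	SetStat(ctx context.Context, branch string, qual StatQualifier, stats *DoltStats) error
	ReplaceChunks(ctx context.Context, branch string, qual StatQualifier,
		targetHashes []Hash, dropChunks, newChunks []HistogramBucket) error
}

// refreshOne mirrors the checkRefresh call site: the first write for a
// qualifier stores the stat wholesale; later refreshes splice the new
// chunk-level buckets in via ReplaceChunks.
func refreshOne(ctx context.Context, db Database, branch string, meta indexMeta, stat *DoltStats) error {
	if _, ok := db.GetStat(branch, meta.qual); !ok {
		return db.SetStat(ctx, branch, meta.qual, stat)
	}
	return db.ReplaceChunks(ctx, branch, meta.qual, meta.allAddrs, meta.dropChunks, stat.Hist)
}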

View File

@@ -38,8 +38,8 @@ type indexMeta struct {
newNodes []tree.Node
// updateOrdinals are [start, stop] tuples for each update chunk
updateOrdinals []updateOrdinal
keepChunks []DoltBucket
dropChunks []DoltBucket
keepChunks []sql.HistogramBucket
dropChunks []sql.HistogramBucket
allAddrs []hash.Hash
}
@@ -160,7 +160,7 @@ func (p *Provider) GetTableDoltStats(ctx *sql.Context, branch, db, table string)
for _, qual := range statDb.ListStatQuals(branch) {
if strings.EqualFold(db, qual.Database) && strings.EqualFold(table, qual.Tab) {
stat, _ := statDb.GetStat(branch, qual)
ret = append(ret, stat.toSql())
ret = append(ret, stat)
}
}
@@ -224,7 +224,7 @@ func (p *Provider) GetStats(ctx *sql.Context, qual sql.StatQualifier, _ []string
if !ok {
return nil, false
}
return stat.toSql(), true
return stat, true
}
func (p *Provider) DropDbStats(ctx *sql.Context, db string, flush bool) error {
@@ -299,7 +299,7 @@ func (p *Provider) RowCount(ctx *sql.Context, db, table string) (uint64, error)
return 0, nil
}
return priStats.RowCount, nil
return priStats.RowCount(), nil
}
func (p *Provider) DataLength(ctx *sql.Context, db, table string) (uint64, error) {
@@ -322,5 +322,5 @@ func (p *Provider) DataLength(ctx *sql.Context, db, table string) (uint64, error
return 0, nil
}
return priStats.AvgSize, nil
return priStats.AvgSize(), nil
}

View File

@@ -82,13 +82,13 @@ func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Ta
} else if cnt == 0 {
// table is empty
ret[meta.qual] = NewDoltStats()
ret[meta.qual].CreatedAt = time.Now()
ret[meta.qual].Columns = meta.cols
ret[meta.qual].Types = types
ret[meta.qual].Qual = meta.qual
ret[meta.qual].Statistic.Created = time.Now()
ret[meta.qual].Statistic.Cols = meta.cols
ret[meta.qual].Statistic.Typs = types
ret[meta.qual].Statistic.Qual = meta.qual
ret[meta.qual].Fds = fds
ret[meta.qual].ColSet = colSet
ret[meta.qual].Statistic.Fds = fds
ret[meta.qual].Statistic.Colset = colSet
continue
}
@@ -100,10 +100,10 @@ func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Ta
updater := newBucketBuilder(meta.qual, len(meta.cols), prollyMap.KeyDesc())
ret[meta.qual] = NewDoltStats()
ret[meta.qual].Chunks = meta.allAddrs
ret[meta.qual].CreatedAt = time.Now()
ret[meta.qual].Columns = meta.cols
ret[meta.qual].Types = types
ret[meta.qual].Qual = meta.qual
ret[meta.qual].Statistic.Created = time.Now()
ret[meta.qual].Statistic.Cols = meta.cols
ret[meta.qual].Statistic.Typs = types
ret[meta.qual].Statistic.Qual = meta.qual
var start, stop uint64
// read leaf rows for each bucket
@@ -140,14 +140,14 @@ func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Ta
return nil, err
}
bucket.Chunk = chunk.HashOf()
ret[updater.qual].Histogram = append(ret[updater.qual].Histogram, bucket)
ret[updater.qual].Hist = append(ret[updater.qual].Hist, bucket)
}
ret[updater.qual].DistinctCount = uint64(updater.globalDistinct)
ret[updater.qual].RowCount = uint64(updater.globalCount)
ret[updater.qual].LowerBound = firstRow
ret[updater.qual].Fds = fds
ret[updater.qual].ColSet = colSet
ret[updater.qual].Statistic.DistinctCnt = uint64(updater.globalDistinct)
ret[updater.qual].Statistic.RowCnt = uint64(updater.globalCount)
ret[updater.qual].Statistic.LowerBnd = firstRow
ret[updater.qual].Statistic.Fds = fds
ret[updater.qual].Statistic.Colset = colSet
ret[updater.qual].UpdateActive()
}
return ret, nil
@@ -156,22 +156,22 @@ func createNewStatsBuckets(ctx *sql.Context, sqlTable sql.Table, dTab *doltdb.Ta
// MergeNewChunks combines a set of old and new chunks to create
// the desired target histogram. Undefined behavior if a |targetHash|
// does not exist in either |oldChunks| or |newChunks|.
func MergeNewChunks(inputHashes []hash.Hash, oldChunks, newChunks []DoltBucket) ([]DoltBucket, error) {
func MergeNewChunks(inputHashes []hash.Hash, oldChunks, newChunks []sql.HistogramBucket) ([]sql.HistogramBucket, error) {
hashToPos := make(map[hash.Hash]int, len(inputHashes))
for i, h := range inputHashes {
hashToPos[h] = i
}
var cnt int
targetBuckets := make([]DoltBucket, len(inputHashes))
targetBuckets := make([]sql.HistogramBucket, len(inputHashes))
for _, c := range oldChunks {
if idx, ok := hashToPos[c.Chunk]; ok {
if idx, ok := hashToPos[DoltBucketChunk(c)]; ok {
cnt++
targetBuckets[idx] = c
}
}
for _, c := range newChunks {
if idx, ok := hashToPos[c.Chunk]; ok && targetBuckets[idx].Chunk.IsEmpty() {
if idx, ok := hashToPos[DoltBucketChunk(c)]; ok && targetBuckets[idx] == nil {
cnt++
targetBuckets[idx] = c
}
@@ -280,13 +280,15 @@ func (u *bucketBuilder) finalize(ctx context.Context, ns tree.NodeStore) (DoltBu
}
}
return DoltBucket{
RowCount: uint64(u.count),
DistinctCount: uint64(u.distinct),
BoundCount: uint64(u.currentCnt),
Mcvs: mcvRows,
McvCount: u.mcvs.Counts(),
UpperBound: upperBound,
NullCount: uint64(u.nulls),
Bucket: &stats.Bucket{
RowCnt: uint64(u.count),
DistinctCnt: uint64(u.distinct),
BoundCnt: uint64(u.currentCnt),
McvVals: mcvRows,
McvsCnt: u.mcvs.Counts(),
BoundVal: upperBound,
NullCnt: uint64(u.nulls),
},
}, nil
}
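MergeNewChunks above is the piece that decides which buckets survive a refresh: index the target hashes by position, let old (kept) buckets claim their slots first, then let freshly built buckets fill whatever is still nil. A standalone sketch of that logic, with a string id standing in for hash.Hash and a stub bucket type in place of sql.HistogramBucket:

package statssketch

import "fmt"

type bucket interface {
	chunk() string
}

type stubBucket struct{ id string }

func (b stubBucket) chunk() string { return b.id }

// mergeNewChunks lays old and new buckets into the order given by
// inputHashes, preferring a pre-existing bucket for each hash.
func mergeNewChunks(inputHashes []string, oldChunks, newChunks []bucket) ([]bucket, error) {
	hashToPos := make(map[string]int, len(inputHashes))
	for i, h := range inputHashes {
		hashToPos[h] = i
	}
	var cnt int
	target := make([]bucket, len(inputHashes))
	for _, c := range oldChunks {
		if idx, ok := hashToPos[c.chunk()]; ok {
			cnt++
			target[idx] = c
		}
	}
	for _, c := range newChunks {
		// Only fill slots an old bucket did not already claim,
		// mirroring the `targetBuckets[idx] == nil` check above.
		if idx, ok := hashToPos[c.chunk()]; ok && target[idx] == nil {
			cnt++
			target[idx] = c
		}
	}
	if cnt != len(inputHashes) {
		// Assumption: surface a missing hash as an error here; the doc
		// comment above only calls this case undefined behavior.
		return nil, fmt.Errorf("missing %d buckets", len(inputHashes)-cnt)
	}
	return target, nil
}

func DemoMerge() {
	old := []bucket{stubBucket{"a"}, stubBucket{"b"}}
	fresh := []bucket{stubBucket{"b"}, stubBucket{"c"}}
	merged, _ := mergeNewChunks([]string{"a", "b", "c"}, old, fresh)
	fmt.Println(len(merged)) // 3: "a" and "b" kept, "c" rebuilt
}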

View File

@@ -21,6 +21,7 @@ import (
"testing"
"github.com/dolthub/go-mysql-server/sql"
"github.com/dolthub/go-mysql-server/sql/stats"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -66,109 +67,109 @@ func TestBucketBuilder(t *testing.T) {
name: "ints",
keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}),
bucket: DoltBucket{
RowCount: 15,
DistinctCount: 5,
Mcvs: []sql.Row{{int64(4)}, {int64(2)}, {int64(3)}},
McvCount: []uint64{3, 4, 3},
UpperBound: sql.Row{int64(5)},
BoundCount: 2,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 5,
McvVals: []sql.Row{{int64(4)}, {int64(2)}, {int64(3)}},
McvsCnt: []uint64{3, 4, 3},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
},
{
// technically nulls should be at beginning
name: "ints with middle nulls",
keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {nil}, {nil}, {nil}, {3}, {4}, {4}, {4}, {5}, {5}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}),
bucket: DoltBucket{
RowCount: 16,
DistinctCount: 6,
NullCount: 3,
Mcvs: []sql.Row{{int64(4)}, {int64(2)}, {nil}},
McvCount: []uint64{3, 4, 3},
UpperBound: sql.Row{int64(5)},
BoundCount: 2,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 16,
DistinctCnt: 6,
NullCnt: 3,
McvVals: []sql.Row{{int64(4)}, {int64(2)}, {nil}},
McvsCnt: []uint64{3, 4, 3},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
},
{
name: "ints with beginning nulls",
keys: []sql.Row{{nil}, {nil}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}),
bucket: DoltBucket{
RowCount: 15,
DistinctCount: 6,
NullCount: 2,
Mcvs: []sql.Row{{int64(3)}, {int64(4)}, {int64(2)}},
McvCount: []uint64{3, 3, 4},
UpperBound: sql.Row{int64(5)},
BoundCount: 2,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 6,
NullCnt: 2,
McvVals: []sql.Row{{int64(3)}, {int64(4)}, {int64(2)}},
McvsCnt: []uint64{3, 3, 4},
BoundVal: sql.Row{int64(5)},
BoundCnt: 2,
}},
},
{
name: "more ints",
keys: []sql.Row{{1}, {1}, {1}, {2}, {2}, {2}, {2}, {3}, {3}, {3}, {4}, {4}, {4}, {5}, {5}, {5}, {5}, {6}, {6}, {6}, {6}, {7}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}),
bucket: DoltBucket{
RowCount: 22,
DistinctCount: 7,
BoundCount: 1,
Mcvs: []sql.Row{{int64(2)}, {int64(6)}, {int64(5)}},
McvCount: []uint64{4, 4, 4},
UpperBound: sql.Row{int64(7)},
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 22,
DistinctCnt: 7,
BoundCnt: 1,
McvVals: []sql.Row{{int64(2)}, {int64(6)}, {int64(5)}},
McvsCnt: []uint64{4, 4, 4},
BoundVal: sql.Row{int64(7)},
}},
},
{
name: "2-ints",
keys: []sql.Row{{1, 1}, {1, 1}, {1, 2}, {2, 1}, {2, 2}, {2, 3}, {2, 3}, {3, 1}, {3, 2}, {3, 3}, {4, 1}, {4, 1}, {4, 1}, {5, 1}, {5, 2}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: false}, val.Type{Enc: val.Int64Enc, Nullable: false}),
bucket: DoltBucket{
RowCount: 15,
DistinctCount: 11,
Mcvs: []sql.Row{{int64(1), int64(1)}, {int64(4), int64(1)}, {int64(2), int64(3)}},
McvCount: []uint64{2, 3, 2},
UpperBound: sql.Row{int64(5), int64(2)},
BoundCount: 1,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 11,
McvVals: []sql.Row{{int64(1), int64(1)}, {int64(4), int64(1)}, {int64(2), int64(3)}},
McvsCnt: []uint64{2, 3, 2},
BoundVal: sql.Row{int64(5), int64(2)},
BoundCnt: 1,
}},
},
{
name: "2-ints with nulls",
keys: []sql.Row{{nil, 1}, {1, nil}, {1, 2}, {2, nil}, {2, 2}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.Int64Enc, Nullable: true}, val.Type{Enc: val.Int64Enc, Nullable: true}),
bucket: DoltBucket{
RowCount: 5,
DistinctCount: 5,
NullCount: 3,
Mcvs: []sql.Row{{int64(2), int64(2)}, {int64(1), nil}, {int64(1), int64(2)}},
McvCount: []uint64{1, 1, 1},
UpperBound: sql.Row{int64(2), int64(2)},
BoundCount: 1,
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 5,
DistinctCnt: 5,
NullCnt: 3,
McvVals: []sql.Row{{int64(2), int64(2)}, {int64(1), nil}, {int64(1), int64(2)}},
McvsCnt: []uint64{1, 1, 1},
BoundVal: sql.Row{int64(2), int64(2)},
BoundCnt: 1},
},
},
{
name: "varchars",
keys: []sql.Row{{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, {"e"}, {"f"}, {"g"}, {"g"}, {"g"}, {"h"}, {"h"}, {"h"}, {"i"}, {"i"}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.StringEnc, Nullable: false}),
bucket: DoltBucket{
RowCount: 15,
DistinctCount: 9,
Mcvs: []sql.Row{{"i"}, {"h"}, {"g"}},
McvCount: []uint64{2, 3, 3},
UpperBound: sql.Row{"i"},
BoundCount: 2,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 9,
McvVals: []sql.Row{{"i"}, {"h"}, {"g"}},
McvsCnt: []uint64{2, 3, 3},
BoundVal: sql.Row{"i"},
BoundCnt: 2,
}},
},
{
name: "varchar-ints",
keys: []sql.Row{{"a", 1}, {"b", 1}, {"c", 1}, {"d", 1}, {"e", 1}, {"e", 2}, {"f", 1}, {"g", 1}, {"g", 2}, {"g", 2}, {"h", 1}, {"h", 1}, {"h", 2}, {"i", 1}, {"i", 1}},
keyDesc: val.NewTupleDescriptor(val.Type{Enc: val.StringEnc, Nullable: false}, val.Type{Enc: val.Int64Enc, Nullable: false}),
bucket: DoltBucket{
RowCount: 15,
DistinctCount: 12,
Mcvs: []sql.Row{{"i", int64(1)}, {"g", int64(2)}, {"h", int64(1)}},
McvCount: []uint64{2, 2, 2},
UpperBound: sql.Row{"i", int64(1)},
BoundCount: 2,
},
bucket: DoltBucket{Bucket: &stats.Bucket{
RowCnt: 15,
DistinctCnt: 12,
McvVals: []sql.Row{{"i", int64(1)}, {"g", int64(2)}, {"h", int64(1)}},
McvsCnt: []uint64{2, 2, 2},
BoundVal: sql.Row{"i", int64(1)},
BoundCnt: 2,
}},
},
}
@@ -190,13 +191,13 @@ func TestBucketBuilder(t *testing.T) {
bucket, err := b.finalize(ctx, nil)
require.NoError(t, err)
require.Equal(t, int(tt.bucket.RowCount), int(bucket.RowCount))
require.Equal(t, int(tt.bucket.NullCount), int(bucket.NullCount))
require.Equal(t, int(tt.bucket.DistinctCount), int(bucket.DistinctCount))
require.Equal(t, int(tt.bucket.BoundCount), int(bucket.BoundCount))
require.Equal(t, tt.bucket.UpperBound, bucket.UpperBound)
require.Equal(t, tt.bucket.McvCount, bucket.McvCount)
require.Equal(t, tt.bucket.Mcvs, bucket.Mcvs)
require.Equal(t, int(tt.bucket.RowCount()), int(bucket.RowCount()))
require.Equal(t, int(tt.bucket.NullCount()), int(bucket.NullCount()))
require.Equal(t, int(tt.bucket.DistinctCount()), int(bucket.DistinctCount()))
require.Equal(t, int(tt.bucket.BoundCount()), int(bucket.BoundCount()))
require.Equal(t, tt.bucket.UpperBound(), bucket.UpperBound())
require.Equal(t, tt.bucket.McvsCnt, bucket.McvsCnt)
require.Equal(t, tt.bucket.Mcvs(), bucket.Mcvs())
})
}
}

View File

@@ -168,7 +168,7 @@ type ChunkStoreGarbageCollector interface {
BeginGC(addChunk func(hash.Hash) bool) error
// EndGC indicates that the GC is over. The previously provided
// addChunk function must not be called after this function function.
// addChunk function must not be called after this function.
EndGC()
// MarkAndSweepChunks is expected to read chunk addresses off of

View File

@@ -100,7 +100,9 @@ type NomsBlockStore struct {
cond *sync.Cond
gcInProgress bool
keeperFunc func(hash.Hash) bool
// keeperFunc is set when |gcInProgress| and appends to the GC sweep queue
// or blocks on GC finalize
keeperFunc func(hash.Hash) bool
mtSize uint64
putCount uint64