Merge pull request #4617 from dolthub/andy/faster-table-index-lookup

go/store/nbs: Faster prefix lookup for table file indices
This commit is contained in:
AndyA
2022-10-25 17:21:14 -07:00
committed by GitHub
2 changed files with 56 additions and 7 deletions

View File

@@ -304,7 +304,7 @@ func (ti onHeapTableIndex) Lookup(h *addr) (indexEntry, bool, error) {
func (ti onHeapTableIndex) lookupOrdinal(h *addr) (uint32, error) {
prefix := h.Prefix()
for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixAt(idx) == prefix; idx++ {
for idx := ti.findPrefix(prefix); idx < ti.chunkCount && ti.prefixAt(idx) == prefix; idx++ {
m, err := ti.EntrySuffixMatches(idx, h)
if err != nil {
return ti.chunkCount, err
@@ -317,22 +317,24 @@ func (ti onHeapTableIndex) lookupOrdinal(h *addr) (uint32, error) {
return ti.chunkCount, nil
}
// prefixIdx returns the first position in |tr.prefixes| whose value ==
// |prefix|. Returns |tr.chunkCount| if absent
func (ti onHeapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
// findPrefix binary-searches the index's prefix tuples and returns the first
// position whose prefix is >= |prefix|, or |ti.chunkCount| when every stored
// prefix is smaller. The returned position is not guaranteed to hold |prefix|
// itself; callers confirm the match with prefixAt (as lookupOrdinal does).
// NOTE(review): assumes the tuples in |ti.tupleB| are sorted ascending by
// prefix, which the binary search requires — confirm against the index writer.
func (ti onHeapTableIndex) findPrefix(prefix uint64) (idx uint32) {
// Encode |prefix| big-endian so that bytes.Compare on the raw tuple bytes
// orders values the same way a numeric uint64 comparison would.
// NOTE(review): presumably the stored prefixes are big-endian too — confirm.
query := make([]byte, addrPrefixSize)
binary.BigEndian.PutUint64(query, prefix)
// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
// an extremely tight loop and inlining the code was a significant perf improvement.
idx, j := 0, ti.chunkCount
for idx < j {
h := idx + (j-idx)/2 // avoid overflow when computing h
// idx ≤ h < j
if ti.prefixAt(h) < prefix {
// o is the byte offset of tuple |h| within |ti.tupleB|; the first
// addrPrefixSize bytes at that offset are compared against |query|.
// NOTE(review): the product is computed before the int64 conversion; if
// prefixTupleSize is an untyped constant the multiply happens in h's
// 32-bit type and could wrap for very large indices — confirm.
o := int64(prefixTupleSize * h)
if bytes.Compare(ti.tupleB[o:o+addrPrefixSize], query) < 0 {
idx = h + 1 // tuple h is below the query: the answer lies to the right of h
} else {
j = h // tuple h is >= the query: h remains a candidate answer
}
}
return
}
@@ -475,7 +477,7 @@ func (ti onHeapTableIndex) ResolveShortHash(short []byte) ([]string, error) {
sPrefix := ti.padStringAndDecode(shortHash, "0")
// Binary Search for prefix
pIdxL = ti.prefixIdx(sPrefix)
pIdxL = ti.findPrefix(sPrefix)
// Prefix doesn't exist
if pIdxL == ti.chunkCount {

View File

@@ -50,6 +50,53 @@ func TestParseTableIndex(t *testing.T) {
}
}
// BenchmarkFindPrefix compares the previous prefix binary search (prefixIdx)
// against onHeapTableIndex.findPrefix, using the prefixes of a fixture table
// index as the lookup keys.
func BenchmarkFindPrefix(b *testing.B) {
	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
	require.NoError(b, err)
	defer f.Close()

	bs, err := io.ReadAll(f)
	require.NoError(b, err)

	idx, err := parseTableIndexByCopy(bs, &noopQuotaProvider{})
	require.NoError(b, err)
	defer idx.Close()

	// The fixture is expected to hold exactly this many chunks.
	const fixtureChunks = uint32(596)
	assert.Equal(b, fixtureChunks, idx.ChunkCount())

	prefixes, err := idx.Prefixes()
	require.NoError(b, err)

	// Cycle through the first 512 prefixes. The mask must be 511 (all low bits
	// set): masking with 512 tests a single bit, so the index would only ever
	// be 0 or 512 and each run would look up just two distinct prefixes.
	const prefixMask = uint(511)
	require.True(b, uint(len(prefixes)) > prefixMask)

	b.Run("benchmark prefixIdx()", func(b *testing.B) {
		var ord uint32
		for i := 0; i < b.N; i++ {
			ord = prefixIdx(idx, prefixes[uint(i)&prefixMask])
		}
		// Every key comes from the index, so the search must land inside it.
		assert.True(b, ord < fixtureChunks)
	})

	b.Run("benchmark findPrefix", func(b *testing.B) {
		var ord uint32
		for i := 0; i < b.N; i++ {
			ord = idx.findPrefix(prefixes[uint(i)&prefixMask])
		}
		// Every key comes from the index, so the search must land inside it.
		assert.True(b, ord < fixtureChunks)
	})
}
// prefixIdx is the previous implementation of findPrefix(), kept as the
// baseline for BenchmarkFindPrefix. It binary-searches |ti| through prefixAt
// and returns the first position whose prefix is not less than |prefix|, or
// |ti.chunkCount| when every prefix is smaller.
func prefixIdx(ti onHeapTableIndex, prefix uint64) (idx uint32) {
	// Hand-rolled equivalent of sort.Search over the half-open range [idx, hi);
	// the named result |idx| starts at its zero value and is the lower bound.
	hi := ti.chunkCount
	for idx < hi {
		mid := idx + (hi-idx)/2 // midpoint computed without overflowing
		if ti.prefixAt(mid) >= prefix {
			// mid is still a candidate answer: shrink the range from the right.
			hi = mid
			continue
		}
		// Everything up to and including mid is too small.
		idx = mid + 1
	}
	return idx
}
func TestMMapIndex(t *testing.T) {
f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
require.NoError(t, err)