Merge pull request #4617 from dolthub/andy/faster-table-index-lookup

go/store/nbs: Faster prefix lookup for table file indices
2026-04-30 11:31:37 -05:00 · 2022-10-25 17:21:14 -07:00
parent 2d87ce2962 141aa1c8c0
commit ee68a95bad
2 changed files with 56 additions and 7 deletions
@@ -304,7 +304,7 @@ func (ti onHeapTableIndex) Lookup(h *addr) (indexEntry, bool, error) {
 func (ti onHeapTableIndex) lookupOrdinal(h *addr) (uint32, error) {
 	prefix := h.Prefix()

-	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixAt(idx) == prefix; idx++ {
+	for idx := ti.findPrefix(prefix); idx < ti.chunkCount && ti.prefixAt(idx) == prefix; idx++ {
 		m, err := ti.EntrySuffixMatches(idx, h)
 		if err != nil {
 			return ti.chunkCount, err
@@ -317,22 +317,24 @@ func (ti onHeapTableIndex) lookupOrdinal(h *addr) (uint32, error) {
 	return ti.chunkCount, nil
 }

-// prefixIdx returns the first position in |tr.prefixes| whose value ==
-// |prefix|. Returns |tr.chunkCount| if absent
-func (ti onHeapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
+// findPrefix binary-searches |ti.tupleB| for the first position whose prefix is >= |prefix|.
+// Returns |ti.chunkCount| if every prefix is smaller; callers must still check for equality.
+func (ti onHeapTableIndex) findPrefix(prefix uint64) (idx uint32) {
+	query := make([]byte, addrPrefixSize)
+	binary.BigEndian.PutUint64(query, prefix) // big-endian bytes so |prefix| can be byte-compared below
 	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
 	// an extremely tight loop and inlining the code was a significant perf improvement.
 	idx, j := 0, ti.chunkCount
 	for idx < j {
 		h := idx + (j-idx)/2 // avoid overflow when computing h
 		// i ≤ h < j
-		if ti.prefixAt(h) < prefix {
+		o := int64(prefixTupleSize * h) // byte offset of tuple h; NOTE(review): product is formed before widening to int64 — confirm it cannot overflow
+		if bytes.Compare(ti.tupleB[o:o+addrPrefixSize], query) < 0 {
 			idx = h + 1 // preserves f(i-1) == false
 		} else {
 			j = h // preserves f(j) == true
 		}
 	}
-
 	return
 }

@@ -475,7 +477,7 @@ func (ti onHeapTableIndex) ResolveShortHash(short []byte) ([]string, error) {
 		sPrefix := ti.padStringAndDecode(shortHash, "0")

 		// Binary Search for prefix
-		pIdxL = ti.prefixIdx(sPrefix)
+		pIdxL = ti.findPrefix(sPrefix)

 		// Prefix doesn't exist
 		if pIdxL == ti.chunkCount {
@@ -50,6 +50,53 @@ func TestParseTableIndex(t *testing.T) {
 	}
 }

+// BenchmarkFindPrefix compares findPrefix against the previous prefixIdx
+// implementation, looking up prefixes read from a checked-in table file index.
+func BenchmarkFindPrefix(b *testing.B) {
+	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
+	require.NoError(b, err)
+	defer f.Close()
+	bs, err := io.ReadAll(f)
+	require.NoError(b, err)
+	idx, err := parseTableIndexByCopy(bs, &noopQuotaProvider{})
+	require.NoError(b, err)
+	defer idx.Close()
+	assert.Equal(b, uint32(596), idx.ChunkCount())
+
+	prefixes, err := idx.Prefixes()
+	require.NoError(b, err)
+
+	b.Run("benchmark prefixIdx()", func(b *testing.B) {
+		var ord uint32
+		for i := 0; i < b.N; i++ {
+			// Cycle through every stored prefix. Masking with 512 (a single
+			// bit) only ever produced index 0 or 512.
+			ord = prefixIdx(idx, prefixes[i%len(prefixes)])
+		}
+		// Every queried prefix exists in the index, so the result is in range.
+		assert.True(b, ord < 596)
+	})
+	b.Run("benchmark findPrefix", func(b *testing.B) {
+		var ord uint32
+		for i := 0; i < b.N; i++ {
+			// Same access pattern as the baseline so the two runs are comparable.
+			ord = idx.findPrefix(prefixes[i%len(prefixes)])
+		}
+		assert.True(b, ord < 596)
+	})
+}
+
+// prefixIdx is the previous implementation of findPrefix(), kept as a benchmark baseline.
+func prefixIdx(ti onHeapTableIndex, prefix uint64) (idx uint32) {
+	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
+	// an extremely tight loop and inlining the code was a significant perf improvement.
+	idx, j := 0, ti.chunkCount
+	for idx < j {
+		h := idx + (j-idx)/2 // avoid overflow when computing h
+		// idx ≤ h < j; below, f(x) means ti.prefixAt(x) >= prefix
+		if ti.prefixAt(h) < prefix {
+			idx = h + 1 // preserves f(idx-1) == false
+		} else {
+			j = h // preserves f(j) == true
+		}
+	}
+	return
+}
+
 func TestMMapIndex(t *testing.T) {
 	f, err := os.Open("testdata/0oa7mch34jg1rvghrnhr4shrp2fm4ftd.idx")
 	require.NoError(t, err)