mirror of
https://github.com/dolthub/dolt.git
synced 2026-04-27 07:26:31 -05:00
ddebdcaefd
The json serialization now only contains the length of each individual
blob child.
The go representation of this still uses offsets but the offsets are
for the end delimiter.
For "hi" "bye" we get
{"cb", [{"ref": "sha1-hi"}, 2, {"ref": "sha1-bye"}, 3]}
compoundBlob{[2, 5], [sha1-hi, sha1-bye]}
Keeping the length in the serialization leads to smaller serializations
Using the end offset leads to simpler binary search and allows us to
use the last entry as the length.
Issue #17
93 lines
1.7 KiB
Go
93 lines
1.7 KiB
Go
package types
|
|
|
|
import (
|
|
"bytes"
|
|
"io"
|
|
|
|
"github.com/attic-labs/buzhash"
|
|
"github.com/attic-labs/noms/ref"
|
|
)
|
|
|
|
const (
	// pattern is the chunk-boundary mask for the rolling hash: a boundary is
	// declared when the low bits of the hash are all ones (see copyChunk).
	// The number of mask bits controls the expected chunk size:
	// 12 bits leads to an average size of 4k
	// 13 bits leads to an average size of 8k
	// 14 bits leads to an average size of 16k
	pattern = uint32(1<<13 - 1)

	// The window size to use for computing the rolling hash.
	windowSize = 64
)
|
|
|
|
// Blob is a Value that holds an arbitrary sequence of bytes.
type Blob interface {
	Value
	// Len returns the total number of bytes in the blob.
	Len() uint64
	// BUG 155 - Should provide Seek and Write... Maybe even have Blob implement ReadWriteSeeker
	// Reader returns a ReadSeeker over the blob's bytes.
	Reader() io.ReadSeeker
}
|
|
|
|
func NewBlob(r io.Reader) (Blob, error) {
|
|
length := uint64(0)
|
|
offsets := []uint64{}
|
|
blobs := []Future{}
|
|
var blob blobLeaf
|
|
for {
|
|
buf := bytes.Buffer{}
|
|
n, err := copyChunk(&buf, r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if n == 0 {
|
|
// Don't add empty chunk.
|
|
break
|
|
}
|
|
|
|
length += n
|
|
offsets = append(offsets, length)
|
|
blob = newBlobLeaf(buf.Bytes())
|
|
blobs = append(blobs, futureFromValue(blob))
|
|
}
|
|
|
|
if length == 0 {
|
|
return newBlobLeaf([]byte{}), nil
|
|
}
|
|
|
|
if len(blobs) == 1 {
|
|
return blob, nil
|
|
}
|
|
return compoundBlob{offsets, blobs, &ref.Ref{}, nil}, nil
|
|
}
|
|
|
|
func BlobFromVal(v Value) Blob {
|
|
return v.(Blob)
|
|
}
|
|
|
|
// copyChunk copies from src to dst until a chunk boundary is found.
|
|
// It returns the number of bytes copied and the earliest error encountered while copying.
|
|
// copyChunk never returns an io.EOF error, instead it returns the number of bytes read up to the io.EOF.
|
|
func copyChunk(dst io.Writer, src io.Reader) (n uint64, err error) {
|
|
h := buzhash.NewBuzHash(windowSize)
|
|
p := []byte{0}
|
|
|
|
for {
|
|
_, err = src.Read(p)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
return n, nil
|
|
}
|
|
return
|
|
}
|
|
|
|
h.Write(p)
|
|
_, err = dst.Write(p)
|
|
if err != nil {
|
|
return
|
|
}
|
|
n++
|
|
|
|
if h.Sum32()&pattern == pattern {
|
|
return
|
|
}
|
|
}
|
|
}
|