From 362a5630d95fa1e893c642ac4b8280aebedbe5e8 Mon Sep 17 00:00:00 2001 From: Aaron Boodman Date: Tue, 27 Sep 2016 10:50:37 -0700 Subject: [PATCH] Add photo-index: a simple photo indexer. For now only indexes by tag. (#2610) Add photo-index: a simple photo indexer. For now only indexes by tag. Will add indexing by face/geo in subsequent patches. --- go/marshal/decode.go | 5 +- go/marshal/decode_test.go | 1 + go/types/type_cache.go | 33 +++++++ go/types/type_cache_test.go | 16 ++++ samples/go/photo-index/main.go | 144 ++++++++++++++++++++++++++++ samples/go/photo-index/main_test.go | 99 +++++++++++++++++++ 6 files changed, 297 insertions(+), 1 deletion(-) create mode 100644 samples/go/photo-index/main.go create mode 100644 samples/go/photo-index/main_test.go diff --git a/go/marshal/decode.go b/go/marshal/decode.go index 20b9ce6dbf..40ae541dab 100644 --- a/go/marshal/decode.go +++ b/go/marshal/decode.go @@ -235,7 +235,10 @@ func structDecoder(t reflect.Type) decoderFunc { } d = func(v types.Value, rv reflect.Value) { - s := v.(types.Struct) + s, ok := v.(types.Struct) + if !ok { + panic(&UnmarshalTypeMismatchError{v, rv.Type(), ", expected struct"}) + } // If the name is empty then the Go struct has to be anonymous. if !strings.EqualFold(s.Type().Desc.(types.StructDesc).Name, name) { panic(&UnmarshalTypeMismatchError{v, rv.Type(), ", names do not match"}) diff --git a/go/marshal/decode_test.go b/go/marshal/decode_test.go index ffd16f9396..1da78c3648 100644 --- a/go/marshal/decode_test.go +++ b/go/marshal/decode_test.go @@ -221,6 +221,7 @@ func TestDecodeTypeMismatch(t *testing.T) { X int } var s S + assertDecodeErrorMessage(t, types.String("hi!"), &s, "Cannot unmarshal String into Go value of type marshal.S, expected struct") assertDecodeErrorMessage(t, types.NewStruct("S", types.StructData{ "x": types.String("hi"), }), &s, "Cannot unmarshal String into Go value of type int") diff --git a/go/types/type_cache.go b/go/types/type_cache.go index 08bdfbe963..8111da0d64 100644 --- a/go/types/type_cache.go +++ b/go/types/type_cache.go @@ -383,6 +383,39 @@ func MakeMapType(keyType, valType *Type) *Type { return staticTypeCache.getCompoundType(MapKind, keyType, valType) } +type fieldSorter struct { + names []string + types []*Type +} + +func (fs *fieldSorter) Len() int { + return len(fs.names) +} + +func (fs *fieldSorter) Swap(i, j int) { + fs.names[i], fs.names[j] = fs.names[j], fs.names[i] + fs.types[i], fs.types[j] = fs.types[j], fs.types[i] +} + +func (fs *fieldSorter) Less(i, j int) bool { + return fs.names[i] < fs.names[j] +} + +type FieldMap map[string]*Type + +func MakeStructTypeFromFields(name string, fields FieldMap) *Type { + // I'm the computer + names := make([]string, 0, len(fields)) + types := make([]*Type, 0, len(fields)) + for k, v := range fields { + names = append(names, k) + types = append(types, v) + } + fs := fieldSorter{names, types} + sort.Sort(&fs) + return MakeStructType(name, names, types) +} + func MakeStructType(name string, fieldNames []string, fieldTypes []*Type) *Type { staticTypeCache.Lock() defer staticTypeCache.Unlock() diff --git a/go/types/type_cache_test.go b/go/types/type_cache_test.go index 6c3a493dd3..d66e9fabc7 100644 --- a/go/types/type_cache_test.go +++ b/go/types/type_cache_test.go @@ -238,3 +238,19 @@ func TestInvalidCyclesAndUnions(t *testing.T) { []*Type{MakeStructType("A", []string{"a"}, []*Type{MakeCycleType(1)})}) }) } + +func TestMakeStructTypeFromFields(t *testing.T) { + assert := assert.New(t) + fields := map[string]*Type{ + "str": StringType, + "number": NumberType, + "bool": BoolType, + } + desc := MakeStructTypeFromFields("Thing", fields).Desc.(StructDesc) + assert.Equal("Thing", desc.Name) + assert.Equal(3, desc.Len()) + for k, v := range fields { + f := desc.Field(k) + assert.True(v == f) + } +} diff --git a/samples/go/photo-index/main.go b/samples/go/photo-index/main.go new file mode 100644 index 0000000000..66a15d50b7 --- /dev/null +++ b/samples/go/photo-index/main.go @@ -0,0 +1,144 @@ +// Copyright 2016 Attic Labs, Inc. All rights reserved. +// Licensed under the Apache License, version 2.0: +// http://www.apache.org/licenses/LICENSE-2.0 + +package main + +import ( + "fmt" + "os" + "path" + + "github.com/attic-labs/noms/go/datas" + "github.com/attic-labs/noms/go/spec" + "github.com/attic-labs/noms/go/types" + "github.com/attic-labs/noms/go/walk" + flag "github.com/juju/gnuflag" +) + +func main() { + if !index() { + os.Exit(1) + } +} + +func index() (win bool) { + var dbStr = flag.String("db", "", "input database spec") + var outDSStr = flag.String("out-ds", "", "output dataset to write to - if empty, defaults to input dataset") + var parallelism = flag.Int("parallelism", 16, "number of parallel goroutines to search") + + flag.Usage = usage + flag.Parse(false) + + if flag.NArg() == 0 { + flag.Usage() + return + } + + if flag.NArg() == 0 { + fmt.Fprintln(os.Stderr, "Need at least one dataset to index") + return + } + + db, err := spec.GetDatabase(*dbStr) + if err != nil { + fmt.Fprintf(os.Stderr, "Invalid input database '%s': %s\n", flag.Arg(0), err) + return + } + defer db.Close() + + var outDS datas.Dataset + if !datas.DatasetFullRe.MatchString(*outDSStr) { + fmt.Fprintf(os.Stderr, "Invalid output dataset name: %s\n", *outDSStr) + return + } else { + outDS = db.GetDataset(*outDSStr) + } + + inputs := []types.Value{} + for i := 0; i < flag.NArg(); i++ { + p, err := spec.NewAbsolutePath(flag.Arg(i)) + if err != nil { + fmt.Fprintf(os.Stderr, "Invalid input path '%s', error: %s\n", flag.Arg(i), err) + return + } + + v := p.Resolve(db) + if v == nil { + fmt.Fprintf(os.Stderr, "Input path '%s' does not exist in '%s'", flag.Arg(i), *dbStr) + return + } + + inputs = append(inputs, v) + continue + } + + sizeType := types.MakeStructTypeFromFields("", types.FieldMap{ + "width": types.NumberType, + "height": types.NumberType, + }) + dateType := types.MakeStructTypeFromFields("Date", types.FieldMap{ + "nsSinceEpoch": types.NumberType, + }) + fields := types.FieldMap{ + "sizes": types.MakeMapType(sizeType, types.StringType), + "tags": types.MakeSetType(types.StringType), + "title": types.StringType, + "datePublished": dateType, + "dateUpdated": dateType, + } + photoType := types.MakeStructTypeFromFields("Photo", fields) + fields["dateTaken"] = dateType + photoType = types.MakeUnionType(photoType, types.MakeStructTypeFromFields("Photo", fields)) + + byDate := types.NewGraphBuilder(db, types.MapKind, true) + byTag := types.NewGraphBuilder(db, types.MapKind, true) + + for _, v := range inputs { + walk.SomeP(v, db, func(cv types.Value, _ *types.Ref) (stop bool) { + if types.IsSubtype(photoType, cv.Type()) { + s := cv.(types.Struct) + // Prefer to sort by the actual date the photo was taken, but if it's not + // available, use the date it was published instead. + ds, ok := s.MaybeGet("dateTaken") + if !ok { + ds = s.Get("datePublished") + } + + // Sort by most recent by negating the timestamp. + d := ds.(types.Struct).Get("nsSinceEpoch").(types.Number) + d = types.Number(-float64(d)) + + byDate.SetInsert([]types.Value{d}, cv) + s.Get("tags").(types.Set).IterAll(func(t types.Value) { + byTag.SetInsert([]types.Value{t, d}, cv) + }) + // Can't be any photos inside photos, so we can save a little bit here. + stop = true + } + return + }, *parallelism) + } + + outDS, err = db.CommitValue(outDS, types.NewStruct("", types.StructData{ + "byDate": byDate.Build(), + "byTag": byTag.Build(), + })) + if err != nil { + fmt.Fprintf(os.Stderr, "Could not commit: %s\n", err) + return + } + + win = true + return +} + +func usage() { + fmt.Fprintf(os.Stderr, "photo-index indexes photos by common attributes\n\n") + fmt.Fprintf(os.Stderr, "Usage: %s -db= -out-ds= [input-paths...]\n\n", path.Base(os.Args[0])) + fmt.Fprintf(os.Stderr, " : Database to work with\n") + fmt.Fprintf(os.Stderr, " : Dataset to write index to\n") + fmt.Fprintf(os.Stderr, " [input-paths...] : One or more paths within to crawl\n\n") + fmt.Fprintln(os.Stderr, "Flags:\n") + flag.PrintDefaults() +} diff --git a/samples/go/photo-index/main_test.go b/samples/go/photo-index/main_test.go new file mode 100644 index 0000000000..bcd948f350 --- /dev/null +++ b/samples/go/photo-index/main_test.go @@ -0,0 +1,99 @@ +// Copyright 2016 Attic Labs, Inc. All rights reserved. +// Licensed under the Apache License, version 2.0: +// http://www.apache.org/licenses/LICENSE-2.0 + +package main + +import ( + "fmt" + "testing" + + "github.com/attic-labs/noms/go/marshal" + "github.com/attic-labs/noms/go/spec" + "github.com/attic-labs/noms/go/types" + "github.com/attic-labs/noms/go/util/clienttest" + "github.com/attic-labs/testify/suite" +) + +func TestBasics(t *testing.T) { + suite.Run(t, &testSuite{}) +} + +type testSuite struct { + clienttest.ClientTestSuite +} + +func (s *testSuite) TestWin() { + sp := fmt.Sprintf("ldb:%s::test", s.LdbDir) + db, ds, _ := spec.GetDataset(sp) + + type Date struct { + NsSinceEpoch int + } + + type Photo struct { + Title string + Tags types.Set + Sizes map[struct { + Width int + Height int + }]string + DateTaken Date + DatePublished Date + DateUpdated Date + } + + getTags := func(n int) types.Set { + s := types.NewSet() + for i := 0; i < n; i++ { + s = s.Insert(types.String(fmt.Sprintf("tag%d", i))) + } + return s + } + + getPhoto := func(n int) Photo { + return Photo{ + Title: fmt.Sprintf("photo %d", n), + Tags: getTags(n), + Sizes: map[struct{ Width, Height int }]string{ + {100, 100}: "100.jpg"}, + DateTaken: Date{n * 10}, + DatePublished: Date{n*10 + 1}, + DateUpdated: Date{n*10 + 2}, + } + } + + photos := []Photo{} + for i := 0; i < 5; i++ { + photos = append(photos, getPhoto(i)) + } + + v, err := marshal.Marshal(photos) + s.NoError(err) + ds, err = db.CommitValue(ds, v) + s.NoError(err) + db.Close() + + _, _ = s.MustRun(main, []string{"--out-ds", "idx", "--db", s.LdbDir, "test"}) + + db, ds, _ = spec.GetDataset(fmt.Sprintf("%s::idx", s.LdbDir)) + var idx struct { + ByDate map[int]types.Set + ByTag map[string]map[int]types.Set + } + marshal.Unmarshal(ds.HeadValue(), &idx) + + s.Equal(5, len(idx.ByDate)) + for i := 0; i < 5; i++ { + s.Equal(uint64(1), idx.ByDate[-i*10].Len()) + p := idx.ByDate[-i*10].First().(types.Struct) + s.Equal(fmt.Sprintf("photo %d", i), string(p.Get("title").(types.String))) + } + + s.Equal(4, len(idx.ByTag)) + for i := 1; i < 5; i++ { + k := fmt.Sprintf("tag%d", i) + v := idx.ByTag[k] + s.Equal(4-i, len(v)) + } +}