From 8d4ff605f5b22cb40ed7808e8d278885e24715bb Mon Sep 17 00:00:00 2001 From: Aaron Boodman Date: Tue, 6 Dec 2016 16:21:02 -0800 Subject: [PATCH] photo-index: output PhotoGroup instead of Photo (#2902) photo-index: output PhotoGroup instead of Photo (#2902) --- samples/go/photo-date-dedup/main.go | 21 ++-- samples/go/photo-date-dedup/main_test.go | 47 +++++--- samples/go/photo-index/main.go | 140 ++++++++++------------- samples/go/photo-index/main_test.go | 37 ++++-- 4 files changed, 133 insertions(+), 112 deletions(-) diff --git a/samples/go/photo-date-dedup/main.go b/samples/go/photo-date-dedup/main.go index 90c54e1d67..7a1b9c767a 100644 --- a/samples/go/photo-date-dedup/main.go +++ b/samples/go/photo-date-dedup/main.go @@ -30,7 +30,7 @@ type Date struct { type Photo struct { Id string - DateTaken Date + DateTaken Date `noms:",omitempty"` } type PhotoGroup struct { @@ -124,11 +124,9 @@ func buildDateIndex(db types.ValueReadWriter, inputs []types.Value) types.Map { var p Photo if err := marshal.Unmarshal(cv, &p); err == nil { stop = true - if p.DateTaken.NsSinceEpoch != 0 { - indexBuilder.SetInsert( - []types.Value{types.Number(float64(p.DateTaken.NsSinceEpoch))}, - cv) - } + indexBuilder.SetInsert( + []types.Value{types.Number(float64(p.DateTaken.NsSinceEpoch))}, + cv) } return }) @@ -151,7 +149,7 @@ func buildGroups(db types.ValueReadWriter, thresh int, byDate types.Map) types.L } flush := func() { - if group != nil && group.Photos.Len() > 0 { + if group != nil { v, err := marshal.Marshal(*group) d.Chk.NoError(err) vals <- v @@ -163,10 +161,17 @@ func buildGroups(db types.ValueReadWriter, thresh int, byDate types.Map) types.L byDate.IterAll(func(key, s types.Value) { s.(types.Set).IterAll(func(val types.Value) { dt := float64(key.(types.Number)) - if (dt - lastTime) > float64(thresh*1e6) { + if dt == 0 { + // If date is not known, then the photo is in its own group + flush() + startGroup(val) + flush() + } else if (dt - lastTime) > float64(thresh*1e6) { + // Otherwise, if we've surpassed the threshold, start a new group flush() startGroup(val) } else { + // Otherwise, add to the existing group group.Photos = group.Photos.Insert(val) } lastTime = dt diff --git a/samples/go/photo-date-dedup/main_test.go b/samples/go/photo-date-dedup/main_test.go index 83b6057faa..9140227765 100644 --- a/samples/go/photo-date-dedup/main_test.go +++ b/samples/go/photo-date-dedup/main_test.go @@ -57,21 +57,28 @@ func (s *testSuite) TestBasic() { DateTaken: Date{NsSinceEpoch: float64(55 * 1e9)}, }), - // No dupes, so it doen't end up in a group + // No dupes marshal.MustMarshal(Photo{ Id: "48", DateTaken: Date{NsSinceEpoch: float64(61 * 1e9)}, }), - // Zero date taken, so it doesn't end up in a group + // If the DateTaken is zero, it should end up in its own group marshal.MustMarshal(Photo{ Id: "49", DateTaken: Date{NsSinceEpoch: float64(0)}, }), + marshal.MustMarshal(Photo{ + Id: "50", + DateTaken: Date{NsSinceEpoch: float64(0)}, + }), - // No date taken, so it doens't end up in a group + // If the DateTaken is not present, it should end up in its own group types.NewStruct("Photo", types.StructData{ - "Id": types.String("50"), + "id": types.String("51"), + }), + types.NewStruct("Photo", types.StructData{ + "id": types.String("52"), }), ) @@ -90,15 +97,29 @@ func (s *testSuite) TestBasic() { err = marshal.Unmarshal(sp.GetDataset().HeadValue(), &result) s.NoError(err) - s.Equal(2, len(result.Groups)) + expectedGroups := map[string]map[string]bool{ + "44": map[string]bool{"45": true, "43": true, "42": true}, + "46": map[string]bool{"47": true}, + "48": nil, + "49": nil, + "50": nil, + "51": nil, + "52": nil, + } - s.Equal("44", result.Groups[0].Cover.Id) - s.Equal(3, len(result.Groups[0].Photos)) - s.Equal("45", result.Groups[0].Photos[0].Id) - s.Equal("43", result.Groups[0].Photos[1].Id) - s.Equal("42", result.Groups[0].Photos[2].Id) + for _, g := range result.Groups { + exp, ok := expectedGroups[g.Cover.Id] + s.True(ok, "Group cover %s not expected", g.Cover.Id) + for _, p := range g.Photos { + if _, ok = exp[p.Id]; ok { + delete(exp, p.Id) + } else { + s.Fail("Photo %s not expected in group %s", p.Id, g.Cover.Id) + } + } + s.Equal(0, len(exp), "Some expected photos not found in group %s: %+v", g.Cover.Id, exp) + delete(expectedGroups, g.Cover.Id) + } - s.Equal("46", result.Groups[1].Cover.Id) - s.Equal(1, len(result.Groups[1].Photos)) - s.Equal("47", result.Groups[1].Photos[0].Id) + s.Equal(0, len(expectedGroups), "Some expected groups not found in result: %+v", expectedGroups) } diff --git a/samples/go/photo-index/main.go b/samples/go/photo-index/main.go index 84311de75d..846e7891b6 100644 --- a/samples/go/photo-index/main.go +++ b/samples/go/photo-index/main.go @@ -9,13 +9,11 @@ import ( "math" "os" "path" - "sync" "time" "github.com/attic-labs/noms/go/config" "github.com/attic-labs/noms/go/d" "github.com/attic-labs/noms/go/datas" - "github.com/attic-labs/noms/go/hash" "github.com/attic-labs/noms/go/marshal" "github.com/attic-labs/noms/go/spec" "github.com/attic-labs/noms/go/types" @@ -32,23 +30,39 @@ func main() { } type Photo struct { - Id string - Sizes map[struct{ Width, Height int }]string + Id string + Sizes map[struct{ Width, Height int }]string + DateTaken Date `noms:",omitempty"` + DatePublished Date `noms:",omitempty"` + DateUpdated Date `noms:",omitempty"` + Tags []string `noms:",omitempty"` + Sources []string `noms:",omitempty"` + Original types.Struct `noms:",original"` + Faces []struct { + Name string + X, Y, W, H float32 + } `noms:",omitempty"` } type PhotoGroup struct { - Cover Photo - Photos types.Set + Id string + Cover Photo + Photos []Photo + Original types.Struct `noms:",original"` } type Date struct { NsSinceEpoch float64 } +func (d Date) IsEmpty() bool { + return d.NsSinceEpoch == 0 +} + func index() (win bool) { var dbStr = flag.String("db", "", "input database spec") - var groupsStr = flag.String("groups", "", "path within db to look for PhotoGroup structs") var outDSStr = flag.String("out-ds", "", "output dataset to write to - if empty, defaults to input dataset") + var indexCovers = flag.Bool("index-covers", false, "the resulting index will contain only the cover Photo, not the entire PhotoGroup") verbose.RegisterVerboseFlags(flag.CommandLine) flag.Usage = usage @@ -86,101 +100,65 @@ func index() (win bool) { faceCounts := map[types.String]int{} sourceCounts := map[types.String]int{} tagCounts := map[types.String]int{} - countsMtx := sync.Mutex{} - addToIndex := func(p Photo, cv types.Value) { - d := math.MaxFloat64 - var dt struct{ DateTaken Date } - var dp struct{ DatePublished Date } - var du struct{ DateUpdated Date } - if err := marshal.Unmarshal(cv, &dt); err == nil { - d = -dt.DateTaken.NsSinceEpoch - } else if err := marshal.Unmarshal(cv, &dp); err == nil { - d = -dp.DatePublished.NsSinceEpoch - } else if err := marshal.Unmarshal(cv, &du); err == nil { - d = -du.DateUpdated.NsSinceEpoch + addToIndex := func(gb *types.GraphBuilder, path []types.Value, pg PhotoGroup) { + if *indexCovers { + gb.SetInsert(path, pg.Cover.Original) + } else { + gb.SetInsert(path, pg.Original) } + } + + addToIndexes := func(pg PhotoGroup) { + d := math.MaxFloat64 + if !pg.Cover.DateTaken.IsEmpty() { + d = pg.Cover.DateTaken.NsSinceEpoch + } else if !pg.Cover.DatePublished.IsEmpty() { + d = pg.Cover.DatePublished.NsSinceEpoch + } else if !pg.Cover.DateUpdated.IsEmpty() { + d = pg.Cover.DateUpdated.NsSinceEpoch + } + d = -d // Index by date - byDate.SetInsert([]types.Value{types.Number(d)}, cv) + addToIndex(byDate, []types.Value{types.Number(d)}, pg) + + allPhotos := []Photo{pg.Cover} + if !*indexCovers { + allPhotos = append(allPhotos, pg.Photos...) + } // Index by tag, then date - moreTags := map[types.String]int{} - var wt struct{ Tags []string } - if err = marshal.Unmarshal(cv, &wt); err == nil { - for _, t := range wt.Tags { - byTag.SetInsert([]types.Value{types.String(t), types.Number(d)}, cv) - moreTags[types.String(t)]++ + for _, p := range allPhotos { + for _, t := range p.Tags { + addToIndex(byTag, []types.Value{types.String(t), types.Number(d)}, pg) + tagCounts[types.String(t)]++ } } // Index by face, then date - moreFaces := map[types.String]int{} - var wf struct { - Faces []struct { - Name string - X, Y, W, H float32 - } - } - if err = marshal.Unmarshal(cv, &wf); err == nil { - for _, f := range wf.Faces { - byFace.SetInsert([]types.Value{types.String(f.Name), types.Number(d)}, cv) - moreFaces[types.String(f.Name)]++ + for _, p := range allPhotos { + for _, f := range p.Faces { + addToIndex(byFace, []types.Value{types.String(f.Name), types.Number(d)}, pg) + faceCounts[types.String(f.Name)]++ } } // Index by source, then date - moreSources := map[types.String]int{} - var ws struct { - Sources []string - } - if err = marshal.Unmarshal(cv, &ws); err == nil { - for _, s := range ws.Sources { - bySource.SetInsert([]types.Value{types.String(s), types.Number(d)}, cv) + for _, p := range allPhotos { + for _, s := range p.Sources { + addToIndex(bySource, []types.Value{types.String(s), types.Number(d)}, pg) + sourceCounts[types.String(s)]++ } } - - countsMtx.Lock() - for tag, count := range moreTags { - tagCounts[tag] += count - } - for face, count := range moreFaces { - faceCounts[face] += count - } - for source, count := range moreSources { - sourceCounts[source] += count - } - countsMtx.Unlock() - } - - groups := []types.Value{} - inGroups := map[hash.Hash]struct{}{} - if *groupsStr != "" { - groups, err = spec.ReadAbsolutePaths(db, *groupsStr) - d.CheckErrorNoUsage(err) - walk.WalkValues(groups[0], db, func(cv types.Value) (stop bool) { - var pg PhotoGroup - if err := marshal.Unmarshal(cv, &pg); err == nil { - stop = true - // TODO: Don't need to do this second arg separately when decoder can catch full value. - addToIndex(pg.Cover, cv.(types.Struct).Get("cover")) - inGroups[cv.(types.Struct).Get("cover").Hash()] = struct{}{} - pg.Photos.IterAll(func(cv types.Value) { - inGroups[cv.Hash()] = struct{}{} - }) - } - return - }) } for _, v := range inputs { walk.WalkValues(v, db, func(cv types.Value) (stop bool) { - var p Photo - if _, ok := inGroups[cv.Hash()]; ok { + var pg PhotoGroup + if err := marshal.Unmarshal(cv, &pg); err == nil { stop = true - } else if err := marshal.Unmarshal(cv, &p); err == nil { - stop = true - addToIndex(p, cv) + addToIndexes(pg) } return }) diff --git a/samples/go/photo-index/main_test.go b/samples/go/photo-index/main_test.go index ec9c6e2041..3573643a23 100644 --- a/samples/go/photo-index/main_test.go +++ b/samples/go/photo-index/main_test.go @@ -24,10 +24,6 @@ type testSuite struct { } func (s *testSuite) TestWin() { - sp, err := spec.ForDataset(fmt.Sprintf("ldb:%s::test", s.LdbDir)) - s.NoError(err) - defer sp.Close() - type Face struct { Name string X, Y, W, H int @@ -51,6 +47,12 @@ func (s *testSuite) TestWin() { DateUpdated Date } + type PhotoGroup struct { + Id string + Cover Photo + Photos []Photo + } + getTags := func(n int) types.Set { s := types.NewSet() for i := 0; i < n; i++ { @@ -86,12 +88,23 @@ func (s *testSuite) TestWin() { } } - photos := []Photo{} - for i := 0; i < 5; i++ { - photos = append(photos, getPhoto(i)) + getPhotoGroup := func(n int) PhotoGroup { + return PhotoGroup{ + Id: fmt.Sprintf("pg%d", n), + Cover: getPhoto(n), + } } - v, err := marshal.Marshal(photos) + groups := []PhotoGroup{} + for i := 0; i < 5; i++ { + groups = append(groups, getPhotoGroup(i)) + } + + sp, err := spec.ForDataset(fmt.Sprintf("ldb:%s::test", s.LdbDir)) + s.NoError(err) + defer sp.Close() + + v, err := marshal.Marshal(groups) s.NoError(err) _, err = sp.GetDatabase().CommitValue(sp.GetDataset(), v) s.NoError(err) @@ -113,8 +126,12 @@ func (s *testSuite) TestWin() { s.Equal(5, len(idx.ByDate)) for i := 0; i < 5; i++ { - s.Equal(uint64(1), idx.ByDate[-i*10].Len()) - p := idx.ByDate[-i*10].First().(types.Struct) + k := -i * 10 + if k == 0 { + k = -1 + } + s.Equal(uint64(1), idx.ByDate[k].Len()) + p := idx.ByDate[k].First().(types.Struct).Get("cover").(types.Struct) s.Equal(fmt.Sprintf("photo %d", i), string(p.Get("title").(types.String))) }