mirror of
https://github.com/dolthub/dolt.git
synced 2026-05-05 11:21:58 -05:00
Add jobs for grouping similar photos in PhotoGroups (#2789)
* Add jobs for grouping similar photos in PhotoGroups Outline: - The first job, photo-dhash, adds a dhash field to each photo. The dhash is a 128-bit downsampled representation of the photo that works well for visual similarity comparisons. - The second job, photo-dedup, groups photos that have similar dhashes into PhotoGroups. fixes: #2787
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
|
||||
"github.com/attic-labs/noms/go/config"
|
||||
"github.com/attic-labs/noms/go/datas"
|
||||
"github.com/attic-labs/noms/go/spec"
|
||||
"github.com/attic-labs/noms/go/util/exit"
|
||||
"github.com/attic-labs/noms/go/util/verbose"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/job"
|
||||
|
||||
flag "github.com/juju/gnuflag"
|
||||
)
|
||||
|
||||
func usage() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s -db=<db-spec> -out-ds=<name> <input-paths...>\n\n", path.Base(os.Args[0]))
|
||||
fmt.Fprintf(os.Stderr, "Groups photos into PhotoGroups based on similarity determined by their dhashes\n\n")
|
||||
fmt.Fprintf(os.Stderr, " <db> : Database to work with\n")
|
||||
fmt.Fprintf(os.Stderr, " <out-ds> : Dataset to write photos groups to\n")
|
||||
fmt.Fprintf(os.Stderr, " <input-paths...> : One or more input paths within <db-spec>\n\n")
|
||||
fmt.Fprintln(os.Stderr, "Flags:\n")
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
func main() {
|
||||
var dbStr = flag.String("db", "", "input database spec")
|
||||
var outDSStr = flag.String("out-ds", "", "output dataset to write to")
|
||||
var threshold = flag.Int("threshold", 10, "photo's whose dhash distance is < threshold are grouped together")
|
||||
verbose.RegisterVerboseFlags(flag.CommandLine)
|
||||
|
||||
flag.Usage = usage
|
||||
flag.Parse(false)
|
||||
|
||||
if flag.NArg() == 0 {
|
||||
flag.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
cfg := config.NewResolver()
|
||||
db, err := cfg.GetDatabase(*dbStr)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Invalid input database '%s': %s\n", flag.Arg(0), err)
|
||||
return
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
var outDS datas.Dataset
|
||||
if !datas.IsValidDatasetName(*outDSStr) {
|
||||
fmt.Fprintf(os.Stderr, "Invalid output dataset name: %s\n", *outDSStr)
|
||||
return
|
||||
} else {
|
||||
outDS = db.GetDataset(*outDSStr)
|
||||
}
|
||||
|
||||
inputs, err := spec.ReadAbsolutePaths(db, flag.Args()...)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
err = job.DeduplicateJob(db, inputs, outDS, *threshold)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s\n", err)
|
||||
exit.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
|
||||
"github.com/attic-labs/noms/go/config"
|
||||
"github.com/attic-labs/noms/go/datas"
|
||||
"github.com/attic-labs/noms/go/spec"
|
||||
"github.com/attic-labs/noms/go/util/exit"
|
||||
"github.com/attic-labs/noms/go/util/verbose"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/job"
|
||||
|
||||
flag "github.com/juju/gnuflag"
|
||||
)
|
||||
|
||||
func usage() {
|
||||
fmt.Fprintf(os.Stderr, "Usage: %s -db=<db-spec> -out-ds=<name> <input-paths...>\n\n", path.Base(os.Args[0]))
|
||||
fmt.Fprintf(os.Stderr, "Annotates each Photo in input-paths with a dhash field\n\n")
|
||||
fmt.Fprintf(os.Stderr, " <input-paths...> : One or more input paths within <db-spec>\n")
|
||||
fmt.Fprintf(os.Stderr, " <db> : Database to work with\n")
|
||||
fmt.Fprintf(os.Stderr, " <out-ds> : Dataset to write photos groups to\n")
|
||||
fmt.Fprintf(os.Stderr, " <input-paths...> : One or more input paths within <db-spec>\n\n")
|
||||
|
||||
flag.PrintDefaults()
|
||||
}
|
||||
|
||||
func main() {
|
||||
var dbStr = flag.String("db", "", "input database spec")
|
||||
var outDSStr = flag.String("out-ds", "", "output dataset to write to")
|
||||
verbose.RegisterVerboseFlags(flag.CommandLine)
|
||||
|
||||
flag.Usage = usage
|
||||
flag.Parse(false)
|
||||
|
||||
if flag.NArg() == 0 {
|
||||
flag.Usage()
|
||||
return
|
||||
}
|
||||
|
||||
cfg := config.NewResolver()
|
||||
db, err := cfg.GetDatabase(*dbStr)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Invalid input database '%s': %s\n", flag.Arg(0), err)
|
||||
return
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
var outDS datas.Dataset
|
||||
if !datas.IsValidDatasetName(*outDSStr) {
|
||||
fmt.Fprintf(os.Stderr, "Invalid output dataset name: %s\n", *outDSStr)
|
||||
return
|
||||
} else {
|
||||
outDS = db.GetDataset(*outDSStr)
|
||||
}
|
||||
|
||||
inputs, err := spec.ReadAbsolutePaths(db, flag.Args()...)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
if err = job.HashPhotosJob(db, inputs, outDS); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s\n", err)
|
||||
exit.Fail()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
Copyright (c) 2015, Andy Balholm
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
// Derived from https://github.com/andybalholm/dhash
|
||||
// Copyright (c) 2015, Andy Balholm
|
||||
// All rights reserved.
|
||||
|
||||
package dhash
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// A Hash is a 128-bit perceptual hash.
type Hash [2]uint64

// NilHash is the zero Hash. It is also what New returns for an empty image.
var NilHash = Hash{}

// String returns h as 32 lowercase hexadecimal digits: h[0] followed by h[1],
// each zero-padded to 16 digits.
func (h Hash) String() string {
	return fmt.Sprintf("%016x%016x", h[0], h[1])
}

// notHexDigit reports whether r is not an ASCII hexadecimal digit.
func notHexDigit(r rune) bool {
	switch {
	case '0' <= r && r <= '9', 'a' <= r && r <= 'f', 'A' <= r && r <= 'F':
		return false
	}
	return true
}

// Parse takes the string representation of a Hash, and returns the
// corresponding Hash value.
//
// s must be exactly 32 hexadecimal digits. On any error the returned Hash is
// NilHash, never a partially parsed value.
func Parse(s string) (h Hash, err error) {
	if len(s) != 32 {
		err = fmt.Errorf("wrong length for dhash value (%d characters; should be 32)", len(s))
		return
	}
	// Fscanf skips leading whitespace and ends a field at the first non-hex
	// rune, so malformed input could otherwise scan "successfully".
	if i := strings.IndexFunc(s, notHexDigit); i >= 0 {
		err = fmt.Errorf("invalid hex digit at offset %d in dhash value %q", i, s)
		return
	}
	var parsed Hash
	if _, err = fmt.Fscanf(strings.NewReader(s), "%016x%016x", &parsed[0], &parsed[1]); err != nil {
		return NilHash, err
	}
	return parsed, nil
}

// New returns a hash of img. The algorithm is the difference hash from
// http://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html.
//
// The image is reduced to a 9x9 grid of mean brightness values. Bit i*8+j of
// the first word is set when block (i, j) is brighter than block (i, j+1);
// the same bit of the second word is set when it is brighter than block
// (i+1, j). Here i indexes the grid along x and j along y.
//
// An image with an empty bounding rectangle hashes to NilHash.
func New(img image.Image) Hash {
	const gridSize = 9

	bounds := img.Bounds()
	width := bounds.Dx()
	height := bounds.Dy()
	if width <= 0 || height <= 0 {
		// Nothing to sample; also avoids indexing an empty YCbCr plane.
		return NilHash
	}

	// Calculate the mean brightness of each block in a 9x9 grid.
	var blocks [gridSize][gridSize]int
	for i := 0; i < gridSize; i++ {
		left := bounds.Min.X + (width * i / gridSize)
		right := bounds.Min.X + (width * (i + 1) / gridSize)
		if right == left {
			// Images narrower than the grid: sample at least one column.
			right = left + 1
		}
		for j := 0; j < gridSize; j++ {
			top := bounds.Min.Y + (height * j / gridSize)
			bottom := bounds.Min.Y + (height * (j + 1) / gridSize)
			if bottom == top {
				// Images shorter than the grid: sample at least one row.
				bottom = top + 1
			}

			var total int64
			switch img := img.(type) {
			case *image.YCbCr:
				// Fast path: read the luma plane directly. YOffset accounts
				// for Rect.Min, so sub-images with a non-zero origin are
				// indexed correctly.
				for y := top; y < bottom; y++ {
					for x := left; x < right; x++ {
						total += int64(img.Y[img.YOffset(x, y)])
					}
				}
			default:
				for x := left; x < right; x++ {
					for y := top; y < bottom; y++ {
						r, g, b, _ := img.At(x, y).RGBA()
						// Integer brightness approximation: (3r + 4g + b) / 8.
						total += int64(r+r+r+b+g+g+g+g) >> 3
					}
				}
			}
			blocks[i][j] = int(total / int64((right-left)*(bottom-top)))
		}
	}

	var result Hash
	for i := 0; i < gridSize-1; i++ {
		for j := 0; j < gridSize-1; j++ {
			bit := uint64(1) << uint(i*8+j)
			if blocks[i][j] > blocks[i][j+1] {
				result[0] |= bit
			}
			if blocks[i][j] > blocks[i+1][j] {
				result[1] |= bit
			}
		}
	}
	return result
}

// Distance returns the number of bits different between two Hash values.
func Distance(h1, h2 Hash) int {
	return bitCount(h1[0]^h2[0]) + bitCount(h1[1]^h2[1])
}

// bitCount returns the number of nonzero bits in w.
// (Copied from https://github.com/andybalholm/go-bit/blob/master/funcs.go)
func bitCount(w uint64) int {
	// “Software Optimization Guide for AMD64 Processors”, Section 8.6.
	const maxw = 1<<64 - 1
	const bpw = 64

	// Compute the count for each 2-bit group.
	// Example using 16-bit word w = 00,01,10,11,00,01,10,11
	// w - (w>>1) & 01,01,01,01,01,01,01,01 = 00,01,01,10,00,01,01,10
	w -= (w >> 1) & (maxw / 3)

	// Add the count of adjacent 2-bit groups and store in 4-bit groups:
	// w & 0011,0011,0011,0011 + w>>2 & 0011,0011,0011,0011 = 0001,0011,0001,0011
	w = w&(maxw/15*3) + (w>>2)&(maxw/15*3)

	// Add the count of adjacent 4-bit groups and store in 8-bit groups:
	// (w + w>>4) & 00001111,00001111 = 00000100,00000100
	w += w >> 4
	w &= maxw / 255 * 15

	// Add all 8-bit counts with a multiplication and a shift:
	// (w * 00000001,00000001) >> 8 = 00001000
	w *= maxw / 255
	w >>= (bpw/8 - 1) * 8
	return int(w)
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package job
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"runtime"
|
||||
|
||||
"github.com/attic-labs/noms/go/datas"
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/go/util/status"
|
||||
"github.com/attic-labs/noms/go/walk"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/model"
|
||||
)
|
||||
|
||||
var grouper *photoGrouper
|
||||
|
||||
// DeduplicateJob reads Set<Photo>'s (annotated with dhash) and writes Set<PhotoGroup> to
|
||||
// outDS where each group contains all duplicates.
|
||||
func DeduplicateJob(db datas.Database, photoSets []types.Value, outDS datas.Dataset, similarityThreshold int) error {
|
||||
return commitPhotoGroups(db, outDS, groupPhotos(db, photoSets, similarityThreshold))
|
||||
}
|
||||
|
||||
// groupPhotos reads Set<Photo>'s and sorts them into groups containing all photos that
|
||||
// are deemed duplicates by comparing dhash's.
|
||||
func groupPhotos(db datas.Database, photoSets []types.Value, threshold int) <-chan types.Struct {
|
||||
grouper = newPhotoGrouper(threshold)
|
||||
for _, set := range photoSets {
|
||||
walk.WalkValues(set, db, func(cv types.Value) (stop bool) {
|
||||
if photo, ok := model.UnmarshalPhoto(cv); ok {
|
||||
grouper.insertPhoto(photo)
|
||||
}
|
||||
return false
|
||||
})
|
||||
}
|
||||
grouped := make(chan types.Struct, runtime.NumCPU()*4)
|
||||
go func() {
|
||||
defer close(grouped)
|
||||
grouper.iterGroups(func(pg *model.PhotoGroup) {
|
||||
grouped <- pg.Marshal()
|
||||
})
|
||||
}()
|
||||
return grouped
|
||||
}
|
||||
|
||||
// commitPhotoGroups commits the new groups to ds
|
||||
func commitPhotoGroups(db datas.Database, ds datas.Dataset, groups <-chan types.Struct) error {
|
||||
newSet := types.NewGraphBuilder(db, types.SetKind, true)
|
||||
for group := range groups {
|
||||
newSet.SetInsert(nil, group)
|
||||
}
|
||||
status.Done()
|
||||
fmt.Printf("\nCommitting %d PhotoGroups\n", grouper.photoCount)
|
||||
commit := newSet.Build()
|
||||
meta := model.NewCommitMeta().Marshal()
|
||||
_, err := db.Commit(ds, commit, datas.CommitOptions{Meta: meta})
|
||||
return err
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package job
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
"github.com/attic-labs/noms/go/util/status"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/dhash"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/model"
|
||||
)
|
||||
|
||||
// photoGrouper is a data structure used to group similar photos into PhotoGroups
|
||||
//
|
||||
// The current implementation is a simple map. Photo inserts are O(n^2).
|
||||
// TODO: Replace the map with VP/MVP tree (https://en.wikipedia.org/wiki/Vantage-point_tree).
|
||||
type photoGrouper struct {
|
||||
groups map[model.ID]*group
|
||||
threshold int
|
||||
photoCount int
|
||||
duplicateCount int
|
||||
}
|
||||
|
||||
type group struct {
|
||||
id model.ID
|
||||
dhash dhash.Hash
|
||||
photos map[*model.Photo]bool
|
||||
}
|
||||
|
||||
func newPhotoGrouper(threshold int) *photoGrouper {
|
||||
return &photoGrouper{make(map[model.ID]*group), threshold, 0, 0}
|
||||
}
|
||||
|
||||
func newGroup(photo *model.Photo) *group {
|
||||
photos := map[*model.Photo]bool{photo: true}
|
||||
return &group{model.NewAtticID(), photo.Dhash, photos}
|
||||
}
|
||||
|
||||
func (g *photoGrouper) insertGroup(pg *group) {
|
||||
}
|
||||
|
||||
// insertPhoto places the photo into an existing group if there is one that contains
|
||||
// duplicate photos. Otherwise it creates a new group.
|
||||
//
|
||||
// The current implementation is a brute force n^2 comparision. A more efficient
|
||||
// implementation would be to build an VP/MVP tree. A VP tree is a binary search
|
||||
// tree that works in a geometric space. Each node defines a center point and a
|
||||
// radius. Dhashes within the radius can be found to the left; those outside the
|
||||
// radius can be found to the right. An MVP is the k-tree equivalent.
|
||||
func (g *photoGrouper) insertPhoto(photo *model.Photo) {
|
||||
for _, group := range g.groups {
|
||||
if group.dhash != dhash.NilHash {
|
||||
if dhash.Distance(photo.Dhash, group.dhash) < g.threshold {
|
||||
if _, ok := group.photos[photo]; !ok {
|
||||
group.photos[photo] = true
|
||||
g.duplicateCount++
|
||||
g.photoCount++
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
status.Printf("Grouping - %d duplicates found in %d photos", g.duplicateCount, g.photoCount)
|
||||
new := newGroup(photo)
|
||||
g.groups[new.id] = new
|
||||
g.photoCount++
|
||||
}
|
||||
|
||||
// iterGroups iterator through all the photo groups
|
||||
func (g *photoGrouper) iterGroups(cb func(pg *model.PhotoGroup)) {
|
||||
for _, group := range g.groups {
|
||||
cover, rest := pickCover(group)
|
||||
cb(model.NewPhotoGroup(group.id, group.dhash, cover, rest))
|
||||
}
|
||||
}
|
||||
|
||||
// Dumb implementation for now. Ultimately, there should be another job that picks the best photo.
|
||||
func pickCover(pg *group) (*model.Photo, map[*model.Photo]bool) {
|
||||
d.Chk.True(len(pg.photos) > 0)
|
||||
var cover *model.Photo
|
||||
rest := map[*model.Photo]bool{}
|
||||
i := 0
|
||||
for p, _ := range pg.photos {
|
||||
if i == 0 {
|
||||
cover = p
|
||||
} else {
|
||||
rest[p] = true
|
||||
}
|
||||
i += 1
|
||||
}
|
||||
return cover, rest
|
||||
}
|
||||
@@ -0,0 +1,141 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package job
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"image"
|
||||
_ "image/gif"
|
||||
_ "image/jpeg"
|
||||
_ "image/png"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"runtime"
|
||||
"sync"
|
||||
|
||||
"github.com/attic-labs/noms/go/datas"
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/go/util/status"
|
||||
"github.com/attic-labs/noms/go/walk"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/dhash"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/model"
|
||||
)
|
||||
|
||||
// HashPhotosJob adds a dhash field to every photo in photoSets and commits them to
|
||||
// a new Photo set in outDS. The dhash is used in turn by the DeduplicatePhotosJob
|
||||
// to group similar photos into PhotoGroups.
|
||||
func HashPhotosJob(db datas.Database, photoSets []types.Value, outDS datas.Dataset) error {
|
||||
return commitHashedPhotos(db, outDS, hashPhotos(db, photoSets))
|
||||
}
|
||||
|
||||
// hashPhotos adds a dhash to each photo in photoSets and delivers them on the returned channel
|
||||
func hashPhotos(db datas.Database, photoSets []types.Value) <-chan types.Struct {
|
||||
numWorkers := runtime.NumCPU() * 4
|
||||
toHash := make(chan *model.Photo, numWorkers)
|
||||
hashed := make(chan types.Struct, numWorkers)
|
||||
|
||||
go func() {
|
||||
for _, set := range photoSets {
|
||||
walk.WalkValues(set, db, func(cv types.Value) (stop bool) {
|
||||
if photo, ok := model.UnmarshalPhoto(cv); ok {
|
||||
toHash <- photo
|
||||
}
|
||||
return false
|
||||
})
|
||||
}
|
||||
close(toHash)
|
||||
}()
|
||||
|
||||
fmt.Print("Downloading and hashing photos...\n")
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(numWorkers)
|
||||
for i := 0; i < numWorkers; i++ {
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
for photo := range toHash {
|
||||
withHash, err := addHashToPhoto(photo)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "\nSkipping: %s\n", err)
|
||||
hashed <- photo.Marshal()
|
||||
} else {
|
||||
hashed <- withHash.Marshal()
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(hashed)
|
||||
}()
|
||||
return hashed
|
||||
}
|
||||
|
||||
func addHashToPhoto(photo *model.Photo) (*model.Photo, error) {
|
||||
url := pickBestImage(photo)
|
||||
if url == "" {
|
||||
return nil, fmt.Errorf("No URL found for photo %s", photo.Id)
|
||||
}
|
||||
res, err := http.Get(string(url))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
img, _, err := image.Decode(res.Body)
|
||||
if err != nil {
|
||||
if err == image.ErrFormat {
|
||||
err = fmt.Errorf("%s: unknown format", url)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
photo.Dhash = dhash.New(img)
|
||||
return photo, nil
|
||||
}
|
||||
|
||||
// pickBestImage returns the image URL corresponding to the size
|
||||
// closest to but not below the ideal size. If there is no image
|
||||
// large enough, it returns the next smallest image.
|
||||
func pickBestImage(photo *model.Photo) string {
|
||||
idealSize := 240 * 240
|
||||
|
||||
closestURL := ""
|
||||
closestDist := int(math.MaxInt64)
|
||||
|
||||
closestBelowURL := ""
|
||||
closestBelowDist := int(-math.MaxInt64)
|
||||
|
||||
photo.IterSizes(func(w int, h int, url string) {
|
||||
dist := w*h - idealSize
|
||||
if dist >= 0 && dist < closestDist {
|
||||
closestURL = url
|
||||
closestDist = dist
|
||||
} else if dist < 0 && dist > closestBelowDist {
|
||||
closestBelowURL = url
|
||||
closestBelowDist = dist
|
||||
}
|
||||
})
|
||||
if closestDist < int(math.MaxInt64) {
|
||||
return closestURL
|
||||
}
|
||||
return closestBelowURL
|
||||
}
|
||||
|
||||
// commitHashedPhotos reads the annotated photos off the hashPhotos channel and commits
|
||||
// them to the new ds
|
||||
func commitHashedPhotos(db datas.Database, ds datas.Dataset, hashedPhotos <-chan types.Struct) error {
|
||||
newSet := types.NewGraphBuilder(db, types.SetKind, true)
|
||||
count := 0
|
||||
for photo := range hashedPhotos {
|
||||
count += 1
|
||||
status.Printf("Hashing - %d photos processed", count)
|
||||
newSet.SetInsert(nil, photo)
|
||||
}
|
||||
status.Done()
|
||||
fmt.Printf("Committing %d hashed Photos\n", count)
|
||||
commit := newSet.Build()
|
||||
meta := model.NewCommitMeta().Marshal()
|
||||
_, err := db.Commit(ds, commit, datas.CommitOptions{Meta: meta})
|
||||
return err
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package model
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/attic-labs/noms/go/marshal"
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
)
|
||||
|
||||
type CommitMeta struct {
|
||||
Date string
|
||||
}
|
||||
|
||||
func NewCommitMeta() CommitMeta {
|
||||
return CommitMeta{
|
||||
time.Now().Format(time.RFC3339),
|
||||
}
|
||||
}
|
||||
|
||||
func (c CommitMeta) Marshal() types.Struct {
|
||||
v, err := marshal.Marshal(c)
|
||||
d.Chk.NoError(err)
|
||||
return v.(types.Struct)
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package model
|
||||
|
||||
// field names the struct fields this package works with. Each member holds
// the field name spelled exactly like the member itself.
var field = struct {
	cover  string
	dhash  string
	height string
	id     string
	photos string
	sizes  string
	title  string
	width  string
}{
	cover:  "cover",
	dhash:  "dhash",
	height: "height",
	id:     "id",
	photos: "photos",
	sizes:  "sizes",
	title:  "title",
	width:  "width",
}
|
||||
@@ -0,0 +1,16 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
package model
|
||||
|
||||
import "github.com/satori/go.uuid"
|
||||
|
||||
type ID string
|
||||
|
||||
func NewAtticID() ID {
|
||||
return ID(uuid.NewV4().String())
|
||||
}
|
||||
|
||||
func (id ID) String() string {
|
||||
return string(id)
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package model_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/model"
|
||||
"github.com/attic-labs/testify/assert"
|
||||
)
|
||||
|
||||
func TestCommitMeta(t *testing.T) {
|
||||
commit := model.NewCommitMeta()
|
||||
strct := commit.Marshal()
|
||||
v := strct.Get("date").(types.String)
|
||||
date, err := time.Parse(time.RFC3339, string(v))
|
||||
assert.NoError(t, err)
|
||||
assert.True(t, date.Before(time.Now()))
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
"github.com/attic-labs/noms/go/marshal"
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/dhash"
|
||||
)
|
||||
|
||||
type Photo struct {
|
||||
Id ID
|
||||
Sizes map[struct {
|
||||
Width int
|
||||
Height int
|
||||
}]string
|
||||
Dhash dhash.Hash `noms:"-"` // TODO: replace with optional field support
|
||||
Orig types.Struct `noms:"-"` // TODO: replace with value preservation support
|
||||
}
|
||||
|
||||
func UnmarshalPhoto(value types.Value) (*Photo, bool) {
|
||||
d.Chk.NotNil(value)
|
||||
p := Photo{}
|
||||
err := marshal.Unmarshal(value, &p)
|
||||
if err != nil {
|
||||
if _, ok := err.(*marshal.UnmarshalTypeMismatchError); ok {
|
||||
return nil, false
|
||||
}
|
||||
d.Chk.NoError(err)
|
||||
}
|
||||
s := value.(types.Struct)
|
||||
if dv, ok := s.MaybeGet("dhash"); ok {
|
||||
p.Dhash, err = dhash.Parse(string(dv.(types.String)))
|
||||
}
|
||||
p.Orig = s
|
||||
return &p, true
|
||||
}
|
||||
|
||||
func (p *Photo) IterSizes(cb func(width int, height int, url string)) {
|
||||
for k, v := range p.Sizes {
|
||||
cb(k.Width, k.Height, v)
|
||||
}
|
||||
}
|
||||
|
||||
// This can be replaced with marshal.Marshal when value preservation is implemented
|
||||
func (p *Photo) Marshal() types.Struct {
|
||||
nomsV, err := marshal.Marshal(*p)
|
||||
d.Chk.NoError(err)
|
||||
nomsS := nomsV.(types.Struct)
|
||||
final := p.Orig
|
||||
nomsS.Type().Desc.(types.StructDesc).IterFields(func(name string, t *types.Type) {
|
||||
v := nomsS.Get(name)
|
||||
final = final.Set(name, v)
|
||||
})
|
||||
final = final.Set("dhash", types.String(p.Dhash.String()))
|
||||
return final
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
// Copyright 2016 Attic Labs, Inc. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
package model
|
||||
|
||||
import (
|
||||
"github.com/attic-labs/noms/go/d"
|
||||
"github.com/attic-labs/noms/go/marshal"
|
||||
"github.com/attic-labs/noms/go/types"
|
||||
"github.com/attic-labs/noms/samples/go/photo-dedup/dhash"
|
||||
)
|
||||
|
||||
type PhotoGroup struct {
|
||||
ID ID `noms:"id"`
|
||||
Dhash dhash.Hash
|
||||
Cover *Photo `noms:"-"` // ignore tag required until ref support in marshalling
|
||||
Photos map[*Photo]bool `noms:"-"` // ignore tag required until ref support in marshalling
|
||||
}
|
||||
|
||||
func NewPhotoGroup(id ID, dhash dhash.Hash, cover *Photo, photos map[*Photo]bool) *PhotoGroup {
|
||||
return &PhotoGroup{id, dhash, cover, photos}
|
||||
}
|
||||
|
||||
// TODO: replace with simple marshalling when ref support is implemented
|
||||
func (pg *PhotoGroup) Marshal() types.Struct {
|
||||
v, err := marshal.Marshal(*pg)
|
||||
d.Chk.NoError(err)
|
||||
s := v.(types.Struct)
|
||||
s = s.Set("cover", pg.Cover.Marshal())
|
||||
refs := []types.Value{}
|
||||
for p, _ := range pg.Photos {
|
||||
refs = append(refs, p.Marshal())
|
||||
}
|
||||
s = s.Set("photos", types.NewSet(refs...))
|
||||
return s
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user