diff --git a/samples/go/nomdex/nomdex_find.go b/samples/go/nomdex/nomdex_find.go index 9e2e4e5806..c524e31d70 100644 --- a/samples/go/nomdex/nomdex_find.go +++ b/samples/go/nomdex/nomdex_find.go @@ -18,42 +18,54 @@ import ( flag "github.com/juju/gnuflag" ) -var longHelp = `Find retrieves and prints objects that satisfy the 'query' argument. +var longFindHelp = `'nomdex find' retrieves and prints objects that satisfy the 'query' argument. -Indexes are built using the 'nomdex up' command. Once built, the indexes can be referenced -in the 'query' arg to select objects matching certain criteria. For example, if there are -objects in the database that contain a personId and a gender field, 'nomdex up' can scan all -the objects in a given dataset and build an index on the specified field with the following -commands: - nomdex up --by gender --in-path --out-ds gender-index - nomdex up --by personId --in-path --out-ds personId-index +Indexes are built using the 'nomdex up' command. For information about building +indexes, see: nomdex up -h -Once these indexes are built, objects can be retrieved quickly and efficiently using the -nomdex query language. For example, the followign query could be used to find all people -with with a personId between 1 and 2000 and who are female: - nomdex find '(personId >= 0 and personId <= 2000) and gender = "female" +Objects that have been indexed can be quickly found using the nomdex query +language. 
For example, consider objects with the following type: -The next command would retrieve all people objects that were either male or had an personId -greater than 2000: - nomdex find 'gender = "male" or personId > 2000' +struct Person { + name String, + geopos struct GeoPos { + latitude Number, + longitude Number, + } +} +Objects of this type can be indexed on the name, latitude and longitude fields +with the following commands: + nomdex up --in-path ~/nomsdb::people.value --by .name --out-ds by-name + nomdex up --in-path ~/nomsdb::people.value --by .geopos.latitude --out-ds by-lat + nomdex up --in-path ~/nomsdb::people.value --by .geopos.longitude --out-ds by-lng + +The following query could be used to find all people with an address near the +equator: + nomdex find 'by-lat >= -1.0 and by-lat <= 1.0' + +We could also get a list of all people who live near the equator whose name begins with "A": + nomdex find '(by-name >= "A" and by-name < "B") and (by-lat >= -1.0 and by-lat <= 1.0)' + The query language is simple. It currently supports the following relational operators: <, <=, >, >=, =, != Relational expressions are always of the form: e.g. personId >= 2000. + +Indexes are the name given by the --out-ds argument in the 'nomdex up' command. +Constants are either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5, +3.147, etc). -Indexes are the name given by the --out-ds argument in the 'nomdex up' command. Constants are -either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5, 3.147, etc). - -Relational expressions can be combined using the "and" and "or" operators. Parentheses can -be used to ensure that the evaluation is done in the desired order. +Relational expressions can be combined using the "and" and "or" operators. +Parentheses can (and should) be used to ensure that the evaluation is done in +the desired order. 
` var find = &util.Command{ Run: runFind, UsageLine: "find --db ", Short: "Print objects in index that satisfy 'query'", - Long: longHelp, + Long: longFindHelp, Flags: setupFindFlags, Nargs: 1, } diff --git a/samples/go/nomdex/nomdex_test.go b/samples/go/nomdex/nomdex_test.go index 5887216992..e379664429 100644 --- a/samples/go/nomdex/nomdex_test.go +++ b/samples/go/nomdex/nomdex_test.go @@ -5,6 +5,7 @@ package main import ( + "regexp" "testing" "github.com/attic-labs/noms/go/chunks" @@ -12,6 +13,7 @@ import ( "github.com/attic-labs/noms/go/marshal" "github.com/attic-labs/noms/go/spec" "github.com/attic-labs/noms/go/util/clienttest" + "github.com/attic-labs/testify/assert" "github.com/attic-labs/testify/suite" ) @@ -113,3 +115,36 @@ func (s *testSuite) TestNomdex() { s.Contains(stdout, "Found 23 objects") s.Equal("", stderr) } + +func TestTransform(t *testing.T) { + assert := assert.New(t) + + tcs := [][]string{ + []string{`"01/02/2003"`, "\"(\\d{2})/(\\d{2})/(\\d{4})\"", "$3/$2/$1", "2003/02/01"}, + } + + for _, tc := range tcs { + base, regex, replace, expected := tc[0], tc[1], tc[2], tc[3] + + testRe := regexp.MustCompile(regex) + result := testRe.ReplaceAllString(base, replace) + assert.Equal(expected, result) + } + + tcs = [][]string{ + []string{"343 STATE ST\nROCHESTER, NY 14650\n(43.161276, -77.619386)", "43.161276", "-77.619386"}, + []string{"TWO EMBARCADERO CENTER\nPROMENADE LEVEL SAN FRANCISCO, CA 94111\n", "", ""}, + } + + findLatRe := regexp.MustCompile("(?s)\\(([\\d.]+)") + findLngRe := regexp.MustCompile("(?s)(-?[\\d.]+)\\)") + for _, tc := range tcs { + base, expectedLat, expectedLng := tc[0], tc[1], tc[2] + + lat := findLatRe.FindStringSubmatch(base) + assert.True(len(lat) == 0 && expectedLat == "" || (len(lat) == 2 && expectedLat == lat[1])) + + lng := findLngRe.FindStringSubmatch(base) + assert.True(len(lng) == 0 && expectedLng == "" || (len(lng) == 2 && expectedLng == lng[1])) + } +} diff --git a/samples/go/nomdex/nomdex_update.go 
b/samples/go/nomdex/nomdex_update.go index 0c2e61d043..e03e83c26b 100644 --- a/samples/go/nomdex/nomdex_update.go +++ b/samples/go/nomdex/nomdex_update.go @@ -7,7 +7,10 @@ package main import ( "fmt" "os" + "regexp" + "strconv" "sync" + "sync/atomic" "github.com/attic-labs/noms/cmd/util" "github.com/attic-labs/noms/go/config" @@ -23,16 +26,63 @@ import ( ) var ( - inPathArg = "" - outDsArg = "" - relPathArg = "" + inPathArg = "" + outDsArg = "" + relPathArg = "" + txRegexArg = "" + txReplaceArg = "" + txConvertArg = "" ) +var longUpHelp = `'nomdex up' builds indexes that are useful for rapidly accessing objects. + +This sample tool can index objects based on any string or number attribute of that +object. The 'up' command works by scanning all the objects reachable from the --in-path +command line argument. It tests the object to determine if there is a string or number +value reachable by applying the --by path argument to the object. If so, the object is +added to the index under that value. + +For example, if there are objects in the database that contain a personId and a +gender field, 'nomdex up' can scan all the objects in a given dataset and build +an index on each of those fields with the following commands: + nomdex up --in-path ~/nomsdb::people.value --by .gender --out-ds gender-index + nomdex up --in-path ~/nomsdb::people.value --by .personId --out-ds personId-index + +The previous commands can be understood as follows. Each command updates or +builds an index by scanning all the objects that are reachable from |in-path| that +have a string or number value reachable using |by| and stores the root of the +resulting index in a dataset specified by |out-ds|. + +Notice that the --in-path argument has a value of '~/nomsdb::people.value'. The '.value' +is not strictly necessary but it's normally useful when indexing. Since datasets +generally point to Commit objects in Noms, they usually have parents which are +previous versions of the data. 
If you add .value to the end of the dataset, only +the most recent version of the data will be indexed. Without the '.value' all +objects in all previous commits will also be indexed which is most often not what +is expected. + +There are three additional arguments that can be useful for transforming the value +being indexed: + * tx-replace: used to modify behavior of tx-regex, see below + * tx-regex: the behavior for this argument depends on whether a tx-replace argument + is present. If so, the Go method "Regexp.ReplaceAllString()" is called: + txRe := regexp.MustCompile(|tx-regex|) + txRe.ReplaceAllString(|index value|, |tx-replace|) + If tx-replace is not present then the following call is made on each value and the last submatch becomes the index value: + txRe := regexp.MustCompile(|tx-regex|) + txRe.FindStringSubmatch(|index value|) + * tx-convert: attempts to convert the index value to the type specified. + Currently the only value accepted for this arg is 'number'. + +The resulting indexes can be used by the 'nomdex find' command. For help on that, +see: nomdex find -h +` + var update = &util.Command{ Run: runUpdate, UsageLine: "up --in-path --out-ds --by ", Short: "Build/Update an index", - Long: "Traverse all values starting at root and add values found at 'relativePath' to a map found at 'out-ds'\n", + Long: longUpHelp, Flags: setupUpdateFlags, Nargs: 0, } @@ -42,6 +92,9 @@ func setupUpdateFlags() *flag.FlagSet { flagSet.StringVar(&inPathArg, "in-path", "", "a value to search for items to index within ") flagSet.StringVar(&outDsArg, "out-ds", "", "name of dataset to save the results to") flagSet.StringVar(&relPathArg, "by", "", "a path relative to all the items in to index by") + flagSet.StringVar(&txRegexArg, "tx-regex", "", "perform a string transformation on value before putting it in index") + flagSet.StringVar(&txReplaceArg, "tx-replace", "", "replace values matched by tx-regex") + flagSet.StringVar(&txConvertArg, "tx-convert", "", "convert the result of a tx regex/replace to this type (only does 
'number' currently)") verbose.RegisterVerboseFlags(flagSet) profile.RegisterProfileFlags(flagSet) return flagSet @@ -55,9 +108,10 @@ type StreamingSetEntry struct { type IndexMap map[types.Value]StreamingSetEntry type Index struct { - m IndexMap - cnt int64 - mutex sync.Mutex + m IndexMap + indexedCnt int64 + seenCnt int64 + mutex sync.Mutex } func runUpdate(args []string) int { @@ -102,6 +156,13 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb typeCacheMutex := sync.Mutex{} typeCache := map[*types.Type]bool{} + var txRe *regexp.Regexp + if txRegexArg != "" { + var err error + txRe, err = regexp.Compile(txRegexArg) + d.CheckError(err) + } + index := Index{m: IndexMap{}} walk.AllP(rootObject, db, func(v types.Value, r *types.Ref) { typ := v.Type() @@ -112,7 +173,7 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb pathResolved := false tv := relPath.Resolve(v) if tv != nil { - index.addToGraphBuilder(gb, tv, v) + index.addToGraphBuilder(gb, tv, v, txRe) pathResolved = true } if !ok { @@ -126,8 +187,34 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb status.Done() } -func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value) { - idx.cnt++ +func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value, txRe *regexp.Regexp) { + atomic.AddInt64(&idx.seenCnt, 1) + if txRe != nil { + k1 := types.EncodedValue(k) + k2 := "" + if txReplaceArg != "" { + k2 = txRe.ReplaceAllString(string(k1), txReplaceArg) + } else { + matches := txRe.FindStringSubmatch(string(k1)) + if len(matches) > 0 { + k2 = matches[len(matches)-1] + } + } + if txConvertArg == "number" { + if k2 == "" { + return + } + n, err := strconv.ParseFloat(k2, 64) + if err != nil { + fmt.Println("error converting to number: ", err) + return + } + k = types.Number(n) + } else { + k = types.String(k2) + } + } + atomic.AddInt64(&idx.indexedCnt, 1) gb.SetInsert(types.ValueSlice{k}, v) 
- status.Printf("Indexed %s objects", humanize.Comma(idx.cnt)) + /* the counters are updated with atomic.AddInt64 earlier in this function, so read them atomically as well to avoid a data race */ + status.Printf("Found %s objects, Indexed %s objects", humanize.Comma(atomic.LoadInt64(&idx.seenCnt)), humanize.Comma(atomic.LoadInt64(&idx.indexedCnt))) }