Add tx-regex, tx-replace, and tx-convert args. (#2651)

Also, make help text more complete.
This commit is contained in:
Dan Willhite
2016-09-29 13:03:53 -07:00
committed by GitHub
parent 9f5725bd27
commit 885dd5a21c
3 changed files with 166 additions and 32 deletions

View File

@@ -18,42 +18,54 @@ import (
flag "github.com/juju/gnuflag"
)
var longHelp = `Find retrieves and prints objects that satisfy the 'query' argument.
var longFindHelp = `'nomdex find' retrieves and prints objects that satisfy the 'query' argument.
Indexes are built using the 'nomdex up' command. Once built, the indexes can be referenced
in the 'query' arg to select objects matching certain criteria. For example, if there are
objects in the database that contain a personId and a gender field, 'nomdex up' can scan all
the objects in a given dataset and build an index on the specified field with the following
commands:
nomdex up --by gender --in-path <dsSpec> --out-ds gender-index
nomdex up --by personId --in-path <dsSpec> --out-ds personId-index
Indexes are built using the 'nomdex up' command. For information about building
indexes, see: nomdex up -h
Once these indexes are built, objects can be retrieved quickly and efficiently using the
nomdex query language. For example, the following query could be used to find all people
with a personId between 1 and 2000 and who are female:
nomdex find '(personId >= 1 and personId <= 2000) and gender = "female"'
Objects that have been indexed can be quickly found using the nomdex query
language. For example, consider objects with the following type:
The next command would retrieve all people objects that were either male or had a personId
greater than 2000:
nomdex find 'gender = "male" or personId > 2000'
struct Person {
name String,
geopos struct GeoPos {
latitude Number,
longitude Number,
}
}
Objects of this type can be indexed on the name, latitude and longitude fields
with the following commands:
nomdex up --in-path ~/nomsdb::people.value --by .name --out-ds by-name
nomdex up --in-path ~/nomsdb::people.value --by .geopos.latitude --out-ds by-lat
nomdex up --in-path ~/nomsdb::people.value --by .geopos.longitude --out-ds by-lng
The following query could be used to find all people with an address near the
equator:
nomdex find 'by-lat >= -1.0 and by-lat <= 1.0'
We could also get a list of all people who live near the equator whose name begins with "A":
nomdex find '(by-name >= "A" and by-name < "B") and (by-lat >= -1.0 and by-lat <= 1.0)'
The query language is simple. It currently supports the following relational operators:
<, <=, >, >=, =, !=
Relational expressions are always of the form:
<index> <relational operator> <constant> e.g. personId >= 2000.
Indexes are the name given by the --out-ds argument in the 'nomdex up' command.
Constants are either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5,
3.147, etc).
Indexes are the name given by the --out-ds argument in the 'nomdex up' command. Constants are
either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5, 3.147, etc).
Relational expressions can be combined using the "and" and "or" operators. Parentheses can
be used to ensure that the evaluation is done in the desired order.
Relational expressions can be combined using the "and" and "or" operators.
Parentheses can (and should) be used to ensure that the evaluation is done in
the desired order.
`
// find describes the 'nomdex find' subcommand: runFind is its entry point,
// setupFindFlags supplies its flag set, and the long help text is the
// package-level help string defined above.
// NOTE(review): Nargs presumably is the number of required positional
// arguments (the single <query>) — confirm against util.Command.
var find = &util.Command{
Run: runFind,
UsageLine: "find --db <database spec> <query>",
Short: "Print objects in index that satisfy 'query'",
Long: longHelp,
Long: longFindHelp,
Flags: setupFindFlags,
Nargs: 1,
}

View File

@@ -5,6 +5,7 @@
package main
import (
"regexp"
"testing"
"github.com/attic-labs/noms/go/chunks"
@@ -12,6 +13,7 @@ import (
"github.com/attic-labs/noms/go/marshal"
"github.com/attic-labs/noms/go/spec"
"github.com/attic-labs/noms/go/util/clienttest"
"github.com/attic-labs/testify/assert"
"github.com/attic-labs/testify/suite"
)
@@ -113,3 +115,36 @@ func (s *testSuite) TestNomdex() {
s.Contains(stdout, "Found 23 objects")
s.Equal("", stderr)
}
// TestTransform exercises the two regular-expression modes that the
// --tx-regex / --tx-replace arguments rely on: whole-value replacement via
// ReplaceAllString and value extraction via FindStringSubmatch.
func TestTransform(t *testing.T) {
	assert := assert.New(t)

	// Replacement mode: the quoted date is rewritten using capture groups.
	replaceCases := []struct {
		base, regex, replace, expected string
	}{
		{`"01/02/2003"`, `"(\d{2})/(\d{2})/(\d{4})"`, "$3/$2/$1", "2003/02/01"},
	}
	for _, tc := range replaceCases {
		testRe := regexp.MustCompile(tc.regex)
		assert.Equal(tc.expected, testRe.ReplaceAllString(tc.base, tc.replace))
	}

	// Extraction mode: pull latitude/longitude out of a free-form address.
	// An empty expectation means the pattern must not match at all.
	findCases := []struct {
		base, expectedLat, expectedLng string
	}{
		{"343 STATE ST\nROCHESTER, NY 14650\n(43.161276, -77.619386)", "43.161276", "-77.619386"},
		{"TWO EMBARCADERO CENTER\nPROMENADE LEVEL SAN FRANCISCO, CA 94111\n", "", ""},
		// Southern-hemisphere coordinates: the latitude is negative too.
		{"1 MACQUARIE ST\nSYDNEY NSW 2000\n(-33.859972, 151.211111)", "-33.859972", "151.211111"},
	}

	// Both patterns accept an optional leading minus sign; latitude follows
	// the opening parenthesis, longitude precedes the closing one.
	findLatRe := regexp.MustCompile(`\((-?[\d.]+)`)
	findLngRe := regexp.MustCompile(`(-?[\d.]+)\)`)

	// firstGroup returns the single capture group, or "" when re does not match.
	firstGroup := func(re *regexp.Regexp, s string) string {
		if m := re.FindStringSubmatch(s); len(m) == 2 {
			return m[1]
		}
		return ""
	}

	for _, tc := range findCases {
		assert.Equal(tc.expectedLat, firstGroup(findLatRe, tc.base))
		assert.Equal(tc.expectedLng, firstGroup(findLngRe, tc.base))
	}
}

View File

@@ -7,7 +7,10 @@ package main
import (
"fmt"
"os"
"regexp"
"strconv"
"sync"
"sync/atomic"
"github.com/attic-labs/noms/cmd/util"
"github.com/attic-labs/noms/go/config"
@@ -23,16 +26,63 @@ import (
)
// Command-line argument values for 'nomdex up'. Each variable is bound to a
// flag in setupUpdateFlags and defaults to the empty string.
var (
inPathArg = ""
outDsArg = ""
relPathArg = ""
// inPathArg backs --in-path, outDsArg backs --out-ds, relPathArg backs --by.
inPathArg = ""
outDsArg = ""
relPathArg = ""
// txRegexArg backs --tx-regex; when non-empty it is compiled with regexp.Compile.
txRegexArg = ""
// txReplaceArg backs --tx-replace; a non-empty value selects ReplaceAllString
// over FindStringSubmatch when transforming an index value.
txReplaceArg = ""
// txConvertArg backs --tx-convert; only the value "number" is acted upon.
txConvertArg = ""
)
// longUpHelp is the long-form help text for the 'nomdex up' subcommand; it is
// referenced from the update command's Long field.
var longUpHelp = `'nomdex up' builds indexes that are useful for rapidly accessing objects.
This sample tool can index objects based on any string or number attribute of that
object. The 'up' command works by scanning all the objects reachable from the --in-path
command line argument. It tests the object to determine if there is a string or number
value reachable by applying the --by path argument to the object. If so, the object is
added to the index under that value.
For example, if there are objects in the database that contain a personId and a
gender field, 'nomdex up' can scan all the objects in a given dataset and build
an index on the specified field with the following commands:
nomdex up --in-path <dsSpec>.value --by .gender --out-ds gender-index
nomdex up --in-path <dsSpec>.value --by .personId --out-ds personId-index
The previous commands can be understood as follows. The first command updates or
builds an index by scanning all the objects that are reachable from |in-path| that
have a string or number value reachable using |by| and stores the root of the
resulting index in a dataset specified by |out-ds|.
Notice that the --in-path argument has a value of '<dsSpec>.value'. The '.value'
is not strictly necessary but it's normally useful when indexing. Since datasets
generally point to Commit objects in Noms, they usually have parents which are
previous versions of the data. If you add .value to the end of the dataset, only
the most recent version of the data will be indexed. Without the '.value' all
objects in all previous commits will also be indexed which is most often not what
is expected.
There are three additional arguments that can be useful for transforming the value
being indexed:
* tx-replace: used to modify behavior of tx-regex, see below
* tx-regex: the behavior for this argument depends on whether a tx-replace argument
is present. If so, the Go function "Regexp.ReplaceAllString()" is called:
txRe := regexp.MustCompile(|tx-regex|)
txRe.ReplaceAllString(|index value|, |tx-replace|)
If tx-replace is not present then the following call is made on each value
and the last submatch returned is used as the index value:
txRe := regexp.MustCompile(|tx-regex|)
txRe.FindStringSubmatch(|index value|)
* tx-convert: attempts to convert the index value to the type specified.
Currently the only value accepted for this arg is 'number'
The resulting indexes can be used by the 'nomdex find' command. For help on that,
see: nomdex find -h
`
// update describes the 'nomdex up' subcommand: runUpdate is its entry point
// and setupUpdateFlags registers --in-path, --out-ds, --by and the tx-* flags.
// NOTE(review): Nargs: 0 presumably means no positional arguments are
// accepted — confirm against util.Command.
var update = &util.Command{
Run: runUpdate,
UsageLine: "up --in-path <path> --out-ds <dspath> --by <relativepath>",
Short: "Build/Update an index",
Long: "Traverse all values starting at root and add values found at 'relativePath' to a map found at 'out-ds'\n",
Long: longUpHelp,
Flags: setupUpdateFlags,
Nargs: 0,
}
@@ -42,6 +92,9 @@ func setupUpdateFlags() *flag.FlagSet {
flagSet.StringVar(&inPathArg, "in-path", "", "a value to search for items to index within ")
flagSet.StringVar(&outDsArg, "out-ds", "", "name of dataset to save the results to")
flagSet.StringVar(&relPathArg, "by", "", "a path relative to all the items in <in-path> to index by")
flagSet.StringVar(&txRegexArg, "tx-regex", "", "perform a string transformation on value before putting it in index")
flagSet.StringVar(&txReplaceArg, "tx-replace", "", "replace values matched by tx-regex")
flagSet.StringVar(&txConvertArg, "tx-convert", "", "convert the result of a tx regex/replace to this type (only does 'number' currently)")
verbose.RegisterVerboseFlags(flagSet)
profile.RegisterProfileFlags(flagSet)
return flagSet
@@ -55,9 +108,10 @@ type StreamingSetEntry struct {
type IndexMap map[types.Value]StreamingSetEntry
// Index carries the bookkeeping state used while an index is being built.
type Index struct {
m IndexMap
cnt int64
mutex sync.Mutex
// m is set to an empty IndexMap when the Index is created in
// addElementsToGraphBuilder.
m IndexMap
// indexedCnt is incremented (via atomic.AddInt64) for every value that is
// actually inserted into the graph builder.
indexedCnt int64
// seenCnt is incremented (via atomic.AddInt64) on every call to
// addToGraphBuilder, including values that are later skipped.
seenCnt int64
// NOTE(review): no use of mutex is visible in this excerpt.
mutex sync.Mutex
}
func runUpdate(args []string) int {
@@ -102,6 +156,13 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
typeCacheMutex := sync.Mutex{}
typeCache := map[*types.Type]bool{}
var txRe *regexp.Regexp
if txRegexArg != "" {
var err error
txRe, err = regexp.Compile(txRegexArg)
d.CheckError(err)
}
index := Index{m: IndexMap{}}
walk.AllP(rootObject, db, func(v types.Value, r *types.Ref) {
typ := v.Type()
@@ -112,7 +173,7 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
pathResolved := false
tv := relPath.Resolve(v)
if tv != nil {
index.addToGraphBuilder(gb, tv, v)
index.addToGraphBuilder(gb, tv, v, txRe)
pathResolved = true
}
if !ok {
@@ -126,8 +187,34 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
status.Done()
}
func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value) {
idx.cnt++
// addToGraphBuilder inserts v into gb under the index key k and reports
// progress through status.Printf.
//
// When txRe is non-nil the key is first transformed: k is rendered with
// types.EncodedValue, then either rewritten with ReplaceAllString (when
// --tx-replace was given) or reduced to the last submatch returned by
// FindStringSubmatch. If --tx-convert is "number" the transformed text is
// parsed as a float64; values that are empty or fail to parse are counted as
// seen but not indexed. Otherwise the transformed text is used as a string
// key, which is the empty string when the regex did not match.
func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value, txRe *regexp.Regexp) {
	atomic.AddInt64(&idx.seenCnt, 1)

	if txRe != nil {
		encoded := string(types.EncodedValue(k))
		transformed := ""
		if txReplaceArg != "" {
			transformed = txRe.ReplaceAllString(encoded, txReplaceArg)
		} else if matches := txRe.FindStringSubmatch(encoded); len(matches) > 0 {
			// Use the last capture group (or the whole match if there are none).
			transformed = matches[len(matches)-1]
		}

		if txConvertArg == "number" {
			if transformed == "" {
				return
			}
			n, err := strconv.ParseFloat(transformed, 64)
			if err != nil {
				// Diagnostics go to stderr so they do not mix with normal output.
				fmt.Fprintln(os.Stderr, "error converting to number: ", err)
				return
			}
			k = types.Number(n)
		} else {
			k = types.String(transformed)
		}
	}

	indexed := atomic.AddInt64(&idx.indexedCnt, 1)
	gb.SetInsert(types.ValueSlice{k}, v)

	// The counters are updated atomically (this method appears to be invoked
	// from a parallel walk), so they must be read atomically as well; a plain
	// field read here would race with the AddInt64 calls above.
	seen := atomic.LoadInt64(&idx.seenCnt)
	status.Printf("Found %s objects, Indexed %s objects", humanize.Comma(seen), humanize.Comma(indexed))
}