mirror of
https://github.com/dolthub/dolt.git
synced 2026-01-25 03:09:00 -06:00
Add tx-regex, tx-replace, and tx-convert args. (#2651)
Also, make help text more complete.
This commit is contained in:
@@ -18,42 +18,54 @@ import (
|
||||
flag "github.com/juju/gnuflag"
|
||||
)
|
||||
|
||||
var longHelp = `Find retrieves and prints objects that satisfy the 'query' argument.
|
||||
var longFindHelp = `'nomdex find' retrieves and prints objects that satisfy the 'query' argument.
|
||||
|
||||
Indexes are built using the 'nomdex up' command. Once built, the indexes can be referenced
|
||||
in the 'query' arg to select objects matching certain criteria. For example, if there are
|
||||
objects in the database that contain a personId and a gender field, 'nomdex up' can scan all
|
||||
the objects in a given dataset and build an index on the specified field with the following
|
||||
commands:
|
||||
nomdex up --by gender --in-path <dsSpec> --out-ds gender-index
|
||||
nomdex up --by personId --in-path <dsSpec> --out-ds personId-index
|
||||
Indexes are built using the 'nomdex up' command. For information about building
|
||||
indexes, see: nomdex up -h
|
||||
|
||||
Once these indexes are built, objects can be retrieved quickly and efficiently using the
|
||||
nomdex query language. For example, the following query could be used to find all people
|
||||
with a personId between 0 and 2000 and who are female:
|
||||
nomdex find '(personId >= 0 and personId <= 2000) and gender = "female"'
|
||||
Objects that have been indexed can be quickly found using the nomdex query
|
||||
language. For example, consider objects with the following type:
|
||||
|
||||
The next command would retrieve all people objects that were either male or had a personId
|
||||
greater than 2000:
|
||||
nomdex find 'gender = "male" or personId > 2000'
|
||||
struct Person {
|
||||
name String,
|
||||
geopos struct GeoPos {
|
||||
latitude Number,
|
||||
longitude Number,
|
||||
}
|
||||
}
|
||||
|
||||
Objects of this type can be indexed on the name, latitude and longitude fields
|
||||
with the following commands:
|
||||
nomdex up --in-path ~/nomsdb::people.value --by .name --out-ds by-name
|
||||
nomdex up --in-path ~/nomsdb::people.value --by .geopos.latitude --out-ds by-lat
|
||||
nomdex up --in-path ~/nomsdb::people.value --by .geopos.longitude --out-ds by-lng
|
||||
|
||||
The following query could be used to find all people with an address near the
|
||||
equator:
|
||||
nomdex find 'by-lat >= -1.0 and by-lat <= 1.0'
|
||||
|
||||
We could also get a list of all people who live near the equator whose name begins with "A":
|
||||
nomdex find '(by-name >= "A" and by-name < "B") and (by-lat >= -1.0 and by-lat <= 1.0)'
|
||||
|
||||
The query language is simple. It currently supports the following relational operators:
|
||||
<, <=, >, >=, =, !=
|
||||
Relational expressions are always of the form:
|
||||
<index> <relational operator> <constant> e.g. personId >= 2000.
|
||||
|
||||
Indexes are the name given by the --out-ds argument in the 'nomdex up' command.
|
||||
Constants are either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5,
|
||||
3.147, etc).
|
||||
|
||||
Indexes are the name given by the --out-ds argument in the 'nomdex up' command. Constants are
|
||||
either "strings" (in quotes) or numbers (e.g. 3, 3000, -2, -2.5, 3.147, etc).
|
||||
|
||||
Relational expressions can be combined using the "and" and "or" operators. Parentheses can
|
||||
be used to ensure that the evaluation is done in the desired order.
|
||||
Relational expressions can be combined using the "and" and "or" operators.
|
||||
Parentheses can (and should) be used to ensure that the evaluation is done in
|
||||
the desired order.
|
||||
`
|
||||
|
||||
var find = &util.Command{
|
||||
Run: runFind,
|
||||
UsageLine: "find --db <database spec> <query>",
|
||||
Short: "Print objects in index that satisfy 'query'",
|
||||
Long: longHelp,
|
||||
Long: longFindHelp,
|
||||
Flags: setupFindFlags,
|
||||
Nargs: 1,
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"testing"
|
||||
|
||||
"github.com/attic-labs/noms/go/chunks"
|
||||
@@ -12,6 +13,7 @@ import (
|
||||
"github.com/attic-labs/noms/go/marshal"
|
||||
"github.com/attic-labs/noms/go/spec"
|
||||
"github.com/attic-labs/noms/go/util/clienttest"
|
||||
"github.com/attic-labs/testify/assert"
|
||||
"github.com/attic-labs/testify/suite"
|
||||
)
|
||||
|
||||
@@ -113,3 +115,36 @@ func (s *testSuite) TestNomdex() {
|
||||
s.Contains(stdout, "Found 23 objects")
|
||||
s.Equal("", stderr)
|
||||
}
|
||||
|
||||
func TestTransform(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
tcs := [][]string{
|
||||
[]string{`"01/02/2003"`, "\"(\\d{2})/(\\d{2})/(\\d{4})\"", "$3/$2/$1", "2003/02/01"},
|
||||
}
|
||||
|
||||
for _, tc := range tcs {
|
||||
base, regex, replace, expected := tc[0], tc[1], tc[2], tc[3]
|
||||
|
||||
testRe := regexp.MustCompile(regex)
|
||||
result := testRe.ReplaceAllString(base, replace)
|
||||
assert.Equal(expected, result)
|
||||
}
|
||||
|
||||
tcs = [][]string{
|
||||
[]string{"343 STATE ST\nROCHESTER, NY 14650\n(43.161276, -77.619386)", "43.161276", "-77.619386"},
|
||||
[]string{"TWO EMBARCADERO CENTER\nPROMENADE LEVEL SAN FRANCISCO, CA 94111\n", "", ""},
|
||||
}
|
||||
|
||||
findLatRe := regexp.MustCompile("(?s)\\(([\\d.]+)")
|
||||
findLngRe := regexp.MustCompile("(?s)(-?[\\d.]+)\\)")
|
||||
for _, tc := range tcs {
|
||||
base, expectedLat, expectedLng := tc[0], tc[1], tc[2]
|
||||
|
||||
lat := findLatRe.FindStringSubmatch(base)
|
||||
assert.True(len(lat) == 0 && expectedLat == "" || (len(lat) == 2 && expectedLat == lat[1]))
|
||||
|
||||
lng := findLngRe.FindStringSubmatch(base)
|
||||
assert.True(len(lng) == 0 && expectedLng == "" || (len(lng) == 2 && expectedLng == lng[1]))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,10 @@ package main
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"github.com/attic-labs/noms/cmd/util"
|
||||
"github.com/attic-labs/noms/go/config"
|
||||
@@ -23,16 +26,63 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
inPathArg = ""
|
||||
outDsArg = ""
|
||||
relPathArg = ""
|
||||
inPathArg = ""
|
||||
outDsArg = ""
|
||||
relPathArg = ""
|
||||
txRegexArg = ""
|
||||
txReplaceArg = ""
|
||||
txConvertArg = ""
|
||||
)
|
||||
|
||||
var longUpHelp = `'nomdex up' builds indexes that are useful for rapidly accessing objects.
|
||||
|
||||
This sample tool can index objects based on any string or number attribute of that
|
||||
object. The 'up' command works by scanning all the objects reachable from the --in-path
|
||||
command line argument. It tests the object to determine if there is a string or number
|
||||
value reachable by applying the --by path argument to the object. If so, the object is
|
||||
added to the index under that value.
|
||||
|
||||
For example, if there are objects in the database that contain a personId and a
|
||||
gender field, 'nomdex up' can scan all the objects in a given dataset and build
|
||||
an index on the specified field with the following commands:
|
||||
nomdex up --in-path <dsSpec>.value --by .gender --out-ds gender-index
|
||||
nomdex up --in-path <dsSpec>.value --by .personId --out-ds personId-index
|
||||
|
||||
The previous commands can be understood as follows. The first command updates or
|
||||
builds an index by scanning all the objects that are reachable from |in-path| that
|
||||
have a string or number value reachable using |by| and stores the root of the
|
||||
resulting index in a dataset specified by |out-ds|.
|
||||
|
||||
Notice that the --in-path argument has a value of '<dsSpec>.value'. The '.value'
|
||||
is not strictly necessary but it's normally useful when indexing. Since datasets
|
||||
generally point to Commit objects in Noms, they usually have parents which are
|
||||
previous versions of the data. If you add .value to the end of the dataset, only
|
||||
the most recent version of the data will be indexed. Without the '.value' all
|
||||
objects in all previous commits will also be indexed which is most often not what
|
||||
is expected.
|
||||
|
||||
There are three additional arguments that can be useful for transforming the value
|
||||
being indexed:
|
||||
* tx-replace: used to modify behavior of tx-regex, see below
|
||||
* tx-regex: the behavior for this argument depends on whether a tx-replace argument
|
||||
is present. If so, the Go function "regexp.ReplaceAllString()" is called:
|
||||
txRe := regexp.MustCompile(|tx-regex|)
|
||||
txRe.ReplaceAllString(|index value|, |tx-replace|)
|
||||
If tx-replace is not present then the following call is made on each value:
|
||||
txRe := regexp.MustCompile(|tx-regex|)
|
||||
txRe.FindStringSubmatch(|index value|)
|
||||
* tx-convert: attempts to convert the index value to the type specified.
|
||||
Currently the only value accepted for this arg is 'number'
|
||||
|
||||
The resulting indexes can be used by the 'nomdex find' command. For help on that,
|
||||
see: nomdex find -h
|
||||
`
|
||||
|
||||
var update = &util.Command{
|
||||
Run: runUpdate,
|
||||
UsageLine: "up --in-path <path> --out-ds <dspath> --by <relativepath>",
|
||||
Short: "Build/Update an index",
|
||||
Long: "Traverse all values starting at root and add values found at 'relativePath' to a map found at 'out-ds'\n",
|
||||
Long: longUpHelp,
|
||||
Flags: setupUpdateFlags,
|
||||
Nargs: 0,
|
||||
}
|
||||
@@ -42,6 +92,9 @@ func setupUpdateFlags() *flag.FlagSet {
|
||||
flagSet.StringVar(&inPathArg, "in-path", "", "a value to search for items to index within ")
|
||||
flagSet.StringVar(&outDsArg, "out-ds", "", "name of dataset to save the results to")
|
||||
flagSet.StringVar(&relPathArg, "by", "", "a path relative to all the items in <in-path> to index by")
|
||||
flagSet.StringVar(&txRegexArg, "tx-regex", "", "perform a string transformation on value before putting it in index")
|
||||
flagSet.StringVar(&txReplaceArg, "tx-replace", "", "replace values matched by tx-regex")
|
||||
flagSet.StringVar(&txConvertArg, "tx-convert", "", "convert the result of a tx regex/replace to this type (only does 'number' currently)")
|
||||
verbose.RegisterVerboseFlags(flagSet)
|
||||
profile.RegisterProfileFlags(flagSet)
|
||||
return flagSet
|
||||
@@ -55,9 +108,10 @@ type StreamingSetEntry struct {
|
||||
type IndexMap map[types.Value]StreamingSetEntry
|
||||
|
||||
type Index struct {
|
||||
m IndexMap
|
||||
cnt int64
|
||||
mutex sync.Mutex
|
||||
m IndexMap
|
||||
indexedCnt int64
|
||||
seenCnt int64
|
||||
mutex sync.Mutex
|
||||
}
|
||||
|
||||
func runUpdate(args []string) int {
|
||||
@@ -102,6 +156,13 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
|
||||
typeCacheMutex := sync.Mutex{}
|
||||
typeCache := map[*types.Type]bool{}
|
||||
|
||||
var txRe *regexp.Regexp
|
||||
if txRegexArg != "" {
|
||||
var err error
|
||||
txRe, err = regexp.Compile(txRegexArg)
|
||||
d.CheckError(err)
|
||||
}
|
||||
|
||||
index := Index{m: IndexMap{}}
|
||||
walk.AllP(rootObject, db, func(v types.Value, r *types.Ref) {
|
||||
typ := v.Type()
|
||||
@@ -112,7 +173,7 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
|
||||
pathResolved := false
|
||||
tv := relPath.Resolve(v)
|
||||
if tv != nil {
|
||||
index.addToGraphBuilder(gb, tv, v)
|
||||
index.addToGraphBuilder(gb, tv, v, txRe)
|
||||
pathResolved = true
|
||||
}
|
||||
if !ok {
|
||||
@@ -126,8 +187,34 @@ func addElementsToGraphBuilder(gb *types.GraphBuilder, db datas.Database, rootOb
|
||||
status.Done()
|
||||
}
|
||||
|
||||
func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value) {
|
||||
idx.cnt++
|
||||
func (idx *Index) addToGraphBuilder(gb *types.GraphBuilder, k, v types.Value, txRe *regexp.Regexp) {
|
||||
atomic.AddInt64(&idx.seenCnt, 1)
|
||||
if txRe != nil {
|
||||
k1 := types.EncodedValue(k)
|
||||
k2 := ""
|
||||
if txReplaceArg != "" {
|
||||
k2 = txRe.ReplaceAllString(string(k1), txReplaceArg)
|
||||
} else {
|
||||
matches := txRe.FindStringSubmatch(string(k1))
|
||||
if len(matches) > 0 {
|
||||
k2 = matches[len(matches)-1]
|
||||
}
|
||||
}
|
||||
if txConvertArg == "number" {
|
||||
if k2 == "" {
|
||||
return
|
||||
}
|
||||
n, err := strconv.ParseFloat(k2, 64)
|
||||
if err != nil {
|
||||
fmt.Println("error converting to number: ", err)
|
||||
return
|
||||
}
|
||||
k = types.Number(n)
|
||||
} else {
|
||||
k = types.String(k2)
|
||||
}
|
||||
}
|
||||
atomic.AddInt64(&idx.indexedCnt, 1)
|
||||
gb.SetInsert(types.ValueSlice{k}, v)
|
||||
status.Printf("Indexed %s objects", humanize.Comma(idx.cnt))
|
||||
status.Printf("Found %s objects, Indexed %s objects", humanize.Comma(idx.seenCnt), humanize.Comma(idx.indexedCnt))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user