Files
opencloud/vendor/github.com/spacewander/go-suffix-tree/suffix.go
T
2023-04-19 20:24:34 +02:00

535 lines
14 KiB
Go

package suffix
import (
"bytes"
"sort"
)
// Return
// the first index of the mismatch byte (from right to left, starts from 1)
// len(left)+1 if left byte sequence is shorter than right one
// 0 if two byte sequences are equal
// -len(right)-1 if left byte sequence is longer than right one
func suffixDiff(left, right []byte) int {
leftLen := len(left)
rightLen := len(right)
minLen := leftLen
if minLen > rightLen {
minLen = rightLen
}
for i := 1; i <= minLen; i++ {
if left[leftLen-i] != right[rightLen-i] {
return i
}
}
if leftLen < rightLen {
return leftLen + 1
} else if leftLen == rightLen {
return 0
}
return -rightLen - 1
}
type _Edge struct {
label []byte
// Could be either Node or Leaf
point interface{}
}
type _Leaf struct {
// For LongestSuffix and so on. We choice to use more memory(24 bytes per node)
// over appending keys each time.
originKey []byte
value interface{}
}
type _Node struct {
edges []*_Edge
}
func (node *_Node) insertEdge(edge *_Edge) {
newEdgeLabelLen := len(edge.label)
idx := sort.Search(len(node.edges), func(i int) bool {
return newEdgeLabelLen < len(node.edges[i].label)
})
node.edges = append(node.edges, nil)
copy(node.edges[idx+1:], node.edges[idx:])
node.edges[idx] = edge
}
func (node *_Node) removeEdge(idx int) {
copy(node.edges[idx:], node.edges[idx+1:])
node.edges[len(node.edges)-1] = nil
node.edges = node.edges[:len(node.edges)-1]
}
// Reorder edge which is not shorter than before
func (node *_Node) backwardEdge(idx int) {
edge := node.edges[idx]
edgeLabelLen := len(edge.label)
edgesLen := len(node.edges)
if idx == edgesLen-1 {
// Still longest, no need to change
return
}
// Get the first edge which's label is longer than this edge...
i := sort.Search(edgesLen-idx-1, func(j int) bool {
return edgeLabelLen < len(node.edges[j+idx+1].label)
})
// ... and insert before it. (Note that we just add `idx` instead of `idx+1`)
i += idx
copy(node.edges[idx:i], node.edges[idx+1:i+1])
node.edges[i] = edge
}
// Reorder edge which is shorter than before
func (node *_Node) forwardEdge(idx int) {
edge := node.edges[idx]
edgeLabelLen := len(edge.label)
i := sort.Search(idx, func(j int) bool {
return edgeLabelLen < len(node.edges[j].label)
})
copy(node.edges[i+1:idx+1], node.edges[i:idx])
node.edges[i] = edge
}
func (node *_Node) insert(originKey []byte, key []byte, value interface{}) (
oldValue interface{}, ok bool) {
start := 0
if len(node.edges) > 0 && len(node.edges[0].label) == 0 {
// handle empty label as a special case, so the rest of labels don't share
// common suffix
if len(key) == 0 {
leaf, _ := node.edges[0].point.(*_Leaf)
oldValue = leaf.value
leaf.value = value
return oldValue, true
}
start++
}
for i := start; i < len(node.edges); i++ {
edge := node.edges[i]
gap := suffixDiff(key, edge.label)
if gap == 0 {
// CASE 1: key == label
switch point := edge.point.(type) {
case *_Leaf:
// Leaf hitted, replace old value
oldValue = point.value
point.value = value
return oldValue, true
case *_Node:
// Node hitted, insert a leaf under this Node
return point.insert(originKey, []byte{}, value)
}
} else if gap < 0 {
// CASE 2: key > label
gap = -gap
label := key[:len(key)-gap+1]
switch point := edge.point.(type) {
case *_Leaf:
// Before: Node - "label" -> Leaf(Value1)
// After: Node - "label" - Node - "" -> Leaf(Value1)
// |- "s" -> Leaf(Value2)
// Create new Node, move old Leaf under new Node, and then
// insert a new Leaf
newNode := &_Node{
edges: []*_Edge{
{
label: []byte{},
point: point,
},
{
label: label,
point: &_Leaf{
originKey: originKey,
value: value,
},
},
},
}
edge.point = newNode
return nil, true
case *_Node:
// Before: Node - "label" -> Node - "" -> Leaf(Value1)
// After: Node - "label" - Node - "" -> Leaf(Value1)
// |- "s" -> Leaf(Value2)
// Insert a new Leaf with extra data as label
return point.insert(originKey, label, value)
}
} else if gap > 1 {
// CASE 3: mismatch(key, label) after first letter or key < label
// Before: Node - "labels" -> Node/Leaf(Value1)
// After: Node - "label" - Node - "s" -> Node/Leaf(Value1)
// |- "" -> Leaf(Value2)
// Before: Node - "label" -> Node/Leaf(Value1)
// After: Node - "lab" - Node - "el" -> Node/Leaf(Value1)
// |- "or" -> Leaf(Value2)
newEdge := &_Edge{
label: edge.label[:len(edge.label)-gap+1],
point: edge.point,
}
keyEdge := &_Edge{
label: key[:len(key)-gap+1],
point: &_Leaf{
originKey: originKey,
value: value,
},
}
newNode := &_Node{
edges: make([]*_Edge, 2),
}
if len(newEdge.label) < len(keyEdge.label) {
newNode.edges[0], newNode.edges[1] = newEdge, keyEdge
} else {
newNode.edges[0], newNode.edges[1] = keyEdge, newEdge
}
edge.point = newNode
edge.label = edge.label[len(edge.label)-gap+1:]
node.forwardEdge(i)
return nil, true
}
// CASE 4: totally mismatch
}
leaf := &_Leaf{
originKey: originKey,
value: value,
}
edge := &_Edge{
label: key,
point: leaf,
}
node.insertEdge(edge)
return nil, true
}
func (node *_Node) get(key []byte) (value interface{}, found bool) {
edges := node.edges
start := 0
if len(edges[0].label) == 0 {
// handle empty label as a special case, so the rest of labels don't share
// common suffix
if len(key) == 0 {
leaf, _ := edges[0].point.(*_Leaf)
return leaf.value, true
}
start++
}
keyLen := len(key)
for i := start; i < len(edges); i++ {
edge := edges[i]
edgeLabelLen := len(edge.label)
if keyLen > edgeLabelLen {
if bytes.Equal(key[len(key)-len(edge.label):], edge.label) {
subKey := key[:len(key)-len(edge.label)]
switch point := edge.point.(type) {
case *_Leaf:
return nil, false
case *_Node:
return point.get(subKey)
}
}
} else if keyLen == edgeLabelLen {
if bytes.Equal(key, edge.label) {
switch point := edge.point.(type) {
case *_Leaf:
return point.value, true
case *_Node:
return point.get([]byte{})
}
}
} else {
break
}
}
return nil, false
}
func (node *_Node) longestSuffix(key []byte) (matchedKey []byte, value interface{}, found bool) {
edges := node.edges
start := 0
if len(edges[0].label) == 0 {
// handle empty label as a special case, so the rest of labels don't share
// common suffix
if len(key) == 0 {
leaf, _ := edges[0].point.(*_Leaf)
return leaf.originKey, leaf.value, true
}
start++
}
keyLen := len(key)
for i := start; i < len(edges); i++ {
edge := edges[i]
edgeLabelLen := len(edge.label)
if keyLen > edgeLabelLen {
if bytes.Equal(key[len(key)-len(edge.label):], edge.label) {
subKey := key[:len(key)-len(edge.label)]
switch point := edge.point.(type) {
case *_Leaf:
return point.originKey, point.value, true
case *_Node:
matchedKey, value, found := point.longestSuffix(subKey)
if found {
return matchedKey, value, found
}
}
}
} else if keyLen == edgeLabelLen {
if bytes.Equal(key, edge.label) {
switch point := edge.point.(type) {
case *_Leaf:
return point.originKey, point.value, true
case *_Node:
matchedKey, value, found := point.longestSuffix([]byte{})
if found {
return matchedKey, value, found
}
}
}
} else {
break
}
}
if start == 1 {
leaf, _ := edges[0].point.(*_Leaf)
return leaf.originKey, leaf.value, true
}
return nil, nil, false
}
func (node *_Node) mergeChildNode(idx int, child *_Node) {
if len(child.edges) == 1 {
edge := node.edges[idx]
edge.point = child.edges[0].point
edge.label = append(child.edges[0].label, edge.label...)
node.backwardEdge(idx)
}
// When child has only one edge, we will remove the child and merge its label,
// So there is no case that child has no edge.
}
func (node *_Node) remove(key []byte) (value interface{}, found bool, childRemoved bool) {
edges := node.edges
start := 0
if len(edges[0].label) == 0 {
// handle empty label as a special case, so the rest of labels don't share
// common suffix
if len(key) == 0 {
leaf, _ := edges[0].point.(*_Leaf)
value = leaf.value
node.removeEdge(0)
return value, true, true
}
start++
}
keyLen := len(key)
for i := start; i < len(edges); i++ {
edge := edges[i]
edgeLabelLen := len(edge.label)
if keyLen > edgeLabelLen {
if bytes.Equal(key[len(key)-len(edge.label):], edge.label) {
key := key[:len(key)-len(edge.label)]
switch point := edge.point.(type) {
case *_Node:
value, found, childRemoved = point.remove(key)
if childRemoved {
node.mergeChildNode(i, point)
}
return value, found, false
}
}
} else if keyLen == edgeLabelLen {
if bytes.Equal(key, edge.label) {
switch point := edge.point.(type) {
case *_Leaf:
value = point.value
node.removeEdge(i)
return value, true, true
case *_Node:
value, found, childRemoved = point.remove([]byte{})
if childRemoved {
node.mergeChildNode(i, point)
}
return value, found, false
}
}
} else {
break
}
}
return nil, false, false
}
// return either _Leaf or _Node as interface{}
func (node *_Node) getPointHasSuffix(key []byte) (interface{}, []byte, bool) {
edges := node.edges
keyLen := len(key)
for i := len(edges) - 1; i >= 0; i-- {
edge := edges[i]
edgeLabelLen := len(edge.label)
if keyLen > edgeLabelLen {
if bytes.Equal(key[len(key)-len(edge.label):], edge.label) {
subKey := key[:len(key)-len(edge.label)]
switch point := edge.point.(type) {
case *_Leaf:
return nil, nil, false
case *_Node:
return point.getPointHasSuffix(subKey)
}
}
} else {
if bytes.HasSuffix(edge.label, key) {
return edge.point, edge.label[:len(edge.label)-len(key)], true
}
}
}
return nil, nil, false
}
func (node *_Node) walk(suffix []byte, f func(key []byte, value interface{}) bool, stop *bool) {
for _, edge := range node.edges {
if *stop {
return
}
switch point := edge.point.(type) {
case *_Leaf:
*stop = f(append(edge.label, suffix...), point.value)
case *_Node:
point.walk(append(edge.label, suffix...), f, stop)
}
}
}
func (node *_Node) walkNode(suffix [][]byte, f func(labels [][]byte, value interface{})) {
f(append([][]byte{nil}, suffix...), nil)
nodes := []*_Edge{}
leaves := []*_Edge{}
for _, edge := range node.edges {
switch edge.point.(type) {
case *_Leaf:
leaves = append(leaves, edge)
case *_Node:
nodes = append(nodes, edge)
}
}
for _, edge := range leaves {
leaf, _ := edge.point.(*_Leaf)
f(append([][]byte{edge.label}, suffix...), leaf.value)
}
for _, edge := range nodes {
node, _ := edge.point.(*_Node)
node.walkNode(append([][]byte{edge.label}, suffix...), f)
}
}
// Tree represents a suffix tree.
type Tree struct {
root *_Node
leavesNum int
}
// NewTree create a suffix tree for future usage.
func NewTree() *Tree {
return &Tree{
root: &_Node{
edges: []*_Edge{},
},
leavesNum: 0,
}
}
// Insert suffix tree with given key and value. Return the previous value and a boolean to
// indicate whether the insertion is successful.
func (tree *Tree) Insert(key []byte, value interface{}) (oldValue interface{}, ok bool) {
if key == nil {
return nil, false
}
oldValue, ok = tree.root.insert(key, key, value)
if ok && oldValue == nil {
tree.leavesNum++
}
return oldValue, ok
}
// Get returns the value of given key and a boolean to indicate
// whether the value is found.
func (tree *Tree) Get(key []byte) (value interface{}, found bool) {
if key == nil || len(tree.root.edges) == 0 {
return nil, false
}
return tree.root.get(key)
}
// LongestSuffix is mostly like Get.
// It returns the key which is the longest suffix of the given key,
// and the value referred by this key.
// Plus a boolean to indicate whether the key/value, is found.
func (tree *Tree) LongestSuffix(key []byte) (matchedKey []byte, value interface{}, found bool) {
if key == nil || len(tree.root.edges) == 0 {
return nil, nil, false
}
return tree.root.longestSuffix(key)
}
// Remove returns the value of given key and a boolean to indicate
// whethe the value is found. Then the value will be removed.
func (tree *Tree) Remove(key []byte) (oldValue interface{}, found bool) {
if key == nil || len(tree.root.edges) == 0 {
return nil, false
}
oldValue, found, _ = tree.root.remove(key)
if found {
tree.leavesNum--
}
return oldValue, found
}
// Len returns the number of keys.
func (tree *Tree) Len() int {
return tree.leavesNum
}
// Walk through the tree, call function with key and value.
// Once the function returns true, it will stop walking.
// The travelling order is DFS, in the same suffix level the shortest key comes first.
func (tree *Tree) Walk(f func(key []byte, value interface{}) bool) {
stop := false
tree.root.walk([]byte{}, f, &stop)
}
// WalkSuffix travels through nodes which have given suffix, calls function with key and value.
// Once the function returns true, it will stop walking.
// The travelling order is DFS, in the same suffix level the shortest key comes first.
func (tree *Tree) WalkSuffix(suffix []byte, f func(key []byte, value interface{}) bool) {
if len(tree.root.edges) != 0 {
stop := false
if len(suffix) == 0 {
tree.root.walk([]byte{}, f, &stop)
} else {
startingPoint, extraLabel, found := tree.root.getPointHasSuffix(suffix)
if found {
switch point := startingPoint.(type) {
case *_Leaf:
f(point.originKey, point.value)
case *_Node:
point.walk(append(extraLabel, suffix...), f, &stop)
}
}
}
}
}
// This API is for testing/debug
func (tree *Tree) walkNode(f func(labels [][]byte, value interface{})) {
tree.root.walkNode([][]byte{}, f)
}