chore(deps): bump github.com/nats-io/nats-server/v2

Bumps [github.com/nats-io/nats-server/v2](https://github.com/nats-io/nats-server) from 2.10.16 to 2.10.18.
- [Release notes](https://github.com/nats-io/nats-server/releases)
- [Changelog](https://github.com/nats-io/nats-server/blob/main/.goreleaser.yml)
- [Commits](https://github.com/nats-io/nats-server/compare/v2.10.16...v2.10.18)

---
updated-dependencies:
- dependency-name: github.com/nats-io/nats-server/v2
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
This commit is contained in:
dependabot[bot]
2024-08-08 06:57:28 +00:00
committed by Ralf Haferkamp
parent ed13b043eb
commit 3f446bbf8b
46 changed files with 2007 additions and 627 deletions
+4 -6
View File
@@ -5,7 +5,6 @@
#include "textflag.h"
// func matchLen(a []byte, b []byte) int
// Requires: BMI
TEXT ·matchLen(SB), NOSPLIT, $0-56
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
@@ -17,17 +16,16 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56
JB matchlen_match4_standalone
matchlen_loopback_standalone:
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
TESTQ BX, BX
JZ matchlen_loop_standalone
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
JZ matchlen_loop_standalone
#ifdef GOAMD64_v3
TZCNTQ BX, BX
#else
BSFQ BX, BX
#endif
SARQ $0x03, BX
SHRL $0x03, BX
LEAL (SI)(BX*1), SI
JMP gen_match_len_end
+1 -1
View File
@@ -60,7 +60,7 @@
//
// The d variable is implicitly R_DST - R_DBASE, and len(dst)-d is R_DEND - R_DST.
// The s variable is implicitly R_SRC - R_SBASE, and len(src)-s is R_SEND - R_SRC.
TEXT ·s2Decode(SB), NOSPLIT, $56-64
TEXT ·s2Decode(SB), NOSPLIT, $56-56
// Initialize R_SRC, R_DST and R_DBASE-R_SEND.
MOVD dst_base+0(FP), R_DBASE
MOVD dst_len+8(FP), R_DLEN
+8 -2
View File
@@ -17,6 +17,8 @@ const (
S2IndexHeader = "s2idx\x00"
S2IndexTrailer = "\x00xdi2s"
maxIndexEntries = 1 << 16
// If distance is less than this, we do not add the entry.
minIndexDist = 1 << 20
)
// Index represents an S2/Snappy index.
@@ -72,6 +74,10 @@ func (i *Index) add(compressedOffset, uncompressedOffset int64) error {
if latest.compressedOffset > compressedOffset {
return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset)
}
if latest.uncompressedOffset+minIndexDist > uncompressedOffset {
// Only add entry if distance is large enough.
return nil
}
}
i.info = append(i.info, struct {
compressedOffset int64
@@ -122,7 +128,7 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er
// reduce to stay below maxIndexEntries
func (i *Index) reduce() {
if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 {
if len(i.info) < maxIndexEntries && i.estBlockUncomp >= minIndexDist {
return
}
@@ -132,7 +138,7 @@ func (i *Index) reduce() {
j := 0
// Each block should be at least 1MB, but don't reduce below 1000 entries.
for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 {
for i.estBlockUncomp*(int64(removeN)+1) < minIndexDist && len(i.info)/(removeN+1) > 1000 {
removeN++
}
for idx := 0; idx < len(src); idx++ {
+5 -1
View File
@@ -109,7 +109,11 @@ const (
chunkTypeStreamIdentifier = 0xff
)
var crcTable = crc32.MakeTable(crc32.Castagnoli)
var (
crcTable = crc32.MakeTable(crc32.Castagnoli)
magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes.
magicChunkBytes = []byte(magicChunk) // Can be passed to functions where it escapes.
)
// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
+16 -10
View File
@@ -239,6 +239,9 @@ func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
}
}
if n2 == 0 {
if cap(inbuf) >= w.obufLen {
w.buffers.Put(inbuf)
}
break
}
n += int64(n2)
@@ -314,9 +317,9 @@ func (w *Writer) AddSkippableBlock(id uint8, data []byte) (err error) {
hWriter := make(chan result)
w.output <- hWriter
if w.snappy {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes}
} else {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes}
}
}
@@ -370,9 +373,9 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) {
hWriter := make(chan result)
w.output <- hWriter
if w.snappy {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes}
} else {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes}
}
}
@@ -478,9 +481,9 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) {
hWriter := make(chan result)
w.output <- hWriter
if w.snappy {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes}
} else {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes}
}
}
@@ -560,6 +563,9 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) {
if w.concurrency == 1 {
_, err := w.writeSync(inbuf[obufHeaderLen:])
if cap(inbuf) >= w.obufLen {
w.buffers.Put(inbuf)
}
return err
}
@@ -569,9 +575,9 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) {
hWriter := make(chan result)
w.output <- hWriter
if w.snappy {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes}
} else {
hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)}
hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes}
}
}
@@ -637,9 +643,9 @@ func (w *Writer) writeSync(p []byte) (nRet int, errRet error) {
var n int
var err error
if w.snappy {
n, err = w.writer.Write([]byte(magicChunkSnappy))
n, err = w.writer.Write(magicChunkSnappyBytes)
} else {
n, err = w.writer.Write([]byte(magicChunk))
n, err = w.writer.Write(magicChunkBytes)
}
if err != nil {
return 0, w.err(err)
+31
View File
@@ -273,6 +273,9 @@ func BuildDict(o BuildDictOptions) ([]byte, error) {
enc.Encode(&block, b)
addValues(&remain, block.literals)
litTotal += len(block.literals)
if len(block.sequences) == 0 {
continue
}
seqs += len(block.sequences)
block.genCodes()
addHist(&ll, block.coders.llEnc.Histogram())
@@ -286,6 +289,9 @@ func BuildDict(o BuildDictOptions) ([]byte, error) {
if offset == 0 {
continue
}
if int(offset) >= len(o.History) {
continue
}
if offset > 3 {
newOffsets[offset-3]++
} else {
@@ -336,6 +342,9 @@ func BuildDict(o BuildDictOptions) ([]byte, error) {
if seqs/nUsed < 512 {
// Use 512 as minimum.
nUsed = seqs / 512
if nUsed == 0 {
nUsed = 1
}
}
copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
hist := dst.Histogram()
@@ -358,6 +367,28 @@ func BuildDict(o BuildDictOptions) ([]byte, error) {
fakeLength += v
hist[i] = uint32(v)
}
// Ensure we aren't trying to represent RLE.
if maxCount == fakeLength {
for i := range hist {
if uint8(i) == maxSym {
fakeLength++
maxSym++
hist[i+1] = 1
if maxSym > 1 {
break
}
}
if hist[0] == 0 {
fakeLength++
hist[i] = 1
if maxSym > 1 {
break
}
}
}
}
dst.HistogramFinished(maxSym, maxCount)
dst.reUsed = false
dst.useRLE = false
+2 -2
View File
@@ -162,12 +162,12 @@ finalize:
MOVD h, ret+24(FP)
RET
// func writeBlocks(d *Digest, b []byte) int
// func writeBlocks(s *Digest, b []byte) int
TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
LDP ·primes+0(SB), (prime1, prime2)
// Load state. Assume v[1-4] are stored contiguously.
MOVD d+0(FP), digest
MOVD s+0(FP), digest
LDP 0(digest), (v1, v2)
LDP 16(digest), (v3, v4)
+4 -6
View File
@@ -5,7 +5,6 @@
#include "textflag.h"
// func matchLen(a []byte, b []byte) int
// Requires: BMI
TEXT ·matchLen(SB), NOSPLIT, $0-56
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
@@ -17,17 +16,16 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56
JB matchlen_match4_standalone
matchlen_loopback_standalone:
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
TESTQ BX, BX
JZ matchlen_loop_standalone
MOVQ (AX)(SI*1), BX
XORQ (CX)(SI*1), BX
JZ matchlen_loop_standalone
#ifdef GOAMD64_v3
TZCNTQ BX, BX
#else
BSFQ BX, BX
#endif
SARQ $0x03, BX
SHRL $0x03, BX
LEAL (SI)(BX*1), SI
JMP gen_match_len_end
+2 -4
View File
@@ -12,13 +12,11 @@ linters:
- goimports
- misspell
- govet
- golint
- revive
- ineffassign
- gosimple
- deadcode
- unparam
- unused
- structcheck
issues:
exclude-use-default: false
@@ -27,4 +25,4 @@ issues:
- error strings should not be capitalized or end with punctuation or a newline
- should have comment # TODO(aead): Remove once all exported ident. have comments!
service:
golangci-lint-version: 1.20.0 # use the fixed version to not introduce new linters unexpectedly
golangci-lint-version: 1.51.2 # use the fixed version to not introduce new linters unexpectedly
+9 -9
View File
@@ -42,17 +42,17 @@ So for moderately sized messages it tops out at about 15 GB/sec. Also for small
### ARM Performance
Below are the single core results on an EC2 m6g.4xlarge (Graviton2) instance for 256 bit outputs:
Below are the single core results on an EC2 c7g.4xlarge (Graviton3) instance for 256 bit outputs:
```
BenchmarkSum256_16 96.82 MB/s
BenchmarkSum256_64 445.35 MB/s
BenchmarkSum256_1K 2782.46 MB/s
BenchmarkSum256_8K 4083.58 MB/s
BenchmarkSum256_1M 4986.41 MB/s
BenchmarkSum256_5M 4992.72 MB/s
BenchmarkSum256_10M 4993.32 MB/s
BenchmarkSum256_25M 4992.55 MB/s
BenchmarkSum256_16 143.66 MB/s
BenchmarkSum256_64 628.75 MB/s
BenchmarkSum256_1K 3621.71 MB/s
BenchmarkSum256_8K 5039.64 MB/s
BenchmarkSum256_1M 5279.79 MB/s
BenchmarkSum256_5M 5474.60 MB/s
BenchmarkSum256_10M 5621.73 MB/s
BenchmarkSum256_25M 5250.47 MB/s
```
### ppc64le Performance
+132
View File
@@ -0,0 +1,132 @@
//
// Copyright (c) 2024 Minio Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//+build !noasm,!appengine
#include "textflag.h"
TEXT ·getVectorLength(SB), NOSPLIT, $0
WORD $0xd2800002 // mov x2, #0
WORD $0x04225022 // addvl x2, x2, #1
WORD $0xd37df042 // lsl x2, x2, #3
WORD $0xd2800003 // mov x3, #0
WORD $0x04635023 // addpl x3, x3, #1
WORD $0xd37df063 // lsl x3, x3, #3
MOVD R2, vl+0(FP)
MOVD R3, pl+8(FP)
RET
TEXT ·updateArm64Sve(SB), NOSPLIT, $0
MOVD state+0(FP), R0
MOVD msg_base+8(FP), R1
MOVD msg_len+16(FP), R2 // length of message
SUBS $32, R2
BMI completeSve
WORD $0x2518e3e1 // ptrue p1.b
WORD $0xa5e0a401 // ld1d z1.d, p1/z, [x0]
WORD $0xa5e1a402 // ld1d z2.d, p1/z, [x0, #1, MUL VL]
WORD $0xa5e2a403 // ld1d z3.d, p1/z, [x0, #2, MUL VL]
WORD $0xa5e3a404 // ld1d z4.d, p1/z, [x0, #3, MUL VL]
// Load zipper merge constants table pointer
MOVD $·zipperMergeSve(SB), R3
WORD $0xa5e0a465 // ld1d z5.d, p1/z, [x3]
WORD $0x25b8c006 // mov z6.s, #0
WORD $0x25d8e3e2 // ptrue p2.d /* set every other lane for "s" type */
loopSve:
WORD $0xa5e0a420 // ld1d z0.d, p1/z, [x1]
ADD $32, R1
WORD $0x04e00042 // add z2.d, z2.d, z0.d
WORD $0x04e30042 // add z2.d, z2.d, z3.d
WORD $0x04e09420 // lsr z0.d, z1.d, #32
WORD $0x05a6c847 // sel z7.s, p2, z2.s, z6.s
WORD $0x04d004e0 // mul z0.d, p1/m, z0.d, z7.d
WORD $0x04a33003 // eor z3.d, z0.d, z3.d
WORD $0x04e10081 // add z1.d, z4.d, z1.d
WORD $0x04e09440 // lsr z0.d, z2.d, #32
WORD $0x05a6c827 // sel z7.s, p2, z1.s, z6.s
WORD $0x04d004e0 // mul z0.d, p1/m, z0.d, z7.d
WORD $0x04a43004 // eor z4.d, z0.d, z4.d
WORD $0x05253040 // tbl z0.b, z2.b, z5.b
WORD $0x04e00021 // add z1.d, z1.d, z0.d
WORD $0x05253020 // tbl z0.b, z1.b, z5.b
WORD $0x04e00042 // add z2.d, z2.d, z0.d
SUBS $32, R2
BPL loopSve
WORD $0xe5e0e401 // st1d z1.d, p1, [x0]
WORD $0xe5e1e402 // st1d z2.d, p1, [x0, #1, MUL VL]
WORD $0xe5e2e403 // st1d z3.d, p1, [x0, #2, MUL VL]
WORD $0xe5e3e404 // st1d z4.d, p1, [x0, #3, MUL VL]
completeSve:
RET
TEXT ·updateArm64Sve2(SB), NOSPLIT, $0
MOVD state+0(FP), R0
MOVD msg_base+8(FP), R1
MOVD msg_len+16(FP), R2 // length of message
SUBS $32, R2
BMI completeSve2
WORD $0x2518e3e1 // ptrue p1.b
WORD $0xa5e0a401 // ld1d z1.d, p1/z, [x0]
WORD $0xa5e1a402 // ld1d z2.d, p1/z, [x0, #1, MUL VL]
WORD $0xa5e2a403 // ld1d z3.d, p1/z, [x0, #2, MUL VL]
WORD $0xa5e3a404 // ld1d z4.d, p1/z, [x0, #3, MUL VL]
// Load zipper merge constants table pointer
MOVD $·zipperMergeSve(SB), R3
WORD $0xa5e0a465 // ld1d z5.d, p1/z, [x3]
loopSve2:
WORD $0xa5e0a420 // ld1d z0.d, p1/z, [x1]
ADD $32, R1
WORD $0x04e00042 // add z2.d, z2.d, z0.d
WORD $0x04e30042 // add z2.d, z2.d, z3.d
WORD $0x04e09420 // lsr z0.d, z1.d, #32
WORD $0x45c27800 // umullb z0.d, z0.s, z2.s
WORD $0x04a33003 // eor z3.d, z0.d, z3.d
WORD $0x04e10081 // add z1.d, z4.d, z1.d
WORD $0x04e09440 // lsr z0.d, z2.d, #32
WORD $0x45c17800 // umullb z0.d, z0.s, z1.s
WORD $0x04a43004 // eor z4.d, z0.d, z4.d
WORD $0x05253040 // tbl z0.b, z2.b, z5.b
WORD $0x04e00021 // add z1.d, z1.d, z0.d
WORD $0x05253020 // tbl z0.b, z1.b, z5.b
WORD $0x04e00042 // add z2.d, z2.d, z0.d
SUBS $32, R2
BPL loopSve2
WORD $0xe5e0e401 // st1d z1.d, p1, [x0]
WORD $0xe5e1e402 // st1d z2.d, p1, [x0, #1, MUL VL]
WORD $0xe5e2e403 // st1d z3.d, p1, [x0, #2, MUL VL]
WORD $0xe5e3e404 // st1d z4.d, p1, [x0, #3, MUL VL]
completeSve2:
RET
DATA ·zipperMergeSve+0x00(SB)/8, $0x000f010e05020c03
DATA ·zipperMergeSve+0x08(SB)/8, $0x070806090d0a040b
DATA ·zipperMergeSve+0x10(SB)/8, $0x101f111e15121c13
DATA ·zipperMergeSve+0x18(SB)/8, $0x171816191d1a141b
GLOBL ·zipperMergeSve(SB), (NOPTR+RODATA), $32
+3
View File
@@ -2,6 +2,7 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//go:build amd64 && !gccgo && !appengine && !nacl && !noasm
// +build amd64,!gccgo,!appengine,!nacl,!noasm
package highwayhash
@@ -12,6 +13,8 @@ var (
useSSE4 = cpu.X86.HasSSE41
useAVX2 = cpu.X86.HasAVX2
useNEON = false
useSVE = false
useSVE2 = false
useVMX = false
)
+38 -4
View File
@@ -1,24 +1,54 @@
// Copyright (c) 2017 Minio Inc. All rights reserved.
// Copyright (c) 2017-2024 Minio Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//+build !noasm,!appengine
//go:build !noasm && !appengine
// +build !noasm,!appengine
package highwayhash
import (
"golang.org/x/sys/cpu"
)
var (
useSSE4 = false
useAVX2 = false
useNEON = true
useNEON = cpu.ARM64.HasASIMD
useSVE = cpu.ARM64.HasSVE
useSVE2 = false // cpu.ARM64.HasSVE2 -- disable until tested on real hardware
useVMX = false
)
func init() {
if useSVE {
if vl, _ := getVectorLength(); vl != 256 {
//
// Since HighwayHash is designed for AVX2,
// SVE/SVE2 instructions only run correctly
// for vector length of 256
//
useSVE2 = false
useSVE = false
}
}
}
//go:noescape
func initializeArm64(state *[16]uint64, key []byte)
//go:noescape
func updateArm64(state *[16]uint64, msg []byte)
//go:noescape
func getVectorLength() (vl, pl uint64)
//go:noescape
func updateArm64Sve(state *[16]uint64, msg []byte)
//go:noescape
func updateArm64Sve2(state *[16]uint64, msg []byte)
//go:noescape
func finalizeArm64(out []byte, state *[16]uint64)
@@ -31,7 +61,11 @@ func initialize(state *[16]uint64, key []byte) {
}
func update(state *[16]uint64, msg []byte) {
if useNEON {
if useSVE2 {
updateArm64Sve2(state, msg)
} else if useSVE {
updateArm64Sve(state, msg)
} else if useNEON {
updateArm64(state, msg)
} else {
updateGeneric(state, msg)
+216 -39
View File
@@ -46,40 +46,113 @@ func initializeGeneric(state *[16]uint64, k []byte) {
}
func updateGeneric(state *[16]uint64, msg []byte) {
for len(msg) > 0 {
// add message
state[v1+0] += binary.LittleEndian.Uint64(msg)
state[v1+1] += binary.LittleEndian.Uint64(msg[8:])
state[v1+2] += binary.LittleEndian.Uint64(msg[16:])
state[v1+3] += binary.LittleEndian.Uint64(msg[24:])
// v1 += mul0
state[v1+0] += state[mul0+0]
state[v1+1] += state[mul0+1]
state[v1+2] += state[mul0+2]
state[v1+3] += state[mul0+3]
for len(msg) >= 32 {
m := msg[:32]
// add message + mul0
// Interleave operations to hide multiplication
state[v1+0] += binary.LittleEndian.Uint64(m) + state[mul0+0]
state[mul0+0] ^= uint64(uint32(state[v1+0])) * (state[v0+0] >> 32)
state[mul0+1] ^= uint64(uint32(state[v1+1])) * (state[v0+1] >> 32)
state[mul0+2] ^= uint64(uint32(state[v1+2])) * (state[v0+2] >> 32)
state[mul0+3] ^= uint64(uint32(state[v1+3])) * (state[v0+3] >> 32)
// v0 += mul1
state[v0+0] += state[mul1+0]
state[v0+1] += state[mul1+1]
state[v0+2] += state[mul1+2]
state[v0+3] += state[mul1+3]
state[mul1+0] ^= uint64(uint32(state[v0+0])) * (state[v1+0] >> 32)
state[v1+1] += binary.LittleEndian.Uint64(m[8:]) + state[mul0+1]
state[mul0+1] ^= uint64(uint32(state[v1+1])) * (state[v0+1] >> 32)
state[v0+1] += state[mul1+1]
state[mul1+1] ^= uint64(uint32(state[v0+1])) * (state[v1+1] >> 32)
state[v1+2] += binary.LittleEndian.Uint64(m[16:]) + state[mul0+2]
state[mul0+2] ^= uint64(uint32(state[v1+2])) * (state[v0+2] >> 32)
state[v0+2] += state[mul1+2]
state[mul1+2] ^= uint64(uint32(state[v0+2])) * (state[v1+2] >> 32)
state[v1+3] += binary.LittleEndian.Uint64(m[24:]) + state[mul0+3]
state[mul0+3] ^= uint64(uint32(state[v1+3])) * (state[v0+3] >> 32)
state[v0+3] += state[mul1+3]
state[mul1+3] ^= uint64(uint32(state[v0+3])) * (state[v1+3] >> 32)
zipperMerge(state[v1+0], state[v1+1], &state[v0+0], &state[v0+1])
zipperMerge(state[v1+2], state[v1+3], &state[v0+2], &state[v0+3])
// inlined: zipperMerge(state[v1+0], state[v1+1], &state[v0+0], &state[v0+1])
{
val0 := state[v1+0]
val1 := state[v1+1]
res := val0 & (0xff << (2 * 8))
res2 := (val0 & (0xff << (7 * 8))) + (val1 & (0xff << (2 * 8)))
res += (val1 & (0xff << (7 * 8))) >> 8
res2 += (val0 & (0xff << (6 * 8))) >> 8
res += ((val0 & (0xff << (5 * 8))) + (val1 & (0xff << (6 * 8)))) >> 16
res2 += (val1 & (0xff << (5 * 8))) >> 16
res += ((val0 & (0xff << (3 * 8))) + (val1 & (0xff << (4 * 8)))) >> 24
res2 += ((val1 & (0xff << (3 * 8))) + (val0 & (0xff << (4 * 8)))) >> 24
res += (val0 & (0xff << (1 * 8))) << 32
res2 += (val1 & 0xff) << 48
res += val0 << 56
res2 += (val1 & (0xff << (1 * 8))) << 24
zipperMerge(state[v0+0], state[v0+1], &state[v1+0], &state[v1+1])
zipperMerge(state[v0+2], state[v0+3], &state[v1+2], &state[v1+3])
state[v0+0] += res
state[v0+1] += res2
}
// zipperMerge(state[v1+2], state[v1+3], &state[v0+2], &state[v0+3])
{
val0 := state[v1+2]
val1 := state[v1+3]
res := val0 & (0xff << (2 * 8))
res2 := (val0 & (0xff << (7 * 8))) + (val1 & (0xff << (2 * 8)))
res += (val1 & (0xff << (7 * 8))) >> 8
res2 += (val0 & (0xff << (6 * 8))) >> 8
res += ((val0 & (0xff << (5 * 8))) + (val1 & (0xff << (6 * 8)))) >> 16
res2 += (val1 & (0xff << (5 * 8))) >> 16
res += ((val0 & (0xff << (3 * 8))) + (val1 & (0xff << (4 * 8)))) >> 24
res2 += ((val1 & (0xff << (3 * 8))) + (val0 & (0xff << (4 * 8)))) >> 24
res += (val0 & (0xff << (1 * 8))) << 32
res2 += (val1 & 0xff) << 48
res += val0 << 56
res2 += (val1 & (0xff << (1 * 8))) << 24
state[v0+2] += res
state[v0+3] += res2
}
// inlined: zipperMerge(state[v0+0], state[v0+1], &state[v1+0], &state[v1+1])
{
val0 := state[v0+0]
val1 := state[v0+1]
res := val0 & (0xff << (2 * 8))
res2 := (val0 & (0xff << (7 * 8))) + (val1 & (0xff << (2 * 8)))
res += (val1 & (0xff << (7 * 8))) >> 8
res2 += (val0 & (0xff << (6 * 8))) >> 8
res += ((val0 & (0xff << (5 * 8))) + (val1 & (0xff << (6 * 8)))) >> 16
res2 += (val1 & (0xff << (5 * 8))) >> 16
res += ((val0 & (0xff << (3 * 8))) + (val1 & (0xff << (4 * 8)))) >> 24
res2 += ((val1 & (0xff << (3 * 8))) + (val0 & (0xff << (4 * 8)))) >> 24
res += (val0 & (0xff << (1 * 8))) << 32
res2 += (val1 & 0xff) << 48
res += val0 << 56
res2 += (val1 & (0xff << (1 * 8))) << 24
state[v1+0] += res
state[v1+1] += res2
}
//inlined: zipperMerge(state[v0+2], state[v0+3], &state[v1+2], &state[v1+3])
{
val0 := state[v0+2]
val1 := state[v0+3]
res := val0 & (0xff << (2 * 8))
res2 := (val0 & (0xff << (7 * 8))) + (val1 & (0xff << (2 * 8)))
res += (val1 & (0xff << (7 * 8))) >> 8
res2 += (val0 & (0xff << (6 * 8))) >> 8
res += ((val0 & (0xff << (5 * 8))) + (val1 & (0xff << (6 * 8)))) >> 16
res2 += (val1 & (0xff << (5 * 8))) >> 16
res += ((val0 & (0xff << (3 * 8))) + (val1 & (0xff << (4 * 8)))) >> 24
res2 += ((val1 & (0xff << (3 * 8))) + (val0 & (0xff << (4 * 8)))) >> 24
res += (val0 & (0xff << (1 * 8))) << 32
res2 += (val1 & 0xff) << 48
res += val0 << 56
res2 += (val1 & (0xff << (1 * 8))) << 24
state[v1+2] += res
state[v1+3] += res2
}
msg = msg[32:]
}
}
@@ -124,25 +197,129 @@ func finalizeGeneric(out []byte, state *[16]uint64) {
}
}
// Experiments on variations left for future reference...
/*
func zipperMerge(v0, v1 uint64, d0, d1 *uint64) {
m0 := v0 & (0xFF << (2 * 8))
m1 := (v1 & (0xFF << (7 * 8))) >> 8
m2 := ((v0 & (0xFF << (5 * 8))) + (v1 & (0xFF << (6 * 8)))) >> 16
m3 := ((v0 & (0xFF << (3 * 8))) + (v1 & (0xFF << (4 * 8)))) >> 24
m4 := (v0 & (0xFF << (1 * 8))) << 32
m5 := v0 << 56
if true {
// fastest. original interleaved...
res := v0 & (0xff << (2 * 8))
res2 := (v0 & (0xff << (7 * 8))) + (v1 & (0xff << (2 * 8)))
res += (v1 & (0xff << (7 * 8))) >> 8
res2 += (v0 & (0xff << (6 * 8))) >> 8
res += ((v0 & (0xff << (5 * 8))) + (v1 & (0xff << (6 * 8)))) >> 16
res2 += (v1 & (0xff << (5 * 8))) >> 16
res += ((v0 & (0xff << (3 * 8))) + (v1 & (0xff << (4 * 8)))) >> 24
res2 += ((v1 & (0xff << (3 * 8))) + (v0 & (0xff << (4 * 8)))) >> 24
res += (v0 & (0xff << (1 * 8))) << 32
res2 += (v1 & 0xff) << 48
res += v0 << 56
res2 += (v1 & (0xff << (1 * 8))) << 24
*d0 += m0 + m1 + m2 + m3 + m4 + m5
*d0 += res
*d1 += res2
} else if false {
// Reading bytes and combining into uint64
var v0b [8]byte
binary.LittleEndian.PutUint64(v0b[:], v0)
var v1b [8]byte
binary.LittleEndian.PutUint64(v1b[:], v1)
var res, res2 uint64
m0 = (v0 & (0xFF << (7 * 8))) + (v1 & (0xFF << (2 * 8)))
m1 = (v0 & (0xFF << (6 * 8))) >> 8
m2 = (v1 & (0xFF << (5 * 8))) >> 16
m3 = ((v1 & (0xFF << (3 * 8))) + (v0 & (0xFF << (4 * 8)))) >> 24
m4 = (v1 & 0xFF) << 48
m5 = (v1 & (0xFF << (1 * 8))) << 24
res = uint64(v0b[0]) << (7 * 8)
res2 = uint64(v1b[0]) << (6 * 8)
res |= uint64(v0b[1]) << (5 * 8)
res2 |= uint64(v1b[1]) << (4 * 8)
res |= uint64(v0b[2]) << (2 * 8)
res2 |= uint64(v1b[2]) << (2 * 8)
res |= uint64(v0b[3])
res2 |= uint64(v0b[4]) << (1 * 8)
res |= uint64(v0b[5]) << (3 * 8)
res2 |= uint64(v0b[6]) << (5 * 8)
res |= uint64(v1b[4]) << (1 * 8)
res2 |= uint64(v0b[7]) << (7 * 8)
res |= uint64(v1b[6]) << (4 * 8)
res2 |= uint64(v1b[3])
res |= uint64(v1b[7]) << (6 * 8)
res2 |= uint64(v1b[5]) << (3 * 8)
*d1 += m3 + m2 + m5 + m1 + m4 + m0
*d0 += res
*d1 += res2
} else if false {
// bytes to bytes shuffle
var v0b [8]byte
binary.LittleEndian.PutUint64(v0b[:], v0)
var v1b [8]byte
binary.LittleEndian.PutUint64(v1b[:], v1)
var res [8]byte
//res += ((v0 & (0xff << (3 * 8))) + (v1 & (0xff << (4 * 8)))) >> 24
res[0] = v0b[3]
res[1] = v1b[4]
// res := v0 & (0xff << (2 * 8))
res[2] = v0b[2]
//res += ((v0 & (0xff << (5 * 8))) + (v1 & (0xff << (6 * 8)))) >> 16
res[3] = v0b[5]
res[4] = v1b[6]
//res += (v0 & (0xff << (1 * 8))) << 32
res[5] = v0b[1]
//res += (v1 & (0xff << (7 * 8))) >> 8
res[6] += v1b[7]
//res += v0 << 56
res[7] = v0b[0]
v0 = binary.LittleEndian.Uint64(res[:])
*d0 += v0
//res += ((v1 & (0xff << (3 * 8))) + (v0 & (0xff << (4 * 8)))) >> 24
res[0] = v1b[3]
res[1] = v0b[4]
res[2] = v1b[2]
// res += (v1 & (0xff << (5 * 8))) >> 16
res[3] = v1b[5]
//res += (v1 & (0xff << (1 * 8))) << 24
res[4] = v1b[1]
// res += (v0 & (0xff << (6 * 8))) >> 8
res[5] = v0b[6]
//res := (v0 & (0xff << (7 * 8))) + (v1 & (0xff << (2 * 8)))
res[7] = v0b[7]
//res += (v1 & 0xff) << 48
res[6] = v1b[0]
v0 = binary.LittleEndian.Uint64(res[:])
*d1 += v0
} else {
// original.
res := v0 & (0xff << (2 * 8))
res += (v1 & (0xff << (7 * 8))) >> 8
res += ((v0 & (0xff << (5 * 8))) + (v1 & (0xff << (6 * 8)))) >> 16
res += ((v0 & (0xff << (3 * 8))) + (v1 & (0xff << (4 * 8)))) >> 24
res += (v0 & (0xff << (1 * 8))) << 32
res += v0 << 56
*d0 += res
res = (v0 & (0xff << (7 * 8))) + (v1 & (0xff << (2 * 8)))
res += (v0 & (0xff << (6 * 8))) >> 8
res += (v1 & (0xff << (5 * 8))) >> 16
res += ((v1 & (0xff << (3 * 8))) + (v0 & (0xff << (4 * 8)))) >> 24
res += (v1 & 0xff) << 48
res += (v1 & (0xff << (1 * 8))) << 24
*d1 += res
}
}
*/
// reduce v = [v0, v1, v2, v3] mod the irreducible polynomial x^128 + x^2 + x
func reduceMod(v0, v1, v2, v3 uint64) (r0, r1 uint64) {
+4 -1
View File
@@ -2,7 +2,8 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//+build !noasm,!appengine
//go:build !noasm && !appengine
// +build !noasm,!appengine
package highwayhash
@@ -10,6 +11,8 @@ var (
useSSE4 = false
useAVX2 = false
useNEON = false
useSVE = false
useSVE2 = false
useVMX = true
)
+3
View File
@@ -2,6 +2,7 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//go:build noasm || (!amd64 && !arm64 && !ppc64le)
// +build noasm !amd64,!arm64,!ppc64le
package highwayhash
@@ -10,6 +11,8 @@ var (
useSSE4 = false
useAVX2 = false
useNEON = false
useSVE = false
useSVE2 = false
useVMX = false
)
+13 -3
View File
@@ -152,10 +152,20 @@ type Mapping map[Subject][]WeightedMapping
func (m *Mapping) Validate(vr *ValidationResults) {
for ubFrom, wm := range (map[Subject][]WeightedMapping)(*m) {
ubFrom.Validate(vr)
perCluster := make(map[string]uint8)
total := uint8(0)
for _, wm := range wm {
wm.Subject.Validate(vr)
total += wm.GetWeight()
for _, e := range wm {
e.Subject.Validate(vr)
if e.Cluster != "" {
t := perCluster[e.Cluster]
t += e.Weight
perCluster[e.Cluster] = t
if t > 100 {
vr.AddError("Mapping %q in cluster %q exceeds 100%% among all of it's weighted to mappings", ubFrom, e.Cluster)
}
} else {
total += e.GetWeight()
}
}
if total > 100 {
vr.AddError("Mapping %q exceeds 100%% among all of it's weighted to mappings", ubFrom)
+78 -30
View File
@@ -96,6 +96,9 @@ type Account struct {
nameTag string
lastLimErr int64
routePoolIdx int
// Guarantee that only one goroutine can be running either checkJetStreamMigrate
// or clearObserverState at a given time for this account to prevent interleaving.
jscmMu sync.Mutex
}
const (
@@ -1479,6 +1482,10 @@ func (a *Account) addServiceImportWithClaim(destination *Account, from, to strin
return err
}
if err := a.serviceImportFormsCycle(destination, to); err != nil {
return err
}
_, err := a.addServiceImport(destination, from, to, imClaim)
return err
@@ -2466,6 +2473,10 @@ func (a *Account) AddMappedStreamImportWithClaim(account *Account, from, to stri
return err
}
if err := a.streamImportFormsCycle(account, from); err != nil {
return err
}
var (
usePub bool
tr *subjectTransform
@@ -2811,9 +2822,12 @@ func (a *Account) isIssuerClaimTrusted(claims *jwt.ActivationClaims) bool {
// check is done with the account's name, not the pointer. This is used
// during config reload where we are comparing current and new config
// in which pointers are different.
// No lock is acquired in this function, so it is assumed that the
// import maps are not changed while this executes.
// Acquires `a` read lock, but `b` is assumed to not be accessed
// by anyone but the caller (`b` is not registered anywhere).
func (a *Account) checkStreamImportsEqual(b *Account) bool {
a.mu.RLock()
defer a.mu.RUnlock()
if len(a.imports.streams) != len(b.imports.streams) {
return false
}
@@ -3181,6 +3195,9 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
a.nameTag = ac.Name
a.tags = ac.Tags
// Grab trace label under lock.
tl := a.traceLabel()
// Check for external authorization.
if ac.HasExternalAuthorization() {
a.extAuth = &jwt.ExternalAuthorization{}
@@ -3201,10 +3218,10 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
}
if a.imports.services != nil {
old.imports.services = make(map[string]*serviceImport, len(a.imports.services))
}
for k, v := range a.imports.services {
old.imports.services[k] = v
delete(a.imports.services, k)
for k, v := range a.imports.services {
old.imports.services[k] = v
delete(a.imports.services, k)
}
}
alteredScope := map[string]struct{}{}
@@ -3274,13 +3291,13 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
for _, e := range ac.Exports {
switch e.Type {
case jwt.Stream:
s.Debugf("Adding stream export %q for %s", e.Subject, a.traceLabel())
s.Debugf("Adding stream export %q for %s", e.Subject, tl)
if err := a.addStreamExportWithAccountPos(
string(e.Subject), authAccounts(e.TokenReq), e.AccountTokenPosition); err != nil {
s.Debugf("Error adding stream export to account [%s]: %v", a.traceLabel(), err.Error())
s.Debugf("Error adding stream export to account [%s]: %v", tl, err.Error())
}
case jwt.Service:
s.Debugf("Adding service export %q for %s", e.Subject, a.traceLabel())
s.Debugf("Adding service export %q for %s", e.Subject, tl)
rt := Singleton
switch e.ResponseType {
case jwt.ResponseTypeStream:
@@ -3290,7 +3307,7 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
}
if err := a.addServiceExportWithResponseAndAccountPos(
string(e.Subject), rt, authAccounts(e.TokenReq), e.AccountTokenPosition); err != nil {
s.Debugf("Error adding service export to account [%s]: %v", a.traceLabel(), err)
s.Debugf("Error adding service export to account [%s]: %v", tl, err)
continue
}
sub := string(e.Subject)
@@ -3300,13 +3317,13 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
if e.Latency.Sampling == jwt.Headers {
hdrNote = " (using headers)"
}
s.Debugf("Error adding latency tracking%s for service export to account [%s]: %v", hdrNote, a.traceLabel(), err)
s.Debugf("Error adding latency tracking%s for service export to account [%s]: %v", hdrNote, tl, err)
}
}
if e.ResponseThreshold != 0 {
// Response threshold was set in options.
if err := a.SetServiceExportResponseThreshold(sub, e.ResponseThreshold); err != nil {
s.Debugf("Error adding service export response threshold for [%s]: %v", a.traceLabel(), err)
s.Debugf("Error adding service export response threshold for [%s]: %v", tl, err)
}
}
}
@@ -3351,34 +3368,31 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
}
var incompleteImports []*jwt.Import
for _, i := range ac.Imports {
// check tmpAccounts with priority
var acc *Account
var err error
if v, ok := s.tmpAccounts.Load(i.Account); ok {
acc = v.(*Account)
} else {
acc, err = s.lookupAccount(i.Account)
}
acc, err := s.lookupAccount(i.Account)
if acc == nil || err != nil {
s.Errorf("Can't locate account [%s] for import of [%v] %s (err=%v)", i.Account, i.Subject, i.Type, err)
incompleteImports = append(incompleteImports, i)
continue
}
from := string(i.Subject)
to := i.GetTo()
// Capture trace labels.
acc.mu.RLock()
atl := acc.traceLabel()
acc.mu.RUnlock()
// Grab from and to
from, to := string(i.Subject), i.GetTo()
switch i.Type {
case jwt.Stream:
if i.LocalSubject != _EMPTY_ {
// set local subject implies to is empty
to = string(i.LocalSubject)
s.Debugf("Adding stream import %s:%q for %s:%q", acc.traceLabel(), from, a.traceLabel(), to)
s.Debugf("Adding stream import %s:%q for %s:%q", atl, from, tl, to)
err = a.AddMappedStreamImportWithClaim(acc, from, to, i)
} else {
s.Debugf("Adding stream import %s:%q for %s:%q", acc.traceLabel(), from, a.traceLabel(), to)
s.Debugf("Adding stream import %s:%q for %s:%q", atl, from, tl, to)
err = a.AddStreamImportWithClaim(acc, from, to, i)
}
if err != nil {
s.Debugf("Error adding stream import to account [%s]: %v", a.traceLabel(), err.Error())
s.Debugf("Error adding stream import to account [%s]: %v", tl, err.Error())
incompleteImports = append(incompleteImports, i)
}
case jwt.Service:
@@ -3386,9 +3400,9 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
from = string(i.LocalSubject)
to = string(i.Subject)
}
s.Debugf("Adding service import %s:%q for %s:%q", acc.traceLabel(), from, a.traceLabel(), to)
s.Debugf("Adding service import %s:%q for %s:%q", atl, from, tl, to)
if err := a.AddServiceImportWithClaim(acc, from, to, i); err != nil {
s.Debugf("Error adding service import to account [%s]: %v", a.traceLabel(), err.Error())
s.Debugf("Error adding service import to account [%s]: %v", tl, err.Error())
incompleteImports = append(incompleteImports, i)
}
}
@@ -3559,7 +3573,7 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
// regardless of enabled or disabled. It handles both cases.
if jsEnabled {
if err := s.configJetStream(a); err != nil {
s.Errorf("Error configuring jetstream for account [%s]: %v", a.traceLabel(), err.Error())
s.Errorf("Error configuring jetstream for account [%s]: %v", tl, err.Error())
a.mu.Lock()
// Absent reload of js server cfg, this is going to be broken until js is disabled
a.incomplete = true
@@ -3582,6 +3596,14 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
}
c.mu.Lock()
c.applyAccountLimits()
// if we have an nkey user we are a callout user - save
// the issuedAt, and nkey user id to honor revocations
var nkeyUserID string
var issuedAt int64
if c.user != nil {
issuedAt = c.user.Issued
nkeyUserID = c.user.Nkey
}
theJWT := c.opts.JWT
c.mu.Unlock()
// Check for being revoked here. We use ac one to avoid the account lock.
@@ -3600,6 +3622,27 @@ func (s *Server) updateAccountClaimsWithRefresh(a *Account, ac *jwt.AccountClaim
continue
}
}
// if we extracted nkeyUserID and issuedAt we are a callout type
// calloutIAT should only be set if we are in callout scenario as
// the user JWT is _NOT_ associated with the client for callouts,
// so we rely on the calloutIAT to know when the JWT was issued
// revocations simply state that JWT issued before or by that date
// are not valid
if ac.Revocations != nil && nkeyUserID != _EMPTY_ && issuedAt > 0 {
seconds, ok := ac.Revocations[jwt.All]
if ok && seconds >= issuedAt {
c.sendErrAndDebug("User Authentication Revoked")
c.closeConnection(Revocation)
continue
}
seconds, ok = ac.Revocations[nkeyUserID]
if ok && seconds >= issuedAt {
c.sendErrAndDebug("User Authentication Revoked")
c.closeConnection(Revocation)
continue
}
}
}
// Check if the signing keys changed, might have to evict
@@ -3667,8 +3710,13 @@ func (s *Server) buildInternalAccount(ac *jwt.AccountClaims) *Account {
// We don't want to register an account that is in the process of
// being built, however, to solve circular import dependencies, we
// need to store it here.
s.tmpAccounts.Store(ac.Subject, acc)
if v, loaded := s.tmpAccounts.LoadOrStore(ac.Subject, acc); loaded {
return v.(*Account)
}
// Update based on claims.
s.UpdateAccountClaims(acc, ac)
return acc
}
@@ -3708,7 +3756,7 @@ func buildPermissionsFromJwt(uc *jwt.Permissions) *Permissions {
// Helper to build internal NKeyUser.
func buildInternalNkeyUser(uc *jwt.UserClaims, acts map[string]struct{}, acc *Account) *NkeyUser {
nu := &NkeyUser{Nkey: uc.Subject, Account: acc, AllowedConnectionTypes: acts}
nu := &NkeyUser{Nkey: uc.Subject, Account: acc, AllowedConnectionTypes: acts, Issued: uc.IssuedAt}
if uc.IssuerAccount != _EMPTY_ {
nu.SigningKey = uc.Issuer
}
+1
View File
@@ -60,6 +60,7 @@ type ClientAuthentication interface {
// NkeyUser is for multiple nkey based users
type NkeyUser struct {
Nkey string `json:"user"`
Issued int64 `json:"issued,omitempty"` // this is a copy of the issued at (iat) field in the jwt
Permissions *Permissions `json:"permissions,omitempty"`
Account *Account `json:"account,omitempty"`
SigningKey string `json:"signing_key,omitempty"`
+50 -42
View File
@@ -847,7 +847,7 @@ func (c *client) applyAccountLimits() {
c.msubs = jwt.NoLimit
if c.opts.JWT != _EMPTY_ { // user jwt implies account
if uc, _ := jwt.DecodeUserClaims(c.opts.JWT); uc != nil {
c.mpay = int32(uc.Limits.Payload)
atomic.StoreInt32(&c.mpay, int32(uc.Limits.Payload))
c.msubs = int32(uc.Limits.Subs)
if uc.IssuerAccount != _EMPTY_ && uc.IssuerAccount != uc.Issuer {
if scope, ok := c.acc.signingKeys[uc.Issuer]; ok {
@@ -2914,8 +2914,11 @@ func (c *client) addShadowSubscriptions(acc *Account, sub *subscription, enact b
// Add in the shadow subscription.
func (c *client) addShadowSub(sub *subscription, ime *ime, enact bool) (*subscription, error) {
im := ime.im
c.mu.Lock()
nsub := *sub // copy
c.mu.Unlock()
im := ime.im
nsub.im = im
if !im.usePub && ime.dyn && im.tr != nil {
@@ -2950,8 +2953,10 @@ func (c *client) addShadowSub(sub *subscription, ime *ime, enact bool) (*subscri
return nil, fmt.Errorf(errs)
}
// Update our route map here.
c.srv.updateRemoteSubscription(im.acc, &nsub, 1)
// Update our route map here. But only if we are not a leaf node or a hub leafnode.
if c.kind != LEAF || c.isHubLeafNode() {
c.srv.updateRemoteSubscription(im.acc, &nsub, 1)
}
return &nsub, nil
}
@@ -5228,48 +5233,51 @@ func (c *client) closeConnection(reason ClosedState) {
// Unregister
srv.removeClient(c)
// Update remote subscriptions.
if acc != nil && (kind == CLIENT || kind == LEAF || kind == JETSTREAM) {
qsubs := map[string]*qsub{}
for _, sub := range subs {
// Call unsubscribe here to cleanup shadow subscriptions and such.
c.unsubscribe(acc, sub, true, false)
// Update route as normal for a normal subscriber.
if sub.queue == nil {
if !spoke {
srv.updateRouteSubscriptionMap(acc, sub, -1)
if srv.gateway.enabled {
srv.gatewayUpdateSubInterest(acc.Name, sub, -1)
if acc != nil {
// Update remote subscriptions.
if kind == CLIENT || kind == LEAF || kind == JETSTREAM {
qsubs := map[string]*qsub{}
for _, sub := range subs {
// Call unsubscribe here to cleanup shadow subscriptions and such.
c.unsubscribe(acc, sub, true, false)
// Update route as normal for a normal subscriber.
if sub.queue == nil {
if !spoke {
srv.updateRouteSubscriptionMap(acc, sub, -1)
if srv.gateway.enabled {
srv.gatewayUpdateSubInterest(acc.Name, sub, -1)
}
}
acc.updateLeafNodes(sub, -1)
} else {
// We handle queue subscribers special in case we
// have a bunch we can just send one update to the
// connected routes.
num := int32(1)
if kind == LEAF {
num = sub.qw
}
key := keyFromSub(sub)
if esub, ok := qsubs[key]; ok {
esub.n += num
} else {
qsubs[key] = &qsub{sub, num}
}
}
acc.updateLeafNodes(sub, -1)
} else {
// We handle queue subscribers special in case we
// have a bunch we can just send one update to the
// connected routes.
num := int32(1)
if kind == LEAF {
num = sub.qw
}
// TODO(dlc) - Better to use string builder?
key := bytesToString(sub.subject) + " " + bytesToString(sub.queue)
if esub, ok := qsubs[key]; ok {
esub.n += num
} else {
qsubs[key] = &qsub{sub, num}
}
// Process any qsubs here.
for _, esub := range qsubs {
if !spoke {
srv.updateRouteSubscriptionMap(acc, esub.sub, -(esub.n))
if srv.gateway.enabled {
srv.gatewayUpdateSubInterest(acc.Name, esub.sub, -(esub.n))
}
}
acc.updateLeafNodes(esub.sub, -(esub.n))
}
}
// Process any qsubs here.
for _, esub := range qsubs {
if !spoke {
srv.updateRouteSubscriptionMap(acc, esub.sub, -(esub.n))
if srv.gateway.enabled {
srv.gatewayUpdateSubInterest(acc.Name, esub.sub, -(esub.n))
}
}
acc.updateLeafNodes(esub.sub, -(esub.n))
}
// Always remove from the account, otherwise we can leak clients.
// Note that SYSTEM and ACCOUNT types from above cleanup their own subs.
if prev := acc.removeClient(c); prev == 1 {
srv.decActiveAccounts()
}
@@ -5419,7 +5427,7 @@ func (c *client) getAccAndResultFromCache() (*Account, *SublistResult) {
if genid := atomic.LoadUint64(&sl.genid); genid != pac.genid {
ok = false
delete(c.in.pacache, bytesToString(c.pa.pacache))
c.in.pacache = make(map[string]*perAccountCache)
} else {
acc = pac.acc
r = pac.results
+17 -3
View File
@@ -14,6 +14,7 @@
package server
import (
"runtime/debug"
"time"
)
@@ -33,15 +34,28 @@ const (
)
var (
// gitCommit injected at build
gitCommit string
// gitCommit and serverVersion injected at build.
gitCommit, serverVersion string
// trustedKeys is a whitespace separated array of trusted operator's public nkeys.
trustedKeys string
)
func init() {
// Use build info if present, it would be if building using 'go build .'
// or when using a release.
if info, ok := debug.ReadBuildInfo(); ok {
for _, setting := range info.Settings {
switch setting.Key {
case "vcs.revision":
gitCommit = setting.Value[:7]
}
}
}
}
const (
// VERSION is the current version for the server.
VERSION = "2.10.16"
VERSION = "2.10.18"
// PROTO is the currently supported protocol.
// 0 was the original
+74 -41
View File
@@ -711,7 +711,7 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri
}
mset.mu.RLock()
s, jsa, tierName, cfg, acc := mset.srv, mset.jsa, mset.tier, mset.cfg, mset.acc
s, jsa, cfg, acc := mset.srv, mset.jsa, mset.cfg, mset.acc
retention := cfg.Retention
mset.mu.RUnlock()
@@ -726,10 +726,8 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri
return nil, NewJSConsumerConfigRequiredError()
}
jsa.usageMu.RLock()
selectedLimits, limitsFound := jsa.limits[tierName]
jsa.usageMu.RUnlock()
if !limitsFound {
selectedLimits, _, _, _ := acc.selectLimits(config.replicas(&cfg))
if selectedLimits == nil {
return nil, NewJSNoLimitsError()
}
@@ -737,10 +735,10 @@ func (mset *stream) addConsumerWithAssignment(config *ConsumerConfig, oname stri
// Make sure we have sane defaults. Do so with the JS lock, otherwise a
// badly timed meta snapshot can result in a race condition.
mset.js.mu.Lock()
setConsumerConfigDefaults(config, &mset.cfg, srvLim, &selectedLimits)
setConsumerConfigDefaults(config, &mset.cfg, srvLim, selectedLimits)
mset.js.mu.Unlock()
if err := checkConsumerCfg(config, srvLim, &cfg, acc, &selectedLimits, isRecovering); err != nil {
if err := checkConsumerCfg(config, srvLim, &cfg, acc, selectedLimits, isRecovering); err != nil {
return nil, err
}
sampleFreq := 0
@@ -2111,19 +2109,17 @@ func (o *consumer) loopAndForwardProposals(qch chan struct{}) {
const maxBatch = 256 * 1024
var entries []*Entry
for sz := 0; proposal != nil; proposal = proposal.next {
entry := entryPool.Get().(*Entry)
entry.Type, entry.Data = EntryNormal, proposal.data
entries = append(entries, entry)
entries = append(entries, newEntry(EntryNormal, proposal.data))
sz += len(proposal.data)
if sz > maxBatch {
node.ProposeDirect(entries)
node.ProposeMulti(entries)
// We need to re-create `entries` because there is a reference
// to it in the node's pae map.
sz, entries = 0, nil
}
}
if len(entries) > 0 {
node.ProposeDirect(entries)
node.ProposeMulti(entries)
}
return nil
}
@@ -2146,22 +2142,18 @@ func (o *consumer) loopAndForwardProposals(qch chan struct{}) {
// Lock should be held.
func (o *consumer) propose(entry []byte) {
var notify bool
p := &proposal{data: entry}
if o.phead == nil {
o.phead = p
notify = true
} else {
o.ptail.next = p
}
o.ptail = p
// Kick our looper routine if needed.
if notify {
select {
case o.pch <- struct{}{}:
default:
}
// Kick our looper routine.
select {
case o.pch <- struct{}{}:
default:
}
}
@@ -2633,17 +2625,24 @@ func (o *consumer) infoWithSnapAndReply(snap bool, reply string) *ConsumerInfo {
TimeStamp: time.Now().UTC(),
}
// If we are replicated and we are not the leader we need to pull certain data from our store.
if rg != nil && rg.node != nil && !o.isLeader() && o.store != nil {
// If we are replicated and we are not the leader or we are filtered, we need to pull certain data from our store.
isLeader := o.isLeader()
if rg != nil && rg.node != nil && o.store != nil && (!isLeader || o.isFiltered()) {
state, err := o.store.BorrowState()
if err != nil {
o.mu.Unlock()
return nil
}
info.Delivered.Consumer, info.Delivered.Stream = state.Delivered.Consumer, state.Delivered.Stream
info.AckFloor.Consumer, info.AckFloor.Stream = state.AckFloor.Consumer, state.AckFloor.Stream
info.NumAckPending = len(state.Pending)
info.NumRedelivered = len(state.Redelivered)
if !isLeader {
info.Delivered.Consumer, info.Delivered.Stream = state.Delivered.Consumer, state.Delivered.Stream
info.AckFloor.Consumer, info.AckFloor.Stream = state.AckFloor.Consumer, state.AckFloor.Stream
info.NumAckPending = len(state.Pending)
info.NumRedelivered = len(state.Redelivered)
} else {
// Since we are filtered and we are the leader we could have o.sseq that is skipped ahead.
// To maintain consistency in reporting (e.g. jsz) we take the state for our delivered stream sequence.
info.Delivered.Stream = state.Delivered.Stream
}
}
// Adjust active based on non-zero etc. Also make UTC here.
@@ -2742,6 +2741,12 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b
return
}
// Check if this ack is above the current pointer to our next to deliver.
// This could happen on a cooperative takeover with high speed deliveries.
if sseq >= o.sseq {
o.sseq = sseq + 1
}
mset := o.mset
if mset == nil || mset.closed.Load() {
o.mu.Unlock()
@@ -2763,8 +2768,12 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b
delete(o.pending, sseq)
// Use the original deliver sequence from our pending record.
dseq = p.Sequence
// Only move floors if we matched an existing pending.
if dseq == o.adflr+1 {
if len(o.pending) == 0 {
o.adflr = o.dseq - 1
o.asflr = o.sseq - 1
} else if dseq == o.adflr+1 {
o.adflr, o.asflr = dseq, sseq
for ss := sseq + 1; ss < o.sseq; ss++ {
if p, ok := o.pending[ss]; ok {
@@ -2775,11 +2784,6 @@ func (o *consumer) processAckMsg(sseq, dseq, dc uint64, reply string, doSample b
}
}
}
// If nothing left set consumer to current delivered.
// Do not update stream.
if len(o.pending) == 0 {
o.adflr = o.dseq - 1
}
}
delete(o.rdc, sseq)
o.removeFromRedeliverQueue(sseq)
@@ -4150,7 +4154,8 @@ func (o *consumer) checkNumPending() uint64 {
if o.mset != nil {
var state StreamState
o.mset.store.FastState(&state)
if o.sseq > state.LastSeq && o.npc != 0 || o.npc > int64(state.Msgs) {
npc := o.numPending()
if o.sseq > state.LastSeq && npc > 0 || npc > state.Msgs {
// Re-calculate.
o.streamNumPending()
}
@@ -4318,7 +4323,7 @@ func (o *consumer) deliverMsg(dsubj, ackReply string, pmsg *jsPubMsg, dc uint64,
// If we are ack none and mset is interest only we should make sure stream removes interest.
if ap == AckNone && rp != LimitsPolicy {
if o.node == nil || o.cfg.Direct {
if mset != nil && mset.ackq != nil && (o.node == nil || o.cfg.Direct) {
mset.ackq.push(seq)
} else {
o.updateAcks(dseq, seq, _EMPTY_)
@@ -5218,18 +5223,19 @@ func (o *consumer) stopWithFlags(dflag, sdflag, doSignal, advisory bool) error {
// ignoreInterest marks whether the consumer should be ignored when determining interest.
// No lock held on entry.
func (o *consumer) cleanupNoInterestMessages(mset *stream, ignoreInterest bool) {
state := mset.state()
stop := state.LastSeq
o.mu.Lock()
if !o.isLeader() {
o.readStoredState(stop)
o.readStoredState(0)
}
start := o.asflr
o.mu.Unlock()
// Make sure we start at worst with first sequence in the stream.
state := mset.state()
if start < state.FirstSeq {
start = state.FirstSeq
}
stop := state.LastSeq
// Consumer's interests are ignored by default. If we should not ignore interest, unset.
co := o
@@ -5238,13 +5244,37 @@ func (o *consumer) cleanupNoInterestMessages(mset *stream, ignoreInterest bool)
}
var rmseqs []uint64
mset.mu.Lock()
mset.mu.RLock()
// If over this amount of messages to check, defer to checkInterestState() which
// will do the right thing since we are now removed.
// TODO(dlc) - Better way?
const bailThresh = 100_000
// Check if we would be spending too much time here and defer to separate go routine.
if len(mset.consumers) == 0 {
mset.mu.RUnlock()
mset.mu.Lock()
defer mset.mu.Unlock()
mset.store.Purge()
var state StreamState
mset.store.FastState(&state)
mset.lseq = state.LastSeq
// Also make sure we clear any pending acks.
mset.clearAllPreAcksBelowFloor(state.FirstSeq)
return
} else if stop-start > bailThresh {
mset.mu.RUnlock()
go mset.checkInterestState()
return
}
for seq := start; seq <= stop; seq++ {
if mset.noInterest(seq, co) {
rmseqs = append(rmseqs, seq)
}
}
mset.mu.Unlock()
mset.mu.RUnlock()
// These can be removed.
for _, seq := range rmseqs {
@@ -5478,10 +5508,13 @@ func (o *consumer) checkStateForInterestStream() error {
o.mu.RUnlock()
// If we have pending, we will need to walk through to delivered in case we missed any of those acks as well.
if state != nil && len(state.Pending) > 0 {
if state != nil && len(state.Pending) > 0 && state.AckFloor.Stream > 0 {
for seq := state.AckFloor.Stream + 1; seq <= state.Delivered.Stream; seq++ {
if _, ok := state.Pending[seq]; !ok {
mset.ackMsg(o, seq)
// Want to call needAck since it is filter aware.
if o.needAck(seq, _EMPTY_) {
mset.ackMsg(o, seq)
}
}
}
}
+79 -6
View File
@@ -98,6 +98,12 @@ const (
// FIXME(dlc) - make configurable.
var eventsHBInterval = 30 * time.Second
// Default minimum wait time for sending statsz
const defaultStatszRateLimit = 1 * time.Second
// Variable version so we can set in tests.
var statszRateLimit = defaultStatszRateLimit
type sysMsgHandler func(sub *subscription, client *client, acc *Account, subject, reply string, hdr, msg []byte)
// Used if we have to queue things internally to avoid the route/gw path.
@@ -134,6 +140,7 @@ type internal struct {
shash string
inboxPre string
remoteStatsSub *subscription
lastStatsz time.Time
}
// ServerStatsMsg is sent periodically with stats updates.
@@ -807,6 +814,10 @@ func (s *Server) sendStatsz(subj string) {
var m ServerStatsMsg
s.updateServerUsage(&m.Stats)
if s.limitStatsz(subj) {
return
}
s.mu.RLock()
defer s.mu.RUnlock()
@@ -948,6 +959,35 @@ func (s *Server) sendStatsz(subj string) {
s.sendInternalMsg(subj, _EMPTY_, &m.Server, &m)
}
// Limit updates to the heartbeat interval, max one second by default.
func (s *Server) limitStatsz(subj string) bool {
s.mu.Lock()
defer s.mu.Unlock()
if s.sys == nil {
return true
}
// Only limit the normal broadcast subject.
if subj != fmt.Sprintf(serverStatsSubj, s.ID()) {
return false
}
interval := statszRateLimit
if s.sys.cstatsz < interval {
interval = s.sys.cstatsz
}
if time.Since(s.sys.lastStatsz) < interval {
// Reschedule heartbeat for the next interval.
if s.sys.stmr != nil {
s.sys.stmr.Reset(time.Until(s.sys.lastStatsz.Add(interval)))
}
return true
}
s.sys.lastStatsz = time.Now()
return false
}
// Send out our statz update.
// This should be wrapChk() to setup common locking.
func (s *Server) heartbeatStatsz() {
@@ -965,6 +1005,12 @@ func (s *Server) heartbeatStatsz() {
go s.sendStatszUpdate()
}
// Reset statsz rate limit for the next broadcast.
// This should be wrapChk() to setup common locking.
func (s *Server) resetLastStatsz() {
s.sys.lastStatsz = time.Time{}
}
func (s *Server) sendStatszUpdate() {
s.sendStatsz(fmt.Sprintf(serverStatsSubj, s.ID()))
}
@@ -1019,44 +1065,56 @@ func (s *Server) Node() string {
// Tradeoff is subscription and interest graph events vs connect and
// disconnect events, etc.
func (s *Server) initEventTracking() {
if !s.EventsEnabled() {
// Capture sys in case we are shutdown while setting up.
s.mu.RLock()
sys := s.sys
s.mu.RUnlock()
if sys == nil || sys.client == nil || sys.account == nil {
return
}
// Create a system hash which we use for other servers to target us specifically.
s.sys.shash = getHash(s.info.Name)
sys.shash = getHash(s.info.Name)
// This will be for all inbox responses.
subject := fmt.Sprintf(inboxRespSubj, s.sys.shash, "*")
subject := fmt.Sprintf(inboxRespSubj, sys.shash, "*")
if _, err := s.sysSubscribe(subject, s.inboxReply); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
s.sys.inboxPre = subject
sys.inboxPre = subject
// This is for remote updates for connection accounting.
subject = fmt.Sprintf(accConnsEventSubjOld, "*")
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteConnsUpdate)); err != nil {
s.Errorf("Error setting up internal tracking for %s: %v", subject, err)
return
}
// This will be for responses for account info that we send out.
subject = fmt.Sprintf(connsRespSubj, s.info.ID)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteConnsUpdate)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// Listen for broad requests to respond with number of subscriptions for a given subject.
if _, err := s.sysSubscribe(accNumSubsReqSubj, s.noInlineCallback(s.nsubsRequest)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// Listen for statsz from others.
subject = fmt.Sprintf(serverStatsSubj, "*")
if sub, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteServerUpdate)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
} else {
// Keep track of this one.
s.sys.remoteStatsSub = sub
sys.remoteStatsSub = sub
}
// Listen for all server shutdowns.
subject = fmt.Sprintf(shutdownEventSubj, "*")
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteServerShutdown)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// Listen for servers entering lame-duck mode.
// NOTE: This currently is handled in the same way as a server shutdown, but has
@@ -1064,6 +1122,7 @@ func (s *Server) initEventTracking() {
subject = fmt.Sprintf(lameDuckEventSubj, "*")
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteServerShutdown)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// Listen for account claims updates.
subscribeToUpdate := true
@@ -1074,6 +1133,7 @@ func (s *Server) initEventTracking() {
for _, sub := range []string{accUpdateEventSubjOld, accUpdateEventSubjNew} {
if _, err := s.sysSubscribe(fmt.Sprintf(sub, "*"), s.noInlineCallback(s.accountClaimUpdate)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
}
}
@@ -1081,6 +1141,7 @@ func (s *Server) initEventTracking() {
// This subscription is kept for backwards compatibility. Got replaced by ...PING.STATZ from below
if _, err := s.sysSubscribe(serverStatsPingReqSubj, s.noInlineCallback(s.statszReq)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
monSrvc := map[string]sysMsgHandler{
"IDZ": s.idzReq,
@@ -1134,10 +1195,12 @@ func (s *Server) initEventTracking() {
subject = fmt.Sprintf(serverDirectReqSubj, s.info.ID, name)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(req)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
subject = fmt.Sprintf(serverPingReqSubj, name)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(req)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
}
extractAccount := func(subject string) (string, error) {
@@ -1230,6 +1293,7 @@ func (s *Server) initEventTracking() {
for name, req := range monAccSrvc {
if _, err := s.sysSubscribe(fmt.Sprintf(accDirectReqSubj, "*", name), s.noInlineCallback(req)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
}
@@ -1238,6 +1302,7 @@ func (s *Server) initEventTracking() {
// is only one that will answer. This breaks tests since we still forward on remote server connect.
if _, err := s.sysSubscribe(fmt.Sprintf(userDirectReqSubj, "*"), s.userInfoReq); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// For now only the STATZ subject has an account specific ping equivalent.
@@ -1255,6 +1320,7 @@ func (s *Server) initEventTracking() {
})
})); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// Listen for updates when leaf nodes connect for a given account. This will
@@ -1262,32 +1328,38 @@ func (s *Server) initEventTracking() {
subject = fmt.Sprintf(leafNodeConnectEventSubj, "*")
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.leafNodeConnected)); err != nil {
s.Errorf("Error setting up internal tracking: %v", err)
return
}
// For tracking remote latency measurements.
subject = fmt.Sprintf(remoteLatencyEventSubj, s.sys.shash)
subject = fmt.Sprintf(remoteLatencyEventSubj, sys.shash)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.remoteLatencyUpdate)); err != nil {
s.Errorf("Error setting up internal latency tracking: %v", err)
return
}
// This is for simple debugging of number of subscribers that exist in the system.
if _, err := s.sysSubscribeInternal(accSubsSubj, s.noInlineCallback(s.debugSubscribers)); err != nil {
s.Errorf("Error setting up internal debug service for subscribers: %v", err)
return
}
// Listen for requests to reload the server configuration.
subject = fmt.Sprintf(serverReloadReqSubj, s.info.ID)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.reloadConfig)); err != nil {
s.Errorf("Error setting up server reload handler: %v", err)
return
}
// Client connection kick
subject = fmt.Sprintf(clientKickReqSubj, s.info.ID)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.kickClient)); err != nil {
s.Errorf("Error setting up client kick service: %v", err)
return
}
// Client connection LDM
subject = fmt.Sprintf(clientLDMReqSubj, s.info.ID)
if _, err := s.sysSubscribe(subject, s.noInlineCallback(s.ldmClient)); err != nil {
s.Errorf("Error setting up client LDM service: %v", err)
return
}
}
@@ -1868,6 +1940,7 @@ func (s *Server) statszReq(sub *subscription, c *client, _ *Account, subject, re
// No reply is a signal that we should use our normal broadcast subject.
if reply == _EMPTY_ {
reply = fmt.Sprintf(serverStatsSubj, s.info.ID)
s.wrapChk(s.resetLastStatsz)
}
opts := StatszEventOptions{}
+380 -146
View File
@@ -214,7 +214,7 @@ type msgBlock struct {
bytes uint64 // User visible bytes count.
rbytes uint64 // Total bytes (raw) including deleted. Used for rolling to new blk.
msgs uint64 // User visible message count.
fss map[string]*SimpleState
fss *stree.SubjectTree[SimpleState]
kfn string
lwts int64
llts int64
@@ -295,13 +295,13 @@ const (
// Maximum size of a write buffer we may consider for re-use.
maxBufReuse = 2 * 1024 * 1024
// default cache buffer expiration
defaultCacheBufferExpiration = 2 * time.Second
defaultCacheBufferExpiration = 10 * time.Second
// default sync interval
defaultSyncInterval = 2 * time.Minute
// default idle timeout to close FDs.
closeFDsIdle = 30 * time.Second
// default expiration time for mb.fss when idle.
defaultFssExpiration = 10 * time.Second
defaultFssExpiration = 2 * time.Minute
// coalesceMinimum
coalesceMinimum = 16 * 1024
// maxFlushWait is maximum we will wait to gather messages to flush.
@@ -1869,7 +1869,7 @@ func (mb *msgBlock) lastChecksum() []byte {
mb.rbytes = uint64(fi.Size())
}
if mb.rbytes < checksumSize {
return nil
return lchk[:]
}
// Encrypted?
// Check for encryption, we do not load keys on startup anymore so might need to load them here.
@@ -2063,11 +2063,13 @@ func (fs *fileStore) expireMsgsOnRecover() {
}
// Make sure we do subject cleanup as well.
mb.ensurePerSubjectInfoLoaded()
for subj, ss := range mb.fss {
mb.fss.Iter(func(bsubj []byte, ss *SimpleState) bool {
subj := bytesToString(bsubj)
for i := uint64(0); i < ss.Msgs; i++ {
fs.removePerSubject(subj)
}
}
return true
})
mb.dirtyCloseWithRemove(true)
deleted++
}
@@ -2314,9 +2316,21 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor
// Mark fss activity.
mb.lsts = time.Now().UnixNano()
if filter == _EMPTY_ {
filter = fwcs
wc = true
}
// If we only have 1 subject currently and it matches our filter we can also set isAll.
if !isAll && len(mb.fss) == 1 {
_, isAll = mb.fss[filter]
if !isAll && mb.fss.Size() == 1 {
if !wc {
_, isAll = mb.fss.Find(stringToBytes(filter))
} else {
// Since mb.fss.Find won't work if filter is a wildcard, need to use Match instead.
mb.fss.Match(stringToBytes(filter), func(subject []byte, _ *SimpleState) {
isAll = true
})
}
}
// Make sure to start at mb.first.seq if fseq < mb.first.seq
if seq := atomic.LoadUint64(&mb.first.seq); seq > fseq {
@@ -2325,16 +2339,15 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor
lseq := atomic.LoadUint64(&mb.last.seq)
// Optionally build the isMatch for wildcard filters.
tsa := [32]string{}
fsa := [32]string{}
var fts []string
_tsa, _fsa := [32]string{}, [32]string{}
tsa, fsa := _tsa[:0], _fsa[:0]
var isMatch func(subj string) bool
// Decide to build.
if wc {
fts = tokenizeSubjectIntoSlice(fsa[:0], filter)
fsa = tokenizeSubjectIntoSlice(fsa[:0], filter)
isMatch = func(subj string) bool {
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tts, fts)
tsa = tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tsa, fsa)
}
}
@@ -2344,19 +2357,18 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor
// If we do not think we should do a linear scan check how many fss we
// would need to scan vs the full range of the linear walk. Optimize for
// 25th quantile of a match in a linear walk. Filter should be a wildcard.
if !doLinearScan && wc {
doLinearScan = len(mb.fss)*4 > int(lseq-fseq)
// We should consult fss if our cache is not loaded and we only have fss loaded.
if !doLinearScan && wc && mb.cacheAlreadyLoaded() {
doLinearScan = mb.fss.Size()*4 > int(lseq-fseq)
}
if !doLinearScan {
// If we have a wildcard match against all tracked subjects we know about.
if wc {
subs = subs[:0]
for subj := range mb.fss {
if isMatch(subj) {
subs = append(subs, subj)
}
}
mb.fss.Match(stringToBytes(filter), func(bsubj []byte, _ *SimpleState) {
subs = append(subs, string(bsubj))
})
// Check if we matched anything
if len(subs) == 0 {
return nil, didLoad, ErrStoreMsgNotFound
@@ -2364,7 +2376,7 @@ func (mb *msgBlock) firstMatching(filter string, wc bool, start uint64, sm *Stor
}
fseq = lseq + 1
for _, subj := range subs {
ss := mb.fss[subj]
ss, _ := mb.fss.Find(stringToBytes(subj))
if ss != nil && ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
@@ -2455,6 +2467,11 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (
}
}
if filter == _EMPTY_ {
filter = fwcs
wc = true
}
update := func(ss *SimpleState) {
total += ss.Msgs
if first == 0 || ss.First < first {
@@ -2468,9 +2485,9 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (
// Make sure we have fss loaded.
mb.ensurePerSubjectInfoLoaded()
tsa := [32]string{}
fsa := [32]string{}
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
_tsa, _fsa := [32]string{}, [32]string{}
tsa, fsa := _tsa[:0], _fsa[:0]
fsa = tokenizeSubjectIntoSlice(fsa[:0], filter)
// 1. See if we match any subs from fss.
// 2. If we match and the sseq is past ss.Last then we can use meta only.
@@ -2480,25 +2497,26 @@ func (mb *msgBlock) filteredPendingLocked(filter string, wc bool, sseq uint64) (
if !wc {
return subj == filter
}
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tts, fts)
tsa = tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tsa, fsa)
}
var havePartial bool
for subj, ss := range mb.fss {
if isAll || isMatch(subj) {
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
if sseq <= ss.First {
update(ss)
} else if sseq <= ss.Last {
// We matched but its a partial.
havePartial = true
break
}
mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) {
if havePartial {
// If we already found a partial then don't do anything else.
return
}
}
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(bytesToString(bsubj), ss.First, ss)
}
if sseq <= ss.First {
update(ss)
} else if sseq <= ss.Last {
// We matched but its a partial.
havePartial = true
}
})
// If we did not encounter any partials we can return here.
if !havePartial {
@@ -2589,9 +2607,85 @@ func (fs *fileStore) FilteredState(sseq uint64, subj string) SimpleState {
return ss
}
// This is used to see if we can selectively jump start blocks based on filter subject and a floor block index.
// Will return -1 if no matches at all.
func (fs *fileStore) checkSkipFirstBlock(filter string, wc bool) (int, int) {
start, stop := uint32(math.MaxUint32), uint32(0)
if wc {
fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
if psi.fblk < start {
start = psi.fblk
}
if psi.lblk > stop {
stop = psi.lblk
}
})
} else if psi, ok := fs.psim.Find(stringToBytes(filter)); ok {
start, stop = psi.fblk, psi.lblk
}
// Nothing found.
if start == uint32(math.MaxUint32) {
return -1, -1
}
// Here we need to translate this to index into fs.blks properly.
mb := fs.bim[start]
if mb == nil {
// psim fblk can be lazy.
i := start + 1
for ; i <= stop; i++ {
mb = fs.bim[i]
if mb == nil {
continue
}
if _, f, _ := mb.filteredPending(filter, wc, 0); f > 0 {
break
}
}
// Update fblk since fblk was outdated.
if !wc {
if psi, ok := fs.psim.Find(stringToBytes(filter)); ok {
psi.fblk = i
}
} else {
fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
if i > psi.fblk {
psi.fblk = i
}
})
}
}
// Still nothing.
if mb == nil {
return -1, -1
}
// Grab first index.
fi, _ := fs.selectMsgBlockWithIndex(atomic.LoadUint64(&mb.last.seq))
// Grab last if applicable.
var li int
if mb = fs.bim[stop]; mb != nil {
li, _ = fs.selectMsgBlockWithIndex(atomic.LoadUint64(&mb.last.seq))
}
return fi, li
}
// Optimized way for getting all num pending matching a filter subject.
// Lock should be held.
func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
fs.numFilteredPendingWithLast(filter, true, ss)
}
// Optimized way for getting all num pending matching a filter subject and first sequence only.
// Lock should be held.
func (fs *fileStore) numFilteredPendingNoLast(filter string, ss *SimpleState) {
fs.numFilteredPendingWithLast(filter, false, ss)
}
// Optimized way for getting all num pending matching a filter subject.
// Optionally look up last sequence. Sometimes do not need last and this avoids cost.
// Lock should be held.
func (fs *fileStore) numFilteredPendingWithLast(filter string, last bool, ss *SimpleState) {
isAll := filter == _EMPTY_ || filter == fwcs
// If isAll we do not need to do anything special to calculate the first and last and total.
@@ -2601,29 +2695,52 @@ func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
ss.Msgs = fs.state.Msgs
return
}
// Always reset.
ss.First, ss.Last, ss.Msgs = 0, 0, 0
if filter == _EMPTY_ {
filter = fwcs
}
start, stop := uint32(math.MaxUint32), uint32(0)
fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
ss.Msgs += psi.total
// Keep track of start and stop indexes for this subject.
if psi.fblk < start {
start = psi.fblk
}
if psi.lblk > stop {
stop = psi.lblk
}
})
// We do need to figure out the first and last sequences.
wc := subjectHasWildcard(filter)
start, stop := uint32(math.MaxUint32), uint32(0)
if wc {
fs.psim.Match(stringToBytes(filter), func(_ []byte, psi *psi) {
ss.Msgs += psi.total
// Keep track of start and stop indexes for this subject.
if psi.fblk < start {
start = psi.fblk
}
if psi.lblk > stop {
stop = psi.lblk
}
})
} else if psi, ok := fs.psim.Find(stringToBytes(filter)); ok {
ss.Msgs += psi.total
start, stop = psi.fblk, psi.lblk
}
// Did not find anything.
if stop == 0 {
return
}
// Do start
mb := fs.bim[start]
if mb != nil {
_, f, _ := mb.filteredPending(filter, wc, 0)
ss.First = f
}
if ss.First == 0 {
// This is a miss. This can happen since psi.fblk is lazy, but should be very rare.
for i := start + 1; i <= stop; i++ {
// This is a miss. This can happen since psi.fblk is lazy.
// We will make sure to update fblk.
// Hold this outside loop for psim fblk updates when done.
i := start + 1
for ; i <= stop; i++ {
mb := fs.bim[i]
if mb == nil {
continue
@@ -2633,11 +2750,25 @@ func (fs *fileStore) numFilteredPending(filter string, ss *SimpleState) {
break
}
}
// Update fblk since fblk was outdated.
if !wc {
if info, ok := fs.psim.Find(stringToBytes(filter)); ok {
info.fblk = i
}
} else {
fs.psim.Match(stringToBytes(filter), func(subj []byte, psi *psi) {
if i > psi.fblk {
psi.fblk = i
}
})
}
}
// Now last
if mb = fs.bim[stop]; mb != nil {
_, _, l := mb.filteredPending(filter, wc, 0)
ss.Last = l
// Now gather last sequence if asked to do so.
if last {
if mb = fs.bim[stop]; mb != nil {
_, _, l := mb.filteredPending(filter, wc, 0)
ss.Last = l
}
}
}
@@ -2650,6 +2781,10 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
return nil
}
if subject == _EMPTY_ {
subject = fwcs
}
start, stop := fs.blks[0], fs.lmb
// We can short circuit if not a wildcard using psim for start and stop.
if !subjectHasWildcard(subject) {
@@ -2657,7 +2792,12 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
if !ok {
return nil
}
start, stop = fs.bim[info.fblk], fs.bim[info.lblk]
if f := fs.bim[info.fblk]; f != nil {
start = f
}
if l := fs.bim[info.lblk]; l != nil {
stop = l
}
}
// Aggregate fss.
@@ -2681,21 +2821,20 @@ func (fs *fileStore) SubjectsState(subject string) map[string]SimpleState {
}
// Mark fss activity.
mb.lsts = time.Now().UnixNano()
for subj, ss := range mb.fss {
if subject == _EMPTY_ || subject == fwcs || subjectIsSubsetMatch(subj, subject) {
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
oss := fss[subj]
if oss.First == 0 { // New
fss[subj] = *ss
} else {
// Merge here.
oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
fss[subj] = oss
}
mb.fss.Match(stringToBytes(subject), func(bsubj []byte, ss *SimpleState) {
subj := string(bsubj)
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
}
oss := fss[subj]
if oss.First == 0 { // New
fss[subj] = *ss
} else {
// Merge here.
oss.Last, oss.Msgs = ss.Last, oss.Msgs+ss.Msgs
fss[subj] = oss
}
})
if shouldExpire {
// Expire this cache before moving on.
mb.tryForceExpireCacheLocked()
@@ -2723,6 +2862,10 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
return 0, validThrough
}
// If sseq is less then our first set to first.
if sseq < fs.state.FirstSeq {
sseq = fs.state.FirstSeq
}
// Track starting for both block for the sseq and staring block that matches any subject.
var seqStart int
// See if we need to figure out starting block per sseq.
@@ -2734,16 +2877,14 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
}
isAll := filter == _EMPTY_ || filter == fwcs
if isAll && filter == _EMPTY_ {
filter = fwcs
}
wc := subjectHasWildcard(filter)
// See if filter was provided but its the only subject.
if !isAll && !wc && fs.psim.Size() == 1 {
if _, ok := fs.psim.Find(stringToBytes(filter)); ok {
isAll = true
}
}
if isAll && filter == _EMPTY_ {
filter = fwcs
_, isAll = fs.psim.Find(stringToBytes(filter))
}
// If we are isAll and have no deleted we can do a simpler calculation.
if !lastPerSubject && isAll && (fs.state.LastSeq-fs.state.FirstSeq+1) == fs.state.Msgs {
@@ -2753,8 +2894,9 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
return fs.state.LastSeq - sseq + 1, validThrough
}
var tsa, fsa [32]string
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
_tsa, _fsa := [32]string{}, [32]string{}
tsa, fsa := _tsa[:0], _fsa[:0]
fsa = tokenizeSubjectIntoSlice(fsa[:0], filter)
isMatch := func(subj string) bool {
if isAll {
@@ -2763,8 +2905,8 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
if !wc {
return subj == filter
}
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tts, fts)
tsa = tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tsa, fsa)
}
// Handle last by subject a bit differently.
@@ -2864,20 +3006,22 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
mb.lsts = time.Now().UnixNano()
var havePartial bool
for subj, ss := range mb.fss {
if isMatch(subj) {
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
if sseq <= ss.First {
t += ss.Msgs
} else if sseq <= ss.Last {
// We matched but its a partial.
havePartial = true
break
}
mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) {
if havePartial {
// If we already found a partial then don't do anything else.
return
}
}
subj := bytesToString(bsubj)
if ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
if sseq <= ss.First {
t += ss.Msgs
} else if sseq <= ss.Last {
// We matched but its a partial.
havePartial = true
}
})
// See if we need to scan msgs here.
if havePartial {
@@ -2955,11 +3099,9 @@ func (fs *fileStore) NumPending(sseq uint64, filter string, lastPerSubject bool)
// Mark fss activity.
mb.lsts = time.Now().UnixNano()
for subj, ss := range mb.fss {
if isMatch(subj) {
adjust += ss.Msgs
}
}
mb.fss.Match(stringToBytes(filter), func(bsubj []byte, ss *SimpleState) {
adjust += ss.Msgs
})
}
} else {
// This is the last block. We need to scan per message here.
@@ -3080,7 +3222,7 @@ func (fs *fileStore) newMsgBlockForWrite() (*msgBlock, error) {
// Lock should be held to quiet race detector.
mb.mu.Lock()
mb.setupWriteCache(rbuf)
mb.fss = make(map[string]*SimpleState)
mb.fss = stree.NewSubjectTree[SimpleState]()
// Set cache time to creation time to start.
ts := time.Now().UnixNano()
@@ -3339,6 +3481,17 @@ func (mb *msgBlock) skipMsg(seq uint64, now time.Time) {
mb.last.ts = nowts
atomic.StoreUint64(&mb.first.seq, seq+1)
mb.first.ts = nowts
needsRecord = mb == mb.fs.lmb
if needsRecord && mb.rbytes > 0 {
// We want to make sure since we have no messages
// that we write to the beginning since we only need last one.
mb.rbytes, mb.cache = 0, &cache{}
// If encrypted we need to reset counter since we just keep one.
if mb.bek != nil {
// Recreate to reset counter.
mb.bek, _ = genBlockEncryptionKey(mb.fs.fcfg.Cipher, mb.seed, mb.nonce)
}
}
} else {
needsRecord = true
mb.dmap.Insert(seq)
@@ -3521,10 +3674,11 @@ func (fs *fileStore) firstSeqForSubj(subj string) (uint64, error) {
// Mark fss activity.
mb.lsts = time.Now().UnixNano()
if ss := mb.fss[subj]; ss != nil {
bsubj := stringToBytes(subj)
if ss, ok := mb.fss.Find(bsubj); ok && ss != nil {
// Adjust first if it was not where we thought it should be.
if i != start {
if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
if info, ok := fs.psim.Find(bsubj); ok {
info.fblk = i
}
}
@@ -3608,11 +3762,12 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) {
// collect all that are not correct.
needAttention := make(map[string]*psi)
fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
fs.psim.Iter(func(subj []byte, psi *psi) bool {
numMsgs += psi.total
if psi.total > maxMsgsPer {
needAttention[string(subj)] = psi
}
return true
})
// We had an issue with a use case where psim (and hence fss) were correct but idx was not and was not properly being caught.
@@ -3632,10 +3787,11 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) {
fs.rebuildStateLocked(nil)
// Need to redo blocks that need attention.
needAttention = make(map[string]*psi)
fs.psim.Match([]byte(fwcs), func(subj []byte, psi *psi) {
fs.psim.Iter(func(subj []byte, psi *psi) bool {
if psi.total > maxMsgsPer {
needAttention[string(subj)] = psi
}
return true
})
}
@@ -3657,8 +3813,8 @@ func (fs *fileStore) enforceMsgPerSubjectLimit(fireCallback bool) {
// Grab the ss entry for this subject in case sparse.
mb.mu.Lock()
mb.ensurePerSubjectInfoLoaded()
ss := mb.fss[subj]
if ss != nil && ss.firstNeedsUpdate {
ss, ok := mb.fss.Find(stringToBytes(subj))
if ok && ss != nil && ss.firstNeedsUpdate {
mb.recalculateFirstForSubj(subj, ss.First, ss)
}
mb.mu.Unlock()
@@ -4753,11 +4909,11 @@ func (mb *msgBlock) writeMsgRecord(rl, seq uint64, subj string, mhdr, msg []byte
}
// Mark fss activity.
mb.lsts = time.Now().UnixNano()
if ss := mb.fss[subj]; ss != nil {
if ss, ok := mb.fss.Find(stringToBytes(subj)); ok && ss != nil {
ss.Msgs++
ss.Last = seq
} else {
mb.fss[subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
mb.fss.Insert(stringToBytes(subj), SimpleState{Msgs: 1, First: seq, Last: seq})
}
}
@@ -5358,7 +5514,7 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error {
// Create FSS if we should track.
var popFss bool
if mb.fssNotLoaded() {
mb.fss = make(map[string]*SimpleState)
mb.fss = stree.NewSubjectTree[SimpleState]()
popFss = true
}
// Mark fss activity.
@@ -5425,15 +5581,15 @@ func (mb *msgBlock) indexCacheBuf(buf []byte) error {
// Handle FSS inline here.
if popFss && slen > 0 && !mb.noTrack && !erased && !mb.dmap.Exists(seq) {
bsubj := buf[index+msgHdrSize : index+msgHdrSize+uint32(slen)]
if ss := mb.fss[string(bsubj)]; ss != nil {
if ss, ok := mb.fss.Find(bsubj); ok && ss != nil {
ss.Msgs++
ss.Last = seq
} else {
mb.fss[string(bsubj)] = &SimpleState{
mb.fss.Insert(bsubj, SimpleState{
Msgs: 1,
First: seq,
Last: seq,
}
})
}
}
}
@@ -6105,15 +6261,31 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err
return nil, ErrStoreMsgNotFound
}
start, stop := fs.lmb.index, fs.blks[0].index
wc := subjectHasWildcard(subj)
var start, stop uint32
// If literal subject check for presence.
if !wc {
if info, ok := fs.psim.Find(stringToBytes(subj)); !ok {
if wc {
start = fs.lmb.index
fs.psim.Match(stringToBytes(subj), func(_ []byte, psi *psi) {
// Keep track of start and stop indexes for this subject.
if psi.fblk < start {
start = psi.fblk
}
if psi.lblk > stop {
stop = psi.lblk
}
})
// None matched.
if stop == 0 {
return nil, ErrStoreMsgNotFound
} else {
start, stop = info.lblk, info.fblk
}
// These need to be swapped.
start, stop = stop, start
} else if info, ok := fs.psim.Find(stringToBytes(subj)); ok {
start, stop = info.lblk, info.fblk
} else {
return nil, ErrStoreMsgNotFound
}
// Walk blocks backwards.
@@ -6133,7 +6305,7 @@ func (fs *fileStore) loadLast(subj string, sm *StoreMsg) (lsm *StoreMsg, err err
var l uint64
// Optimize if subject is not a wildcard.
if !wc {
if ss := mb.fss[subj]; ss != nil {
if ss, ok := mb.fss.Find(stringToBytes(subj)); ok && ss != nil {
l = ss.Last
}
}
@@ -6227,7 +6399,12 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *Store
// let's check the psim to see if we can skip ahead.
if start <= fs.state.FirstSeq {
var ss SimpleState
fs.numFilteredPending(filter, &ss)
fs.numFilteredPendingNoLast(filter, &ss)
// Nothing available.
if ss.Msgs == 0 {
return nil, fs.state.LastSeq, ErrStoreEOF
}
// We can skip ahead.
if ss.First > start {
start = ss.First
}
@@ -6243,8 +6420,27 @@ func (fs *fileStore) LoadNextMsg(filter string, wc bool, start uint64, sm *Store
return sm, sm.seq, nil
} else if err != ErrStoreMsgNotFound {
return nil, 0, err
} else if expireOk {
mb.tryForceExpireCache()
} else {
// Nothing found in this block. We missed, if first block (bi) check psim.
// Similar to above if start <= first seq.
// TODO(dlc) - For v2 track these by filter subject since they will represent filtered consumers.
if i == bi {
nbi, lbi := fs.checkSkipFirstBlock(filter, wc)
// Nothing available.
if nbi < 0 || lbi <= bi {
return nil, fs.state.LastSeq, ErrStoreEOF
}
// See if we can jump ahead here.
// Right now we can only spin on first, so if we have interior sparseness need to favor checking per block fss if loaded.
// For v2 will track all blocks that have matches for psim.
if nbi > i {
i = nbi - 1 // For the iterator condition i++
}
}
// Check is we can expire.
if expireOk {
mb.tryForceExpireCache()
}
}
}
}
@@ -6824,11 +7020,13 @@ func (fs *fileStore) Compact(seq uint64) (uint64, error) {
bytes += mb.bytes
// Make sure we do subject cleanup as well.
mb.ensurePerSubjectInfoLoaded()
for subj, ss := range mb.fss {
mb.fss.Iter(func(bsubj []byte, ss *SimpleState) bool {
subj := bytesToString(bsubj)
for i := uint64(0); i < ss.Msgs; i++ {
fs.removePerSubject(subj)
}
}
return true
})
// Now close.
mb.dirtyCloseWithRemove(true)
mb.mu.Unlock()
@@ -7229,13 +7427,17 @@ func (mb *msgBlock) dirtyCloseWithRemove(remove bool) {
// Lock should be held.
func (mb *msgBlock) removeSeqPerSubject(subj string, seq uint64) {
mb.ensurePerSubjectInfoLoaded()
ss := mb.fss[subj]
if ss == nil {
if mb.fss == nil {
return
}
bsubj := stringToBytes(subj)
ss, ok := mb.fss.Find(bsubj)
if !ok || ss == nil {
return
}
if ss.Msgs == 1 {
delete(mb.fss, subj)
mb.fss.Delete(bsubj)
return
}
@@ -7337,7 +7539,7 @@ func (mb *msgBlock) generatePerSubjectInfo() error {
}
// Create new one regardless.
mb.fss = make(map[string]*SimpleState)
mb.fss = stree.NewSubjectTree[SimpleState]()
var smv StoreMsg
fseq, lseq := atomic.LoadUint64(&mb.first.seq), atomic.LoadUint64(&mb.last.seq)
@@ -7354,16 +7556,16 @@ func (mb *msgBlock) generatePerSubjectInfo() error {
return err
}
if sm != nil && len(sm.subj) > 0 {
if ss := mb.fss[sm.subj]; ss != nil {
if ss, ok := mb.fss.Find(stringToBytes(sm.subj)); ok && ss != nil {
ss.Msgs++
ss.Last = seq
} else {
mb.fss[sm.subj] = &SimpleState{Msgs: 1, First: seq, Last: seq}
mb.fss.Insert(stringToBytes(sm.subj), SimpleState{Msgs: 1, First: seq, Last: seq})
}
}
}
if len(mb.fss) > 0 {
if mb.fss.Size() > 0 {
// Make sure we run the cache expire timer.
mb.llts = time.Now().UnixNano()
// Mark fss activity.
@@ -7384,7 +7586,7 @@ func (mb *msgBlock) ensurePerSubjectInfoLoaded() error {
return nil
}
if mb.msgs == 0 {
mb.fss = make(map[string]*SimpleState)
mb.fss = stree.NewSubjectTree[SimpleState]()
return nil
}
return mb.generatePerSubjectInfo()
@@ -7401,9 +7603,8 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
}
// Now populate psim.
for subj, ss := range mb.fss {
if len(subj) > 0 {
bsubj := stringToBytes(subj)
mb.fss.Iter(func(bsubj []byte, ss *SimpleState) bool {
if len(bsubj) > 0 {
if info, ok := fs.psim.Find(bsubj); ok {
info.total += ss.Msgs
if mb.index > info.lblk {
@@ -7411,10 +7612,11 @@ func (fs *fileStore) populateGlobalPerSubjectInfo(mb *msgBlock) {
}
} else {
fs.psim.Insert(bsubj, psi{total: ss.Msgs, fblk: mb.index, lblk: mb.index})
fs.tsl += len(subj)
fs.tsl += len(bsubj)
}
}
}
return true
})
}
// Close the message block.
@@ -7486,10 +7688,23 @@ func (fs *fileStore) Delete() error {
os.RemoveAll(pdir)
}
// Do Purge() since if we have lots of blocks uses a mv/rename.
fs.Purge()
// Quickly close all blocks and simulate a purge w/o overhead an new write block.
fs.mu.Lock()
for _, mb := range fs.blks {
mb.dirtyClose()
}
dmsgs := fs.state.Msgs
dbytes := int64(fs.state.Bytes)
fs.state.Msgs, fs.state.Bytes = 0, 0
fs.blks = nil
cb := fs.scb
fs.mu.Unlock()
if err := fs.stop(false); err != nil {
if cb != nil {
cb(-int64(dmsgs), -dbytes, 0, _EMPTY_)
}
if err := fs.stop(true, false); err != nil {
return err
}
@@ -7505,14 +7720,19 @@ func (fs *fileStore) Delete() error {
// Do this in separate Go routine in case lots of blocks.
// Purge above protects us as does the removal of meta artifacts above.
go func() {
<-dios
err := os.RemoveAll(ndir)
dios <- struct{}{}
if err == nil {
return
}
ttl := time.Now().Add(time.Second)
for time.Now().Before(ttl) {
time.Sleep(10 * time.Millisecond)
if err = os.RemoveAll(ndir); err == nil {
<-dios
err = os.RemoveAll(ndir)
dios <- struct{}{}
if err == nil {
return
}
}
@@ -7778,11 +7998,11 @@ func (fs *fileStore) _writeFullState(force bool) error {
// Stop the current filestore.
func (fs *fileStore) Stop() error {
return fs.stop(true)
return fs.stop(false, true)
}
// Stop the current filestore.
func (fs *fileStore) stop(writeState bool) error {
func (fs *fileStore) stop(delete, writeState bool) error {
fs.mu.Lock()
if fs.closed || fs.closing {
fs.mu.Unlock()
@@ -7833,7 +8053,11 @@ func (fs *fileStore) stop(writeState bool) error {
fs.cmu.Unlock()
for _, o := range cfs {
o.Stop()
if delete {
o.StreamDelete()
} else {
o.Stop()
}
}
if bytes > 0 && cb != nil {
@@ -8550,7 +8774,8 @@ func (o *consumerFileStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) err
// Check for an update to a message already delivered.
if sseq <= o.state.Delivered.Stream {
if p = o.state.Pending[sseq]; p != nil {
p.Sequence, p.Timestamp = dseq, ts
// Do not update p.Sequence, that should be the original delivery sequence.
p.Timestamp = ts
}
} else {
// Add to pending.
@@ -8608,7 +8833,14 @@ func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
return nil
}
// Match leader logic on checking if ack is ahead of delivered.
// This could happen on a cooperative takeover with high speed deliveries.
if sseq > o.state.Delivered.Stream {
o.state.Delivered.Stream = sseq + 1
}
if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
delete(o.state.Redelivered, sseq)
return ErrStoreMsgNotFound
}
@@ -8639,7 +8871,9 @@ func (o *consumerFileStore) UpdateAcks(dseq, sseq uint64) error {
// First delete from our pending state.
if p, ok := o.state.Pending[sseq]; ok {
delete(o.state.Pending, sseq)
dseq = p.Sequence // Use the original.
if dseq > p.Sequence && p.Sequence > 0 {
dseq = p.Sequence // Use the original.
}
}
if len(o.state.Pending) == 0 {
o.state.AckFloor.Consumer = o.state.Delivered.Consumer
+32 -17
View File
@@ -1440,7 +1440,11 @@ func (a *Account) maxBytesLimits(cfg *StreamConfig) (bool, int64) {
return false, 0
}
jsa.usageMu.RLock()
selectedLimits, _, ok := jsa.selectLimits(cfg)
var replicas int
if cfg != nil {
replicas = cfg.Replicas
}
selectedLimits, _, ok := jsa.selectLimits(replicas)
jsa.usageMu.RUnlock()
if !ok {
return false, 0
@@ -1590,7 +1594,7 @@ func diffCheckedLimits(a, b map[string]JetStreamAccountLimits) map[string]JetStr
func (jsa *jsAccount) reservedStorage(tier string) (mem, store uint64) {
for _, mset := range jsa.streams {
cfg := &mset.cfg
if tier == _EMPTY_ || tier == tierName(cfg) && cfg.MaxBytes > 0 {
if tier == _EMPTY_ || tier == tierName(cfg.Replicas) && cfg.MaxBytes > 0 {
switch cfg.Storage {
case FileStorage:
store += uint64(cfg.MaxBytes)
@@ -1607,7 +1611,7 @@ func (jsa *jsAccount) reservedStorage(tier string) (mem, store uint64) {
func reservedStorage(sas map[string]*streamAssignment, tier string) (mem, store uint64) {
for _, sa := range sas {
cfg := sa.Config
if tier == _EMPTY_ || tier == tierName(cfg) && cfg.MaxBytes > 0 {
if tier == _EMPTY_ || tier == tierName(cfg.Replicas) && cfg.MaxBytes > 0 {
switch cfg.Storage {
case FileStorage:
store += uint64(cfg.MaxBytes)
@@ -1695,17 +1699,29 @@ func (a *Account) JetStreamUsage() JetStreamAccountStats {
stats.ReservedMemory, stats.ReservedStore = reservedStorage(sas, _EMPTY_)
}
for _, sa := range sas {
stats.Consumers += len(sa.consumers)
if !defaultTier {
tier := tierName(sa.Config)
u, ok := stats.Tiers[tier]
if !ok {
u = JetStreamTier{}
}
u.Streams++
if defaultTier {
stats.Consumers += len(sa.consumers)
} else {
stats.Streams++
u.Consumers += len(sa.consumers)
stats.Tiers[tier] = u
streamTier := tierName(sa.Config.Replicas)
su, ok := stats.Tiers[streamTier]
if !ok {
su = JetStreamTier{}
}
su.Streams++
stats.Tiers[streamTier] = su
// Now consumers, check each since could be different tiers.
for _, ca := range sa.consumers {
stats.Consumers++
consumerTier := tierName(ca.Config.replicas(sa.Config))
cu, ok := stats.Tiers[consumerTier]
if !ok {
cu = JetStreamTier{}
}
cu.Consumers++
stats.Tiers[consumerTier] = cu
}
}
}
} else {
@@ -2089,9 +2105,8 @@ func (js *jetStream) limitsExceeded(storeType StorageType) bool {
return js.wouldExceedLimits(storeType, 0)
}
func tierName(cfg *StreamConfig) string {
func tierName(replicas int) string {
// TODO (mh) this is where we could select based off a placement tag as well "qos:tier"
replicas := cfg.Replicas
if replicas == 0 {
replicas = 1
}
@@ -2111,11 +2126,11 @@ func (jsa *jsAccount) jetStreamAndClustered() (*jetStream, bool) {
}
// jsa.usageMu read lock should be held.
func (jsa *jsAccount) selectLimits(cfg *StreamConfig) (JetStreamAccountLimits, string, bool) {
func (jsa *jsAccount) selectLimits(replicas int) (JetStreamAccountLimits, string, bool) {
if selectedLimits, ok := jsa.limits[_EMPTY_]; ok {
return selectedLimits, _EMPTY_, true
}
tier := tierName(cfg)
tier := tierName(replicas)
if selectedLimits, ok := jsa.limits[tier]; ok {
return selectedLimits, tier, true
}
+5 -1
View File
@@ -3267,7 +3267,11 @@ func (s *Server) jsStreamPurgeRequest(sub *subscription, c *client, _ *Account,
}
func (acc *Account) jsNonClusteredStreamLimitsCheck(cfg *StreamConfig) *ApiError {
selectedLimits, tier, jsa, apiErr := acc.selectLimits(cfg)
var replicas int
if cfg != nil {
replicas = cfg.Replicas
}
selectedLimits, tier, jsa, apiErr := acc.selectLimits(replicas)
if apiErr != nil {
return apiErr
}
+105 -47
View File
@@ -534,12 +534,18 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool {
return false
}
// If we are catching up return false.
if mset.isCatchingUp() {
// If R1 we are good.
if node == nil {
return true
}
// Here we are a replicated stream.
// First make sure our monitor routine is running.
if !mset.isMonitorRunning() {
return false
}
if node == nil || node.Healthy() {
if node.Healthy() {
// Check if we are processing a snapshot and are catching up.
if !mset.isCatchingUp() {
return true
@@ -553,7 +559,6 @@ func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool {
js.restartStream(acc, sa)
}
}
return false
}
@@ -863,6 +868,8 @@ func (js *jetStream) setupMetaGroup() error {
atomic.StoreInt32(&js.clustered, 1)
c.registerWithAccount(sacc)
// Set to true before we start.
js.metaRecovering = true
js.srv.startGoRoutine(
js.monitorCluster,
pprofLabels{
@@ -2164,7 +2171,7 @@ func genPeerInfo(peers []string, split int) (newPeers, oldPeers []string, newPee
// Should only be called from monitorStream.
func (mset *stream) waitOnConsumerAssignments() {
mset.mu.RLock()
s, js, acc, sa, name := mset.srv, mset.js, mset.acc, mset.sa, mset.cfg.Name
s, js, acc, sa, name, replicas := mset.srv, mset.js, mset.acc, mset.sa, mset.cfg.Name, mset.cfg.Replicas
mset.mu.RUnlock()
if s == nil || js == nil || acc == nil || sa == nil {
@@ -2186,6 +2193,9 @@ func (mset *stream) waitOnConsumerAssignments() {
for _, o := range mset.getConsumers() {
// Make sure we are registered with our consumer assignment.
if ca := o.consumerAssignment(); ca != nil {
if replicas > 1 && !o.isMonitorRunning() {
break
}
numReady++
} else {
break
@@ -2373,7 +2383,8 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps
// since we process streams first then consumers as an asset class.
mset.waitOnConsumerAssignments()
// Setup a periodic check here.
cist = time.NewTicker(30 * time.Second)
// We will fire in 5s the first time then back off to 30s
cist = time.NewTicker(5 * time.Second)
cistc = cist.C
}
@@ -2496,7 +2507,9 @@ func (js *jetStream) monitorStream(mset *stream, sa *streamAssignment, sendSnaps
}
case <-cistc:
mset.checkInterestState()
cist.Reset(30 * time.Second)
// We may be adjusting some things with consumers so do this in its own go routine.
go mset.checkInterestState()
case <-datc:
if mset == nil || isRecovering {
@@ -4096,7 +4109,7 @@ func (js *jetStream) processConsumerAssignment(ca *consumerAssignment) {
sa.consumers = make(map[string]*consumerAssignment)
} else if oca := sa.consumers[ca.Name]; oca != nil {
wasExisting = true
// Copy over private existing state from former SA.
// Copy over private existing state from former CA.
if ca.Group != nil {
ca.Group.node = oca.Group.node
}
@@ -4423,11 +4436,15 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state
s.sendInternalMsgLocked(consumerAssignmentSubj, _EMPTY_, nil, b)
}
} else {
js.mu.RLock()
node := rg.node
js.mu.RUnlock()
if didCreate {
o.setCreatedTime(ca.Created)
} else {
// Check for scale down to 1..
if rg.node != nil && len(rg.Peers) == 1 {
if node != nil && len(rg.Peers) == 1 {
o.clearNode()
o.setLeader(true)
// Need to clear from rg too.
@@ -4442,7 +4459,7 @@ func (js *jetStream) processClusterCreateConsumer(ca *consumerAssignment, state
}
}
if rg.node == nil {
if node == nil {
// Single replica consumer, process manually here.
js.mu.Lock()
// Force response in case we think this is an update.
@@ -4912,7 +4929,22 @@ func (js *jetStream) applyConsumerEntries(o *consumer, ce *CommittedEntry, isLea
}
}
// Check our interest state if applicable.
o.checkStateForInterestStream()
if err := o.checkStateForInterestStream(); err == errAckFloorHigherThanLastSeq {
o.mu.RLock()
mset := o.mset
o.mu.RUnlock()
// Register pre-acks unless no state at all for the stream and we would create alot of pre-acks.
mset.mu.Lock()
var ss StreamState
mset.store.FastState(&ss)
// Only register if we have a valid FirstSeq.
if ss.FirstSeq > 0 {
for seq := ss.FirstSeq; seq < state.AckFloor.Stream; seq++ {
mset.registerPreAck(o, seq)
}
}
mset.mu.Unlock()
}
}
} else if e.Type == EntryRemovePeer {
@@ -5161,9 +5193,7 @@ func (js *jetStream) processConsumerLeaderChange(o *consumer, isLeader bool) err
} else {
resp.ConsumerInfo = o.initialInfo()
s.sendAPIResponse(client, acc, subject, reply, _EMPTY_, s.jsonResponse(&resp))
if node := o.raftNode(); node != nil {
o.sendCreateAdvisory()
}
o.sendCreateAdvisory()
}
return nil
@@ -5954,7 +5984,7 @@ func (js *jetStream) createGroupForStream(ci *ClientInfo, cfg *StreamConfig) (*r
return nil, errs
}
func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) {
func (acc *Account) selectLimits(replicas int) (*JetStreamAccountLimits, string, *jsAccount, *ApiError) {
// Grab our jetstream account info.
acc.mu.RLock()
jsa := acc.js
@@ -5965,7 +5995,7 @@ func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, st
}
jsa.usageMu.RLock()
selectedLimits, tierName, ok := jsa.selectLimits(cfg)
selectedLimits, tierName, ok := jsa.selectLimits(replicas)
jsa.usageMu.RUnlock()
if !ok {
@@ -5976,7 +6006,11 @@ func (acc *Account) selectLimits(cfg *StreamConfig) (*JetStreamAccountLimits, st
// Read lock needs to be held
func (js *jetStream) jsClusteredStreamLimitsCheck(acc *Account, cfg *StreamConfig) *ApiError {
selectedLimits, tier, _, apiErr := acc.selectLimits(cfg)
var replicas int
if cfg != nil {
replicas = cfg.Replicas
}
selectedLimits, tier, _, apiErr := acc.selectLimits(replicas)
if apiErr != nil {
return apiErr
}
@@ -7113,7 +7147,7 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec
s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
return
}
selectedLimits, _, _, apiErr := acc.selectLimits(&streamCfg)
selectedLimits, _, _, apiErr := acc.selectLimits(cfg.replicas(&streamCfg))
if apiErr != nil {
resp.Error = apiErr
s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
@@ -7144,25 +7178,45 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec
return
}
// Was a consumer name provided?
var oname string
if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ {
if cfg.Name != _EMPTY_ {
oname = cfg.Name
} else {
oname = cfg.Durable
}
}
// Check for max consumers here to short circuit if possible.
// Start with limit on a stream, but if one is defined at the level of the account
// and is lower, use that limit.
maxc := sa.Config.MaxConsumers
if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) {
maxc = selectedLimits.MaxConsumers
}
if maxc > 0 {
// Don't count DIRECTS.
total := 0
for _, ca := range sa.consumers {
if ca.Config != nil && !ca.Config.Direct {
total++
}
if action == ActionCreate || action == ActionCreateOrUpdate {
maxc := sa.Config.MaxConsumers
if maxc <= 0 || (selectedLimits.MaxConsumers > 0 && selectedLimits.MaxConsumers < maxc) {
maxc = selectedLimits.MaxConsumers
}
if total >= maxc {
resp.Error = NewJSMaximumConsumersLimitError()
s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
return
if maxc > 0 {
// Don't count DIRECTS.
total := 0
for cn, ca := range sa.consumers {
if action == ActionCreateOrUpdate {
// If the consumer name is specified and we think it already exists, then
// we're likely updating an existing consumer, so don't count it. Otherwise
// we will incorrectly return NewJSMaximumConsumersLimitError for an update.
if oname != _EMPTY_ && cn == oname && sa.consumers[oname] != nil {
continue
}
}
if ca.Config != nil && !ca.Config.Direct {
total++
}
}
if total >= maxc {
resp.Error = NewJSMaximumConsumersLimitError()
s.sendAPIErrResponse(ci, acc, subject, reply, string(rmsg), s.jsonResponse(&resp))
return
}
}
}
@@ -7189,16 +7243,10 @@ func (s *Server) jsClusteredConsumerRequest(ci *ClientInfo, acc *Account, subjec
}
var ca *consumerAssignment
var oname string
// See if we have an existing one already under same durable name or
// if name was set by the user.
if isDurableConsumer(cfg) || cfg.Name != _EMPTY_ {
if cfg.Name != _EMPTY_ {
oname = cfg.Name
} else {
oname = cfg.Durable
}
if oname != _EMPTY_ {
if ca = sa.consumers[oname]; ca != nil && !ca.deleted {
if action == ActionCreate && !reflect.DeepEqual(cfg, ca.Config) {
resp.Error = NewJSConsumerAlreadyExistsError()
@@ -7615,7 +7663,10 @@ func (mset *stream) stateSnapshot() []byte {
func (mset *stream) stateSnapshotLocked() []byte {
// Decide if we can support the new style of stream snapshots.
if mset.supportsBinarySnapshotLocked() {
snap, _ := mset.store.EncodedStreamState(mset.getCLFS())
snap, err := mset.store.EncodedStreamState(mset.getCLFS())
if err != nil {
return nil
}
return snap
}
@@ -7707,7 +7758,7 @@ func (mset *stream) processClusteredInboundMsg(subject, reply string, hdr, msg [
if err == nil {
err = NewJSAccountResourcesExceededError()
}
s.RateLimitWarnf(err.Error())
s.RateLimitWarnf("JetStream account limits exceeded for '%s': %s", jsa.acc().GetName(), err.Error())
if canRespond {
var resp = &JSPubAckResponse{PubAck: &PubAck{Stream: name}}
resp.Error = err
@@ -8085,8 +8136,11 @@ func (mset *stream) processSnapshot(snap *StreamReplicatedState) (e error) {
var sub *subscription
var err error
const activityInterval = 30 * time.Second
notActive := time.NewTimer(activityInterval)
const (
startInterval = 5 * time.Second
activityInterval = 30 * time.Second
)
notActive := time.NewTimer(startInterval)
defer notActive.Stop()
defer func() {
@@ -8169,7 +8223,7 @@ RETRY:
default:
}
}
notActive.Reset(activityInterval)
notActive.Reset(startInterval)
// Grab sync request again on failures.
if sreq == nil {
@@ -8214,8 +8268,10 @@ RETRY:
// Send our sync request.
b, _ := json.Marshal(sreq)
s.sendInternalMsgLocked(subject, reply, nil, b)
// Remember when we sent this out to avoid loop spins on errors below.
reqSendTime := time.Now()
// Clear our sync request.
sreq = nil
@@ -8764,7 +8820,7 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
done = maxOutMsgs-atomic.LoadInt32(&outm) > minBatchWait
if !done {
// Wait for a small bit.
time.Sleep(50 * time.Millisecond)
time.Sleep(100 * time.Millisecond)
} else {
// GC friendly.
mw.Stop()
@@ -8853,7 +8909,9 @@ func (mset *stream) runCatchup(sendSubject string, sreq *streamSyncRequest) {
mset.account(), mset.name(), seq, state)
// Try our best to redo our invalidated snapshot as well.
if n := mset.raftNode(); n != nil {
n.InstallSnapshot(mset.stateSnapshot())
if snap := mset.stateSnapshot(); snap != nil {
n.InstallSnapshot(snap)
}
}
// If we allow gap markers check if we have one pending.
if drOk && dr.First > 0 {
+6
View File
@@ -584,6 +584,9 @@ func (s *Server) clearObserverState(remote *leafNodeCfg) {
return
}
acc.jscmMu.Lock()
defer acc.jscmMu.Unlock()
// Walk all streams looking for any clustered stream, skip otherwise.
for _, mset := range acc.streams() {
node := mset.raftNode()
@@ -619,6 +622,9 @@ func (s *Server) checkJetStreamMigrate(remote *leafNodeCfg) {
return
}
acc.jscmMu.Lock()
defer acc.jscmMu.Unlock()
// Walk all streams looking for any clustered stream, skip otherwise.
// If we are the leader force stepdown.
for _, mset := range acc.streams() {
+110 -49
View File
@@ -261,7 +261,7 @@ func (ms *memStore) SkipMsg() uint64 {
ms.state.LastSeq = seq
ms.state.LastTime = now
if ms.state.Msgs == 0 {
ms.state.FirstSeq = seq
ms.state.FirstSeq = seq + 1
ms.state.FirstTime = now
} else {
ms.dmap.Insert(seq)
@@ -389,9 +389,9 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
}
}
tsa := [32]string{}
fsa := [32]string{}
fts := tokenizeSubjectIntoSlice(fsa[:0], filter)
_tsa, _fsa := [32]string{}, [32]string{}
tsa, fsa := _tsa[:0], _fsa[:0]
fsa = tokenizeSubjectIntoSlice(fsa[:0], filter)
wc := subjectHasWildcard(filter)
// 1. See if we match any subs from fss.
@@ -405,8 +405,8 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
if !wc {
return subj == filter
}
tts := tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tts, fts)
tsa = tokenizeSubjectIntoSlice(tsa[:0], subj)
return isSubsetMatchTokenized(tsa, fsa)
}
update := func(fss *SimpleState) {
@@ -426,9 +426,8 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
var havePartial bool
// We will track start and end sequences as we go.
ms.fss.Match(stringToBytes(filter), func(subj []byte, fss *SimpleState) {
subjs := bytesToString(subj)
if fss.firstNeedsUpdate {
ms.recalculateFirstForSubj(subjs, fss.First, fss)
ms.recalculateFirstForSubj(bytesToString(subj), fss.First, fss)
}
if sseq <= fss.First {
update(fss)
@@ -465,14 +464,28 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
}
if toScan < toExclude {
ss.Msgs, ss.First = 0, 0
for seq := first; seq <= last; seq++ {
if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && isMatch(sm.subj) {
ss.Msgs++
if ss.First == 0 {
ss.First = seq
update := func(sm *StoreMsg) {
ss.Msgs++
if ss.First == 0 {
ss.First = sm.seq
}
if seen != nil {
seen[sm.subj] = true
}
}
// Check if easier to just scan msgs vs the sequence range.
// This can happen with lots of interior deletes.
if last-first > uint64(len(ms.msgs)) {
for _, sm := range ms.msgs {
if sm.seq >= first && sm.seq <= last && !seen[sm.subj] && isMatch(sm.subj) {
update(sm)
}
if seen != nil {
seen[sm.subj] = true
}
} else {
for seq := first; seq <= last; seq++ {
if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && isMatch(sm.subj) {
update(sm)
}
}
}
@@ -482,17 +495,29 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
var adjust uint64
var tss *SimpleState
for seq := ms.state.FirstSeq; seq < first; seq++ {
if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && isMatch(sm.subj) {
if lastPerSubject {
tss, _ = ms.fss.Find(stringToBytes(sm.subj))
update := func(sm *StoreMsg) {
if lastPerSubject {
tss, _ = ms.fss.Find(stringToBytes(sm.subj))
}
// If we are last per subject, make sure to only adjust if all messages are before our first.
if tss == nil || tss.Last < first {
adjust++
}
if seen != nil {
seen[sm.subj] = true
}
}
// Check if easier to just scan msgs vs the sequence range.
if first-ms.state.FirstSeq > uint64(len(ms.msgs)) {
for _, sm := range ms.msgs {
if sm.seq < first && !seen[sm.subj] && isMatch(sm.subj) {
update(sm)
}
// If we are last per subject, make sure to only adjust if all messages are before our first.
if tss == nil || tss.Last < first {
adjust++
}
if seen != nil {
seen[sm.subj] = true
}
} else {
for seq := ms.state.FirstSeq; seq < first; seq++ {
if sm, ok := ms.msgs[seq]; ok && !seen[sm.subj] && isMatch(sm.subj) {
update(sm)
}
}
}
@@ -507,10 +532,27 @@ func (ms *memStore) filteredStateLocked(sseq uint64, filter string, lastPerSubje
}
ss.Msgs -= adjust
if needScanFirst {
for seq := first; seq < last; seq++ {
if sm, ok := ms.msgs[seq]; ok && isMatch(sm.subj) {
ss.First = seq
break
// Check if easier to just scan msgs vs the sequence range.
// Since we will need to scan all of the msgs vs below where we break on the first match,
// we will only do so if a few orders of magnitude lower.
if last-first > 100*uint64(len(ms.msgs)) {
low := ms.state.LastSeq
for _, sm := range ms.msgs {
if sm.seq >= first && sm.seq < last && isMatch(sm.subj) {
if sm.seq < low {
low = sm.seq
}
}
}
if low < ms.state.LastSeq {
ss.First = low
}
} else {
for seq := first; seq < last; seq++ {
if sm, ok := ms.msgs[seq]; ok && isMatch(sm.subj) {
ss.First = seq
break
}
}
}
}
@@ -559,9 +601,9 @@ func (ms *memStore) SubjectsTotals(filterSubject string) map[string]uint64 {
return nil
}
tsa := [32]string{}
fsa := [32]string{}
fts := tokenizeSubjectIntoSlice(fsa[:0], filterSubject)
_tsa, _fsa := [32]string{}, [32]string{}
tsa, fsa := _tsa[:0], _fsa[:0]
fsa = tokenizeSubjectIntoSlice(fsa[:0], filterSubject)
isAll := filterSubject == _EMPTY_ || filterSubject == fwcs
fst := make(map[string]uint64)
@@ -570,7 +612,7 @@ func (ms *memStore) SubjectsTotals(filterSubject string) map[string]uint64 {
if isAll {
fst[subjs] = ss.Msgs
} else {
if tts := tokenizeSubjectIntoSlice(tsa[:0], subjs); isSubsetMatchTokenized(tts, fts) {
if tsa = tokenizeSubjectIntoSlice(tsa[:0], subjs); isSubsetMatchTokenized(tsa, fsa) {
fst[subjs] = ss.Msgs
}
}
@@ -1176,7 +1218,11 @@ func (ms *memStore) removeSeqPerSubject(subj string, seq uint64) {
// Will recalculate the first sequence for this subject in this block.
// Lock should be held.
func (ms *memStore) recalculateFirstForSubj(subj string, startSeq uint64, ss *SimpleState) {
for tseq := startSeq + 1; tseq <= ss.Last; tseq++ {
tseq := startSeq + 1
if tseq < ms.state.FirstSeq {
tseq = ms.state.FirstSeq
}
for ; tseq <= ss.Last; tseq++ {
if sm := ms.msgs[tseq]; sm != nil && sm.subj == subj {
ss.First = tseq
ss.firstNeedsUpdate = false
@@ -1509,7 +1555,8 @@ func (o *consumerMemStore) UpdateDelivered(dseq, sseq, dc uint64, ts int64) erro
// Check for an update to a message already delivered.
if sseq <= o.state.Delivered.Stream {
if p = o.state.Pending[sseq]; p != nil {
p.Sequence, p.Timestamp = dseq, ts
// Do not update p.Sequence, that should be the original delivery sequence.
p.Timestamp = ts
}
} else {
// Add to pending.
@@ -1558,23 +1605,38 @@ func (o *consumerMemStore) UpdateAcks(dseq, sseq uint64) error {
if o.cfg.AckPolicy == AckNone {
return ErrNoAckPolicy
}
if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
return ErrStoreMsgNotFound
}
// On restarts the old leader may get a replay from the raft logs that are old.
if dseq <= o.state.AckFloor.Consumer {
return nil
}
// Match leader logic on checking if ack is ahead of delivered.
// This could happen on a cooperative takeover with high speed deliveries.
if sseq > o.state.Delivered.Stream {
o.state.Delivered.Stream = sseq + 1
}
if len(o.state.Pending) == 0 || o.state.Pending[sseq] == nil {
delete(o.state.Redelivered, sseq)
return ErrStoreMsgNotFound
}
// Check for AckAll here.
if o.cfg.AckPolicy == AckAll {
sgap := sseq - o.state.AckFloor.Stream
o.state.AckFloor.Consumer = dseq
o.state.AckFloor.Stream = sseq
for seq := sseq; seq > sseq-sgap; seq-- {
delete(o.state.Pending, seq)
if len(o.state.Redelivered) > 0 {
if sgap > uint64(len(o.state.Pending)) {
for seq := range o.state.Pending {
if seq <= sseq {
delete(o.state.Pending, seq)
delete(o.state.Redelivered, seq)
}
}
} else {
for seq := sseq; seq > sseq-sgap && len(o.state.Pending) > 0; seq-- {
delete(o.state.Pending, seq)
delete(o.state.Redelivered, seq)
}
}
@@ -1586,23 +1648,20 @@ func (o *consumerMemStore) UpdateAcks(dseq, sseq uint64) error {
// First delete from our pending state.
if p, ok := o.state.Pending[sseq]; ok {
delete(o.state.Pending, sseq)
dseq = p.Sequence // Use the original.
}
// Now remove from redelivered.
if len(o.state.Redelivered) > 0 {
delete(o.state.Redelivered, sseq)
if dseq > p.Sequence && p.Sequence > 0 {
dseq = p.Sequence // Use the original.
}
}
if len(o.state.Pending) == 0 {
o.state.AckFloor.Consumer = o.state.Delivered.Consumer
o.state.AckFloor.Stream = o.state.Delivered.Stream
} else if dseq == o.state.AckFloor.Consumer+1 {
first := o.state.AckFloor.Consumer == 0
o.state.AckFloor.Consumer = dseq
o.state.AckFloor.Stream = sseq
if !first && o.state.Delivered.Consumer > dseq {
for ss := sseq + 1; ss < o.state.Delivered.Stream; ss++ {
if o.state.Delivered.Consumer > dseq {
for ss := sseq + 1; ss <= o.state.Delivered.Stream; ss++ {
if p, ok := o.state.Pending[ss]; ok {
if p.Sequence > 0 {
o.state.AckFloor.Consumer = p.Sequence - 1
@@ -1613,6 +1672,8 @@ func (o *consumerMemStore) UpdateAcks(dseq, sseq uint64) error {
}
}
}
// We do these regardless.
delete(o.state.Redelivered, sseq)
return nil
}
+156
View File
@@ -1387,6 +1387,8 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
var srcUrl string
if gitCommit == _EMPTY_ {
srcUrl = "https://github.com/nats-io/nats-server"
} else if serverVersion != _EMPTY_ {
srcUrl = fmt.Sprintf("https://github.com/nats-io/nats-server/tree/%s", serverVersion)
} else {
srcUrl = fmt.Sprintf("https://github.com/nats-io/nats-server/tree/%s", gitCommit)
}
@@ -1421,6 +1423,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
<a href=.%s>Routes</a>
<a href=.%s>LeafNodes</a>
<a href=.%s>Gateways</a>
<a href=.%s>Raft Groups</a>
<a href=.%s class=last>Health Probe</a>
<a href=https://docs.nats.io/running-a-nats-service/nats_admin/monitoring class="help">Help</a>
</body>
@@ -1436,6 +1439,7 @@ func (s *Server) HandleRoot(w http.ResponseWriter, r *http.Request) {
s.basePath(RoutezPath),
s.basePath(LeafzPath),
s.basePath(GatewayzPath),
s.basePath(RaftzPath),
s.basePath(HealthzPath),
)
}
@@ -3490,6 +3494,23 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus {
return health
}
// Are we still recovering meta layer?
if js.isMetaRecovering() {
if !details {
health.Status = na
health.Error = "JetStream is still recovering meta layer"
} else {
health.Errors = []HealthzError{
{
Type: HealthzErrorJetStream,
Error: "JetStream is still recovering meta layer",
},
}
}
return health
}
// Range across all accounts, the streams assigned to them, and the consumers.
// If they are assigned to this server check their status.
ourID := meta.ID()
@@ -3724,3 +3745,138 @@ func (s *Server) profilez(opts *ProfilezOptions) *ProfilezStatus {
Profile: buffer.Bytes(),
}
}
// RaftzGroup is the JSON-serializable snapshot of a single Raft group's
// state as reported by the /raftz monitoring endpoint (see HandleRaftz).
// Fields mirror the internal *raft node state at the moment the snapshot
// was taken under the node's read lock.
type RaftzGroup struct {
ID string `json:"id"`
State string `json:"state"`
Size int `json:"size"`
QuorumNeeded int `json:"quorum_needed"`
Observer bool `json:"observer,omitempty"`
Paused bool `json:"paused,omitempty"`
Committed uint64 `json:"committed"`
Applied uint64 `json:"applied"`
CatchingUp bool `json:"catching_up,omitempty"`
Leader string `json:"leader,omitempty"`
EverHadLeader bool `json:"ever_had_leader"`
Term uint64 `json:"term"`
Vote string `json:"voted_for,omitempty"`
PTerm uint64 `json:"pterm"`
PIndex uint64 `json:"pindex"`
IPQPropLen int `json:"ipq_proposal_len"`
IPQEntryLen int `json:"ipq_entry_len"`
IPQRespLen int `json:"ipq_resp_len"`
IPQApplyLen int `json:"ipq_apply_len"`
WAL StreamState `json:"wal"`
// NOTE(review): an `error` value does not marshal to anything useful with
// encoding/json (the interface has no exported fields) — verify upstream
// whether this was intended to be a string.
WALError error `json:"wal_error,omitempty"`
Peers map[string]RaftzGroupPeer `json:"peers"`
}
// RaftzGroupPeer describes one peer of a Raft group, keyed by peer ID in
// RaftzGroup.Peers. LastSeen is a human-readable elapsed duration since the
// peer was last heard from; it is empty if the peer has never been seen.
type RaftzGroupPeer struct {
Name string `json:"name"`
Known bool `json:"known"`
LastReplicatedIndex uint64 `json:"last_replicated_index,omitempty"`
LastSeen string `json:"last_seen,omitempty"`
}
// HandleRaftz serves the /raftz monitoring endpoint. It renders, as indented
// JSON, the state of the server's Raft groups keyed by account name and then
// group name. Two optional query parameters filter the output:
//
//	group — restrict to a single Raft group ID
//	acc   — restrict to groups belonging to this account
//	        (defaults to the system account)
//
// Responds 404 when no Raft nodes are registered or no group matches.
func (s *Server) HandleRaftz(w http.ResponseWriter, r *http.Request) {
if s.raftNodes == nil {
w.WriteHeader(404)
w.Write([]byte("No Raft nodes registered"))
return
}
gfilter := r.URL.Query().Get("group")
afilter := r.URL.Query().Get("acc")
if afilter == "" {
// No account filter supplied: default to the system account, where
// the meta group and most operator-visible assets live.
afilter = s.SystemAccount().Name
}
groups := map[string]RaftNode{}
infos := map[string]map[string]RaftzGroup{} // account -> group ID
// Collect matching nodes under the registry read lock, but build the
// per-group snapshots afterwards so we don't hold rnMu while locking
// individual nodes.
s.rnMu.RLock()
if gfilter != _EMPTY_ {
if rg, ok := s.raftNodes[gfilter]; ok && rg != nil {
if n, ok := rg.(*raft); ok {
if n.accName == afilter {
groups[gfilter] = rg
}
}
}
} else {
for name, rg := range s.raftNodes {
if rg == nil {
continue
}
if n, ok := rg.(*raft); ok {
if n.accName != afilter {
continue
}
groups[name] = rg
}
}
}
s.rnMu.RUnlock()
if len(groups) == 0 {
w.WriteHeader(404)
w.Write([]byte("No Raft nodes found, does the specified account/group exist?"))
return
}
for name, rg := range groups {
n, ok := rg.(*raft)
if n == nil || !ok {
continue
}
if _, ok := infos[n.accName]; !ok {
infos[n.accName] = map[string]RaftzGroup{}
}
// Only take the lock once, using the public RaftNode functions would
// cause us to take and release the locks over and over again.
n.RLock()
info := RaftzGroup{
ID: n.id,
State: RaftState(n.state.Load()).String(),
Size: n.csz,
QuorumNeeded: n.qn,
Observer: n.observer,
Paused: n.paused,
Committed: n.commit,
Applied: n.applied,
CatchingUp: n.catchup != nil,
Leader: n.leader,
EverHadLeader: n.pleader,
Term: n.term,
Vote: n.vote,
PTerm: n.pterm,
PIndex: n.pindex,
IPQPropLen: n.prop.len(),
IPQEntryLen: n.entry.len(),
IPQRespLen: n.resp.len(),
IPQApplyLen: n.apply.len(),
WALError: n.werr,
Peers: map[string]RaftzGroupPeer{},
}
n.wal.FastState(&info.WAL)
for id, p := range n.peers {
// Skip ourselves; Peers lists only the other members.
if id == n.id {
continue
}
peer := RaftzGroupPeer{
Name: s.serverNameForNode(id),
Known: p.kp,
LastReplicatedIndex: p.li,
}
// p.ts appears to be a Unix-nanosecond timestamp of last contact;
// zero means the peer has never been seen — TODO confirm against
// the raft peer struct definition.
if p.ts > 0 {
peer.LastSeen = time.Since(time.Unix(0, p.ts)).String()
}
info.Peers[id] = peer
}
n.RUnlock()
infos[n.accName][name] = info
}
// Marshal error deliberately ignored: infos is built from plain maps and
// value types, so encoding cannot realistically fail here.
b, _ := json.MarshalIndent(infos, "", " ")
ResponseHandler(w, r, b)
}
+29 -10
View File
@@ -974,7 +974,7 @@ func (s *Server) mqttHandleClosedClient(c *client) {
// This needs to be done outside of any lock.
if doClean {
if err := sess.clear(); err != nil {
if err := sess.clear(true); err != nil {
c.Errorf(err.Error())
}
}
@@ -1449,7 +1449,7 @@ func (s *Server) mqttCreateAccountSessionManager(acc *Account, quitCh chan struc
// Opportunistically delete the old (legacy) consumer, from v2.10.10 and
// before. Ignore any errors that might arise.
rmLegacyDurName := mqttRetainedMsgsStreamName + "_" + jsa.id
jsa.deleteConsumer(mqttRetainedMsgsStreamName, rmLegacyDurName)
jsa.deleteConsumer(mqttRetainedMsgsStreamName, rmLegacyDurName, true)
// Create a new, uniquely names consumer for retained messages for this
// server. The prior one will expire eventually.
@@ -1672,8 +1672,21 @@ func (jsa *mqttJSA) createDurableConsumer(cfg *CreateConsumerRequest) (*JSApiCon
return ccr, ccr.ToError()
}
func (jsa *mqttJSA) deleteConsumer(streamName, consName string) (*JSApiConsumerDeleteResponse, error) {
// sendMsg enqueues a fire-and-forget JetStream API message on the account's
// internal send queue. Unlike newRequest it does not wait for (or expect) a
// reply; callers use it for best-effort cleanup such as consumer deletion.
// A no-op when subj is empty.
func (jsa *mqttJSA) sendMsg(subj string, msg []byte) {
if subj == _EMPTY_ {
return
}
// hdr: -1 signals no headers are present on this internal publish —
// presumably matches mqttJSPubMsg conventions elsewhere; verify.
jsa.sendq.push(&mqttJSPubMsg{subj: subj, msg: msg, hdr: -1})
}
// if noWait is specified, does not wait for the JS response, returns nil
func (jsa *mqttJSA) deleteConsumer(streamName, consName string, noWait bool) (*JSApiConsumerDeleteResponse, error) {
subj := fmt.Sprintf(JSApiConsumerDeleteT, streamName, consName)
if noWait {
jsa.sendMsg(subj, nil)
return nil, nil
}
cdri, err := jsa.newRequest(mqttJSAConsumerDel, subj, 0, nil)
if err != nil {
return nil, err
@@ -1950,9 +1963,13 @@ func (as *mqttAccountSessionManager) processRetainedMsg(_ *subscription, c *clie
}
// If lastSeq is 0 (nothing to recover, or done doing it) and this is
// from our own server, ignore.
as.mu.RLock()
if as.rrmLastSeq == 0 && rm.Origin == as.jsa.id {
as.mu.RUnlock()
return
}
as.mu.RUnlock()
// At this point we either recover from our own server, or process a remote retained message.
seq, _, _ := ackReplyInfo(reply)
@@ -1960,11 +1977,13 @@ func (as *mqttAccountSessionManager) processRetainedMsg(_ *subscription, c *clie
as.handleRetainedMsg(rm.Subject, &mqttRetainedMsgRef{sseq: seq}, rm, false)
// If we were recovering (lastSeq > 0), then check if we are done.
as.mu.Lock()
if as.rrmLastSeq > 0 && seq >= as.rrmLastSeq {
as.rrmLastSeq = 0
close(as.rrmDoneCh)
as.rrmDoneCh = nil
}
as.mu.Unlock()
}
func (as *mqttAccountSessionManager) processRetainedMsgDel(_ *subscription, c *client, _ *Account, subject, reply string, rmsg []byte) {
@@ -3072,7 +3091,7 @@ func (sess *mqttSession) save() error {
//
// Runs from the client's readLoop.
// Lock not held on entry, but session is in the locked map.
func (sess *mqttSession) clear() error {
func (sess *mqttSession) clear(noWait bool) error {
var durs []string
var pubRelDur string
@@ -3100,19 +3119,19 @@ func (sess *mqttSession) clear() error {
sess.mu.Unlock()
for _, dur := range durs {
if _, err := sess.jsa.deleteConsumer(mqttStreamName, dur); isErrorOtherThan(err, JSConsumerNotFoundErr) {
if _, err := sess.jsa.deleteConsumer(mqttStreamName, dur, noWait); isErrorOtherThan(err, JSConsumerNotFoundErr) {
return fmt.Errorf("unable to delete consumer %q for session %q: %v", dur, sess.id, err)
}
}
if pubRelDur != "" {
_, err := sess.jsa.deleteConsumer(mqttOutStreamName, pubRelDur)
if pubRelDur != _EMPTY_ {
_, err := sess.jsa.deleteConsumer(mqttOutStreamName, pubRelDur, noWait)
if isErrorOtherThan(err, JSConsumerNotFoundErr) {
return fmt.Errorf("unable to delete consumer %q for session %q: %v", pubRelDur, sess.id, err)
}
}
if seq > 0 {
err := sess.jsa.deleteMsg(mqttSessStreamName, seq, true)
err := sess.jsa.deleteMsg(mqttSessStreamName, seq, !noWait)
// Ignore the various errors indicating that the message (or sequence)
// is already deleted, can happen in a cluster.
if isErrorOtherThan(err, JSSequenceNotFoundErrF) {
@@ -3378,7 +3397,7 @@ func (sess *mqttSession) untrackPubRel(pi uint16) (jsAckSubject string) {
func (sess *mqttSession) deleteConsumer(cc *ConsumerConfig) {
sess.mu.Lock()
sess.tmaxack -= cc.MaxAckPending
sess.jsa.sendq.push(&mqttJSPubMsg{subj: sess.jsa.prefixDomain(fmt.Sprintf(JSApiConsumerDeleteT, mqttStreamName, cc.Durable))})
sess.jsa.deleteConsumer(mqttStreamName, cc.Durable, true)
sess.mu.Unlock()
}
@@ -3717,7 +3736,7 @@ CHECK:
// This Session lasts as long as the Network Connection. State data
// associated with this Session MUST NOT be reused in any subsequent
// Session.
if err := es.clear(); err != nil {
if err := es.clear(false); err != nil {
asm.removeSession(es, true)
return err
}
+125 -68
View File
@@ -38,7 +38,7 @@ import (
type RaftNode interface {
Propose(entry []byte) error
ProposeDirect(entries []*Entry) error
ProposeMulti(entries []*Entry) error
ForwardProposal(entry []byte) error
InstallSnapshot(snap []byte) error
SendSnapshot(snap []byte) error
@@ -85,6 +85,7 @@ type WAL interface {
RemoveMsg(index uint64) (bool, error)
Compact(index uint64) (uint64, error)
Purge() (uint64, error)
PurgeEx(subject string, seq, keep uint64) (uint64, error)
Truncate(seq uint64) error
State() StreamState
FastState(*StreamState)
@@ -155,25 +156,27 @@ type raft struct {
llqrt time.Time // Last quorum lost time
lsut time.Time // Last scale-up time
term uint64 // The current vote term
pterm uint64 // Previous term from the last snapshot
pindex uint64 // Previous index from the last snapshot
commit uint64 // Sequence number of the most recent commit
applied uint64 // Sequence number of the most recently applied commit
hcbehind bool // Were we falling behind at the last health check? (see: isCurrent)
term uint64 // The current vote term
pterm uint64 // Previous term from the last snapshot
pindex uint64 // Previous index from the last snapshot
commit uint64 // Index of the most recent commit
applied uint64 // Index of the most recently applied commit
leader string // The ID of the leader
vote string // Our current vote state
lxfer bool // Are we doing a leadership transfer?
hcbehind bool // Were we falling behind at the last health check? (see: isCurrent)
s *Server // Reference to top-level server
c *client // Internal client for subscriptions
js *jetStream // JetStream, if running, to see if we are out of resources
dflag bool // Debug flag
pleader bool // Has the group ever had a leader?
observer bool // The node is observing, i.e. not participating in voting
extSt extensionState // Extension state
dflag bool // Debug flag
pleader bool // Has the group ever had a leader?
observer bool // The node is observing, i.e. not participating in voting
extSt extensionState // Extension state
psubj string // Proposals subject
rpsubj string // Remove peers subject
@@ -232,16 +235,18 @@ const (
hbIntervalDefault = 1 * time.Second
lostQuorumIntervalDefault = hbIntervalDefault * 10 // 10 seconds
lostQuorumCheckIntervalDefault = hbIntervalDefault * 10 // 10 seconds
observerModeIntervalDefault = 48 * time.Hour
)
var (
minElectionTimeout = minElectionTimeoutDefault
maxElectionTimeout = maxElectionTimeoutDefault
minCampaignTimeout = minCampaignTimeoutDefault
maxCampaignTimeout = maxCampaignTimeoutDefault
hbInterval = hbIntervalDefault
lostQuorumInterval = lostQuorumIntervalDefault
lostQuorumCheck = lostQuorumCheckIntervalDefault
minElectionTimeout = minElectionTimeoutDefault
maxElectionTimeout = maxElectionTimeoutDefault
minCampaignTimeout = minCampaignTimeoutDefault
maxCampaignTimeout = maxCampaignTimeoutDefault
hbInterval = hbIntervalDefault
lostQuorumInterval = lostQuorumIntervalDefault
lostQuorumCheck = lostQuorumCheckIntervalDefault
observerModeInterval = observerModeIntervalDefault
)
type RaftConfig struct {
@@ -270,6 +275,7 @@ var (
errLeaderLen = fmt.Errorf("raft: leader should be exactly %d bytes", idLen)
errTooManyEntries = errors.New("raft: append entry can contain a max of 64k entries")
errBadAppendEntry = errors.New("raft: append entry corrupt")
errNoInternalClient = errors.New("raft: no internal client")
)
// This will bootstrap a raftNode by writing its config into the store directory.
@@ -387,7 +393,7 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe
apply: newIPQueue[*CommittedEntry](s, qpfx+"committedEntry"),
stepdown: newIPQueue[string](s, qpfx+"stepdown"),
accName: accName,
leadc: make(chan bool, 1),
leadc: make(chan bool, 32),
observer: cfg.Observer,
extSt: ps.domainExt,
}
@@ -414,7 +420,8 @@ func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabe
return nil, fmt.Errorf("could not create snapshots directory - %v", err)
}
// Can't recover snapshots if memory based.
// Can't recover snapshots if memory based since wal will be reset.
// We will inherit from the current leader.
if _, ok := n.wal.(*memStore); ok {
os.Remove(filepath.Join(n.sd, snapshotsDir, "*"))
} else {
@@ -692,36 +699,34 @@ func (n *raft) Propose(data []byte) error {
n.debug("Proposal ignored, not leader (state: %v)", state)
return errNotLeader
}
n.RLock()
n.Lock()
defer n.Unlock()
// Error if we had a previous write error.
if werr := n.werr; werr != nil {
n.RUnlock()
return werr
}
prop := n.prop
n.RUnlock()
prop.push(newEntry(EntryNormal, data))
n.prop.push(newEntry(EntryNormal, data))
return nil
}
// ProposeDirect will propose entries directly by skipping the Raft state
// machine and sending them straight to the wire instead.
// ProposeDirect will propose multiple entries at once.
// This should only be called on the leader.
func (n *raft) ProposeDirect(entries []*Entry) error {
func (n *raft) ProposeMulti(entries []*Entry) error {
if state := n.State(); state != Leader {
n.debug("Direct proposal ignored, not leader (state: %v)", state)
return errNotLeader
}
n.RLock()
n.Lock()
defer n.Unlock()
// Error if we had a previous write error.
if werr := n.werr; werr != nil {
n.RUnlock()
return werr
}
n.RUnlock()
n.sendAppendEntry(entries)
for _, e := range entries {
n.prop.push(e)
}
return nil
}
@@ -871,7 +876,7 @@ func (n *raft) PauseApply() error {
n.hcommit = n.commit
// Also prevent us from trying to become a leader while paused and catching up.
n.pobserver, n.observer = n.observer, true
n.resetElect(48 * time.Hour)
n.resetElect(observerModeInterval)
return nil
}
@@ -1012,25 +1017,20 @@ func (n *raft) InstallSnapshot(data []byte) error {
}
n.Lock()
defer n.Unlock()
// If a write error has occurred already then stop here.
if werr := n.werr; werr != nil {
n.Unlock()
return werr
}
// Check that a catchup isn't already taking place. If it is then we won't
// allow installing snapshots until it is done.
if len(n.progress) > 0 {
n.Unlock()
return errCatchupsRunning
}
var state StreamState
n.wal.FastState(&state)
if n.applied == 0 {
n.Unlock()
return errNoSnapAvailable
}
@@ -1055,6 +1055,12 @@ func (n *raft) InstallSnapshot(data []byte) error {
data: data,
}
return n.installSnapshot(snap)
}
// Install the snapshot.
// Lock should be held.
func (n *raft) installSnapshot(snap *snapshot) error {
snapDir := filepath.Join(n.sd, snapshotsDir)
sn := fmt.Sprintf(snapFileT, snap.lastTerm, snap.lastIndex)
sfile := filepath.Join(snapDir, sn)
@@ -1064,29 +1070,21 @@ func (n *raft) InstallSnapshot(data []byte) error {
dios <- struct{}{}
if err != nil {
n.Unlock()
// We could set write err here, but if this is a temporary situation, too many open files etc.
// we want to retry and snapshots are not fatal.
return err
}
// Delete our previous snapshot file if it exists.
if n.snapfile != _EMPTY_ && n.snapfile != sfile {
os.Remove(n.snapfile)
}
// Remember our latest snapshot file.
n.snapfile = sfile
if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil {
n.setWriteErrLocked(err)
n.Unlock()
return err
}
n.Unlock()
psnaps, _ := os.ReadDir(snapDir)
// Remove any old snapshots.
for _, fi := range psnaps {
pn := fi.Name()
if pn != sn {
os.Remove(filepath.Join(snapDir, pn))
}
}
return nil
}
@@ -1628,6 +1626,13 @@ func (n *raft) shutdown(shouldDelete bool) {
// allowing shutdown() to be called again. If that happens then the below
// close(n.quit) will panic from trying to close an already-closed channel.
if n.state.Swap(int32(Closed)) == int32(Closed) {
// If we get called again with shouldDelete, in case we were called first with Stop() cleanup
if shouldDelete {
if wal := n.wal; wal != nil {
wal.Delete()
}
os.RemoveAll(n.sd)
}
n.Unlock()
return
}
@@ -1644,17 +1649,22 @@ func (n *raft) shutdown(shouldDelete bool) {
n.unsubscribe(sub)
}
c.closeConnection(InternalClient)
n.c = nil
}
s, g, wal := n.s, n.group, n.wal
// Unregistering ipQueues do not prevent them from push/pop
// just will remove them from the central monitoring map
queues := []interface {
unregister()
drain()
}{n.reqs, n.votes, n.prop, n.entry, n.resp, n.apply, n.stepdown}
for _, q := range queues {
q.drain()
q.unregister()
}
sd := n.sd
n.Unlock()
s.unregisterRaftNode(g)
@@ -1669,7 +1679,7 @@ func (n *raft) shutdown(shouldDelete bool) {
if shouldDelete {
// Delete all our peer state and vote state and any snapshots.
os.RemoveAll(n.sd)
os.RemoveAll(sd)
n.debug("Deleted")
} else {
n.debug("Shutdown")
@@ -1724,12 +1734,15 @@ func (n *raft) newInbox() string {
// Our internal subscribe.
// Lock should be held.
func (n *raft) subscribe(subject string, cb msgHandler) (*subscription, error) {
if n.c == nil {
return nil, errNoInternalClient
}
return n.s.systemSubscribe(subject, _EMPTY_, false, n.c, cb)
}
// Lock should be held.
func (n *raft) unsubscribe(sub *subscription) {
if sub != nil {
if n.c != nil && sub != nil {
n.c.processUnsub(sub.sid)
}
}
@@ -1888,8 +1901,24 @@ func (n *raft) SetObserver(isObserver bool) {
func (n *raft) setObserver(isObserver bool, extSt extensionState) {
n.Lock()
defer n.Unlock()
if n.paused {
// Applies are paused so we're already in observer state.
// Resuming the applies will set the state back to whatever
// is in "pobserver", so update that instead.
n.pobserver = isObserver
return
}
wasObserver := n.observer
n.observer = isObserver
n.extSt = extSt
// If we're leaving observer state then reset the election timer or
// we might end up waiting for up to the observerModeInterval.
if wasObserver && !isObserver {
n.resetElect(randCampaignTimeout())
}
}
// processAppendEntries is called by the Raft state machine when there are
@@ -1939,7 +1968,7 @@ func (n *raft) runAsFollower() {
n.resetElectionTimeoutWithLock()
n.debug("Not switching to candidate, no resources")
} else if n.IsObserver() {
n.resetElectWithLock(48 * time.Hour)
n.resetElectWithLock(observerModeInterval)
n.debug("Not switching to candidate, observer only")
} else if n.isCatchingUp() {
n.debug("Not switching to candidate, catching up")
@@ -2304,15 +2333,15 @@ func (n *raft) runAsLeader() {
return
}
n.RLock()
n.Lock()
psubj, rpsubj := n.psubj, n.rpsubj
n.RUnlock()
// For forwarded proposals, both normal and remove peer proposals.
fsub, err := n.subscribe(psubj, n.handleForwardedProposal)
if err != nil {
n.warn("Error subscribing to forwarded proposals: %v", err)
n.stepdown.push(noLeader)
n.Unlock()
return
}
rpsub, err := n.subscribe(rpsubj, n.handleForwardedRemovePeerProposal)
@@ -2320,8 +2349,10 @@ func (n *raft) runAsLeader() {
n.warn("Error subscribing to forwarded remove peer proposals: %v", err)
n.unsubscribe(fsub)
n.stepdown.push(noLeader)
n.Unlock()
return
}
n.Unlock()
// Cleanup our subscription when we leave.
defer func() {
@@ -2450,8 +2481,10 @@ func (n *raft) lostQuorum() bool {
}
func (n *raft) lostQuorumLocked() bool {
// Make sure we let any scale up actions settle before deciding.
if !n.lsut.IsZero() && time.Since(n.lsut) < lostQuorumInterval {
// In order to avoid false positives that can happen in heavily loaded systems
// make sure nothing is queued up that we have not processed yet.
// Also make sure we let any scale up actions settle before deciding.
if n.resp.len() != 0 || (!n.lsut.IsZero() && time.Since(n.lsut) < lostQuorumInterval) {
return false
}
@@ -3080,17 +3113,20 @@ func (n *raft) truncateWAL(term, index uint64) {
if err := n.wal.Truncate(index); err != nil {
// If we get an invalid sequence, reset our wal all together.
// We will not have holes, so this means we do not have this message stored anymore.
if err == ErrInvalidSequence {
n.debug("Resetting WAL")
n.wal.Truncate(0)
index, n.term, n.pterm, n.pindex = 0, 0, 0, 0
// If our index is non-zero use PurgeEx to set us to the correct next index.
if index > 0 {
n.wal.PurgeEx(fwcs, index+1, 0)
}
} else {
n.warn("Error truncating WAL: %v", err)
n.setWriteErrLocked(err)
return
}
return
}
// Set after we know we have truncated properly.
n.term, n.pterm, n.pindex = term, term, index
}
@@ -3159,15 +3195,17 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) {
// to a follower of that node instead.
if n.State() == Candidate {
// Ignore old terms, otherwise we might end up stepping down incorrectly.
if ae.term >= n.term {
// Needs to be ahead of our pterm (last log index), as an isolated node
// could have bumped its vote term up considerably past this point.
if ae.term >= n.pterm {
// If the append entry term is newer than the current term, erase our
// vote.
if ae.term > n.term {
n.term = ae.term
n.vote = noVote
n.writeTermVote()
}
n.debug("Received append entry in candidate state from %q, converting to follower", ae.leader)
n.term = ae.term
n.writeTermVote()
n.stepdown.push(ae.leader)
}
}
@@ -3262,7 +3300,7 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) {
// If terms mismatched, or we got an error loading, delete that entry and all others past it.
// Make sure to cancel any catchups in progress.
// Truncate will reset our pterm and pindex. Only do so if we have an entry.
n.truncateWAL(ae.pterm, ae.pindex)
n.truncateWAL(eae.pterm, eae.pindex)
}
// Cancel regardless.
n.cancelCatchup()
@@ -3309,6 +3347,7 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) {
return
}
// Inherit state from appendEntry with the leader's snapshot.
n.pindex = ae.pindex
n.pterm = ae.pterm
n.commit = ae.pindex
@@ -3319,6 +3358,19 @@ func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) {
return
}
snap := &snapshot{
lastTerm: n.pterm,
lastIndex: n.pindex,
peerstate: encodePeerState(&peerState{n.peerNames(), n.csz, n.extSt}),
data: ae.entries[0].Data,
}
// Install the leader's snapshot as our own.
if err := n.installSnapshot(snap); err != nil {
n.setWriteErrLocked(err)
n.Unlock()
return
}
// Now send snapshot to upper levels. Only send the snapshot, not the peerstate entry.
n.apply.push(newCommittedEntry(n.commit, ae.entries[:1]))
n.Unlock()
@@ -3735,7 +3787,8 @@ func readPeerState(sd string) (ps *peerState, err error) {
}
const termVoteFile = "tav.idx"
const termVoteLen = idLen + 8
const termLen = 8 // uint64
const termVoteLen = idLen + termLen
// Writes out our term & vote outside of a specific raft context.
func writeTermVote(sd string, wtv []byte) error {
@@ -3761,6 +3814,10 @@ func (n *raft) readTermVote() (term uint64, voted string, err error) {
if err != nil {
return 0, noVote, err
}
if len(buf) < termLen {
// Not enough bytes for the uint64 below, so avoid a panic.
return 0, noVote, nil
}
var le = binary.LittleEndian
term = le.Uint64(buf[0:])
if len(buf) < termVoteLen {
+3 -1
View File
@@ -995,9 +995,11 @@ func (s *Server) Reload() error {
return s.ReloadOptions(newOpts)
}
// ReloadOptions applies any supported options from the provided Option
// ReloadOptions applies any supported options from the provided Options
// type. This returns an error if an option which doesn't support
// hot-swapping was changed.
// The provided Options type should not be re-used afterwards.
// Either use Options.Clone() to pass a copy, or make a new one.
func (s *Server) ReloadOptions(newOpts *Options) error {
s.reloadMu.Lock()
defer s.reloadMu.Unlock()
+22 -3
View File
@@ -600,6 +600,8 @@ func New(opts *Options) *Server {
// NewServer will setup a new server struct after parsing the options.
// Could return an error if options can not be validated.
// The provided Options type should not be re-used afterwards.
// Either use Options.Clone() to pass a copy, or make a new one.
func NewServer(opts *Options) (*Server, error) {
setBaselineOptions(opts)
@@ -1095,11 +1097,11 @@ func (s *Server) configureAccounts(reloading bool) (map[string]struct{}, error)
if reloading && acc.Name != globalAccountName {
if ai, ok := s.accounts.Load(acc.Name); ok {
a = ai.(*Account)
a.mu.Lock()
// Before updating the account, check if stream imports have changed.
if !a.checkStreamImportsEqual(acc) {
awcsti[acc.Name] = struct{}{}
}
a.mu.Lock()
// Collect the sids for the service imports since we are going to
// replace with new ones.
var sids [][]byte
@@ -2062,7 +2064,6 @@ func (s *Server) fetchAccount(name string) (*Account, error) {
return nil, err
}
acc := s.buildInternalAccount(accClaims)
acc.claimJWT = claimJWT
// Due to possible race, if registerAccount() returns a non
// nil account, it means the same account was already
// registered and we should use this one.
@@ -2078,6 +2079,7 @@ func (s *Server) fetchAccount(name string) (*Account, error) {
var needImportSubs bool
acc.mu.Lock()
acc.claimJWT = claimJWT
if len(acc.imports.services) > 0 {
if acc.ic == nil {
acc.ic = s.createInternalAccountClient()
@@ -2847,6 +2849,7 @@ const (
JszPath = "/jsz"
HealthzPath = "/healthz"
IPQueuesPath = "/ipqueuesz"
RaftzPath = "/raftz"
)
func (s *Server) basePath(p string) string {
@@ -2961,6 +2964,8 @@ func (s *Server) startMonitoring(secure bool) error {
mux.HandleFunc(s.basePath(HealthzPath), s.HandleHealthz)
// IPQueuesz
mux.HandleFunc(s.basePath(IPQueuesPath), s.HandleIPQueuesz)
// Raftz
mux.HandleFunc(s.basePath(RaftzPath), s.HandleRaftz)
// Do not set a WriteTimeout because it could cause cURL/browser
// to return empty response or unable to display page if the
@@ -4093,6 +4098,16 @@ func (s *Server) isLameDuckMode() bool {
return s.ldm
}
// LameDuckShutdown will perform a lame duck shutdown of NATS, whereby
// the client listener is closed, existing client connections are
// kicked, Raft leaderships are transferred, JetStream is shutdown
// and then finally shutdown the the NATS Server itself.
// This function blocks and will not return until the NATS Server
// has completed the entire shutdown operation.
func (s *Server) LameDuckShutdown() {
s.lameDuckMode()
}
// This function will close the client listener then close the clients
// at some interval to avoid a reconnect storm.
// We will also transfer any raft leaders and shutdown JetStream.
@@ -4222,6 +4237,7 @@ func (s *Server) lameDuckMode() {
}
}
s.Shutdown()
s.WaitForShutdown()
}
// Send an INFO update to routes with the indication that this server is in LDM mode.
@@ -4416,8 +4432,11 @@ func (s *Server) DisconnectClientByID(id uint64) error {
if client := s.getClient(id); client != nil {
client.closeConnection(Kicked)
return nil
} else if client = s.GetLeafNode(id); client != nil {
client.closeConnection(Kicked)
return nil
}
return errors.New("no such client id")
return errors.New("no such client or leafnode id")
}
// LDMClientByID sends a Lame Duck Mode info message to a client by connection ID
+2
View File
@@ -51,6 +51,7 @@ func (s *Server) handleSignals() {
switch sig {
case syscall.SIGINT:
s.Shutdown()
s.WaitForShutdown()
os.Exit(0)
case syscall.SIGTERM:
// Shutdown unless graceful shutdown already in progress.
@@ -60,6 +61,7 @@ func (s *Server) handleSignals() {
if !ldm {
s.Shutdown()
s.WaitForShutdown()
os.Exit(1)
}
case syscall.SIGUSR1:
+85 -46
View File
@@ -462,7 +462,7 @@ func (a *Account) addStreamWithAssignment(config *StreamConfig, fsConfig *FileSt
}
}
jsa.usageMu.RLock()
selected, tier, hasTier := jsa.selectLimits(&cfg)
selected, tier, hasTier := jsa.selectLimits(cfg.Replicas)
jsa.usageMu.RUnlock()
reserved := int64(0)
if !isClustered {
@@ -858,7 +858,11 @@ func (mset *stream) setLeader(isLeader bool) error {
if mset.sourcesConsumerSetup != nil {
mset.sourcesConsumerSetup.Stop()
mset.sourcesConsumerSetup = nil
} else {
// Stop any source consumers
mset.stopSourceConsumers()
}
// Stop responding to sync requests.
mset.stopClusterSubs()
// Unsubscribe from direct stream.
@@ -1482,19 +1486,38 @@ func (s *Server) checkStreamCfg(config *StreamConfig, acc *Account) (StreamConfi
}
// Check for literal duplication of subject interest in config
// and no overlap with any JS API subject space
// and no overlap with any JS or SYS API subject space.
dset := make(map[string]struct{}, len(cfg.Subjects))
for _, subj := range cfg.Subjects {
// Make sure the subject is valid. Check this first.
if !IsValidSubject(subj) {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("invalid subject"))
}
if _, ok := dset[subj]; ok {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("duplicate subjects detected"))
}
// Also check to make sure we do not overlap with our $JS API subjects.
if subjectIsSubsetMatch(subj, "$JS.API.>") {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("subjects overlap with jetstream api"))
// Check for trying to capture everything.
if subj == fwcs {
if !cfg.NoAck {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("capturing all subjects requires no-ack to be true"))
}
// Capturing everything also will require R1.
if cfg.Replicas != 1 {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("capturing all subjects requires replicas of 1"))
}
}
// Make sure the subject is valid.
if !IsValidSubject(subj) {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("invalid subject"))
// Also check to make sure we do not overlap with our $JS API subjects.
if !cfg.NoAck && (subjectIsSubsetMatch(subj, "$JS.>") || subjectIsSubsetMatch(subj, "$JSC.>")) {
// We allow an exception for $JS.EVENT.> since these could have been created in the past.
if !subjectIsSubsetMatch(subj, "$JS.EVENT.>") {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("subjects that overlap with jetstream api require no-ack to be true"))
}
}
// And the $SYS subjects.
if !cfg.NoAck && subjectIsSubsetMatch(subj, "$SYS.>") {
if !subjectIsSubsetMatch(subj, "$SYS.ACCOUNT.>") {
return StreamConfig{}, NewJSStreamInvalidConfigError(fmt.Errorf("subjects that overlap with system api require no-ack to be true"))
}
}
// Mark for duplicate check.
dset[subj] = struct{}{}
@@ -1662,9 +1685,9 @@ func (jsa *jsAccount) configUpdateCheck(old, new *StreamConfig, s *Server) (*Str
jsa.mu.RLock()
acc := jsa.account
jsa.usageMu.RLock()
selected, tier, hasTier := jsa.selectLimits(&cfg)
selected, tier, hasTier := jsa.selectLimits(cfg.Replicas)
if !hasTier && old.Replicas != cfg.Replicas {
selected, tier, hasTier = jsa.selectLimits(old)
selected, tier, hasTier = jsa.selectLimits(old.Replicas)
}
jsa.usageMu.RUnlock()
reserved := int64(0)
@@ -1818,7 +1841,7 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool)
si.trs[i], err = NewSubjectTransform(s.SubjectTransforms[i].Source, s.SubjectTransforms[i].Destination)
if err != nil {
mset.mu.Unlock()
mset.srv.Errorf("Unable to get subject transform for source: %v", err)
return fmt.Errorf("unable to get subject transform for source: %v", err)
}
}
}
@@ -1899,7 +1922,7 @@ func (mset *stream) updateWithAdvisory(config *StreamConfig, sendAdvisory bool)
js := mset.js
if targetTier := tierName(cfg); mset.tier != targetTier {
if targetTier := tierName(cfg.Replicas); mset.tier != targetTier {
// In cases such as R1->R3, only one update is needed
jsa.usageMu.RLock()
_, ok := jsa.limits[targetTier]
@@ -2187,9 +2210,11 @@ func (mset *stream) processMirrorMsgs(mirror *sourceInfo, ready *sync.WaitGroup)
msgs.recycle(&ims)
case <-t.C:
mset.mu.RLock()
var stalled bool
if mset.mirror != nil {
stalled = time.Since(time.Unix(0, mset.mirror.last.Load())) > sourceHealthCheckInterval
}
isLeader := mset.isLeader()
last := time.Unix(0, mset.mirror.last.Load())
stalled := mset.mirror != nil && time.Since(last) > sourceHealthCheckInterval
mset.mu.RUnlock()
// No longer leader.
if !isLeader {
@@ -2406,14 +2431,14 @@ func (mset *stream) skipMsgs(start, end uint64) {
return
}
// FIXME (dlc) - We should allow proposals of DeleteEange, but would need to make sure all peers support.
// FIXME (dlc) - We should allow proposals of DeleteRange, but would need to make sure all peers support.
// With syncRequest was easy to add bool into request.
var entries []*Entry
for seq := start; seq <= end; seq++ {
entries = append(entries, &Entry{EntryNormal, encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq-1, 0)})
entries = append(entries, newEntry(EntryNormal, encodeStreamMsg(_EMPTY_, _EMPTY_, nil, nil, seq-1, 0)))
// So a single message does not get too big.
if len(entries) > 10_000 {
node.ProposeDirect(entries)
node.ProposeMulti(entries)
// We need to re-create `entries` because there is a reference
// to it in the node's pae map.
entries = entries[:0]
@@ -2421,7 +2446,7 @@ func (mset *stream) skipMsgs(start, end uint64) {
}
// Send all at once.
if len(entries) > 0 {
node.ProposeDirect(entries)
node.ProposeMulti(entries)
}
}
@@ -5249,9 +5274,8 @@ func (mset *stream) checkInterestState() {
var zeroAcks []*consumer
var lowAckFloor uint64 = math.MaxUint64
consumers := mset.getConsumers()
for _, o := range consumers {
for _, o := range mset.getConsumers() {
o.checkStateForInterestStream()
o.mu.Lock()
@@ -5290,39 +5314,45 @@ func (mset *stream) checkInterestState() {
return
}
// Hold stream write lock in case we need to purge.
mset.mu.Lock()
defer mset.mu.Unlock()
// Capture our current state.
// ok to do so without lock.
var state StreamState
mset.store.FastState(&state)
if lowAckFloor < math.MaxUint64 && lowAckFloor > state.FirstSeq {
// Check if we had any zeroAcks, we will need to check them.
for _, o := range zeroAcks {
var np uint64
o.mu.RLock()
if o.isLeader() {
np = uint64(o.numPending())
} else {
np, _ = o.calculateNumPending()
}
o.mu.RUnlock()
// This means we have pending and can not remove anything at this time.
if np > 0 {
return
}
}
if lowAckFloor <= state.LastSeq {
// Purge the stream to lowest ack floor + 1
mset.store.PurgeEx(_EMPTY_, lowAckFloor+1, 0)
if lowAckFloor <= state.FirstSeq {
return
}
// Do not want to hold stream lock if calculating numPending.
// Check if we had any zeroAcks, we will need to check them.
for _, o := range zeroAcks {
var np uint64
o.mu.RLock()
if o.isLeader() {
np = uint64(o.numPending())
} else {
// Here we have a low ack floor higher then our last seq.
// So we will just do normal purge.
mset.store.Purge()
np, _ = o.calculateNumPending()
}
o.mu.RUnlock()
// This means we have pending and can not remove anything at this time.
if np > 0 {
return
}
}
mset.mu.Lock()
defer mset.mu.Unlock()
// Check which purge we need to perform.
if lowAckFloor <= state.LastSeq || state.Msgs == 0 {
// Purge the stream to lowest ack floor + 1
mset.store.PurgeEx(_EMPTY_, lowAckFloor+1, 0)
} else {
// Here we have a low ack floor higher then our last seq.
// So we will just do normal purge.
mset.store.Purge()
}
// Make sure to reset our local lseq.
mset.store.FastState(&state)
mset.lseq = state.LastSeq
@@ -5840,6 +5870,8 @@ func (a *Account) RestoreStream(ncfg *StreamConfig, r io.Reader) (*stream, error
}
mset, err := a.addStream(&cfg)
if err != nil {
// Make sure to clean up after ourselves here.
os.RemoveAll(ndir)
return nil, err
}
if !fcfg.Created.IsZero() {
@@ -5975,3 +6007,10 @@ func (mset *stream) clearMonitorRunning() {
defer mset.mu.Unlock()
mset.inMonitor = false
}
// Check if our monitor is running.
func (mset *stream) isMonitorRunning() bool {
mset.mu.RLock()
defer mset.mu.RUnlock()
return mset.inMonitor
}
+1
View File
@@ -51,6 +51,7 @@ func (t *SubjectTree[T]) dump(w io.Writer, n node, depth int) {
func (n *leaf[T]) kind() string { return "LEAF" }
func (n *node4) kind() string { return "NODE4" }
func (n *node16) kind() string { return "NODE16" }
func (n *node48) kind() string { return "NODE48" }
func (n *node256) kind() string { return "NODE256" }
// Calculates the indendation, etc.
+3 -2
View File
@@ -18,16 +18,17 @@ import (
)
// Leaf node
// Order of struct fields for best memory alignment (as per govet/fieldalignment)
type leaf[T any] struct {
value T
// This could be the whole subject, but most likely just the suffix portion.
// We will only store the suffix here and assume all prior prefix paths have
// been checked once we arrive at this leafnode.
suffix []byte
value T
}
func newLeaf[T any](suffix []byte, value T) *leaf[T] {
return &leaf[T]{copyBytes(suffix), value}
return &leaf[T]{value, copyBytes(suffix)}
}
func (n *leaf[T]) isLeaf() bool { return true }
+4 -3
View File
@@ -14,10 +14,11 @@
package stree
// Node with 16 children
// Order of struct fields for best memory alignment (as per govet/fieldalignment)
type node16 struct {
meta
child [16]node
key [16]byte
meta
key [16]byte
}
func newNode16(prefix []byte) *node16 {
@@ -49,7 +50,7 @@ func (n *node16) findChild(c byte) *node {
func (n *node16) isFull() bool { return n.size >= 16 }
func (n *node16) grow() node {
nn := newNode256(n.prefix)
nn := newNode48(n.prefix)
for i := 0; i < 16; i++ {
nn.addChild(n.key[i], n.child[i])
}
+4 -3
View File
@@ -14,9 +14,10 @@
package stree
// Node with 256 children
// Order of struct fields for best memory alignment (as per govet/fieldalignment)
type node256 struct {
meta
child [256]node
meta
}
func newNode256(prefix []byte) *node256 {
@@ -50,10 +51,10 @@ func (n *node256) deleteChild(c byte) {
// Shrink if needed and return new node, otherwise return nil.
func (n *node256) shrink() node {
if n.size > 16 {
if n.size > 48 {
return nil
}
nn := newNode16(nil)
nn := newNode48(nil)
for c, child := range n.child {
if child != nil {
nn.addChild(byte(c), n.child[c])
+3 -2
View File
@@ -14,10 +14,11 @@
package stree
// Node with 4 children
// Order of struct fields for best memory alignment (as per govet/fieldalignment)
type node4 struct {
meta
child [4]node
key [4]byte
meta
key [4]byte
}
func newNode4(prefix []byte) *node4 {
+110
View File
@@ -0,0 +1,110 @@
// Copyright 2023-2024 The NATS Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stree
// Node with 48 children
// Memory saving vs node256 comes from the fact that the child array is 16 bytes
// per `node` entry, so node256's 256*16=4096 vs node48's 256+(48*16)=1024
// Note that `key` is effectively 1-indexed, as 0 means no entry, so offset by 1
// Order of struct fields for best memory alignment (as per govet/fieldalignment)
type node48 struct {
child [48]node
meta
key [256]byte
}
func newNode48(prefix []byte) *node48 {
nn := &node48{}
nn.setPrefix(prefix)
return nn
}
func (n *node48) addChild(c byte, nn node) {
if n.size >= 48 {
panic("node48 full!")
}
n.child[n.size] = nn
n.key[c] = byte(n.size + 1) // 1-indexed
n.size++
}
func (n *node48) findChild(c byte) *node {
i := n.key[c]
if i == 0 {
return nil
}
return &n.child[i-1]
}
func (n *node48) isFull() bool { return n.size >= 48 }
func (n *node48) grow() node {
nn := newNode256(n.prefix)
for c := byte(0); c < 255; c++ {
if i := n.key[c]; i > 0 {
nn.addChild(c, n.child[i-1])
}
}
return nn
}
// Deletes a child from the node.
func (n *node48) deleteChild(c byte) {
i := n.key[c]
if i == 0 {
return
}
i-- // Adjust for 1-indexing
last := byte(n.size - 1)
if i < last {
n.child[i] = n.child[last]
for c := byte(0); c <= 255; c++ {
if n.key[c] == last+1 {
n.key[c] = i + 1
break
}
}
}
n.child[last] = nil
n.key[c] = 0
n.size--
}
// Shrink if needed and return new node, otherwise return nil.
func (n *node48) shrink() node {
if n.size > 16 {
return nil
}
nn := newNode16(nil)
for c := byte(0); c < 255; c++ {
if i := n.key[c]; i > 0 {
nn.addChild(c, n.child[i-1])
}
}
return nn
}
// Iterate over all children calling func f.
func (n *node48) iter(f func(node) bool) {
for _, c := range n.child {
if c != nil && !f(c) {
return
}
}
}
// Return our children as a slice.
func (n *node48) children() []node {
return n.child[:n.size]
}
+14 -1
View File
@@ -51,6 +51,10 @@ func (t *SubjectTree[T]) Empty() *SubjectTree[T] {
// Insert a value into the tree. Will return if the value was updated and if so the old value.
func (t *SubjectTree[T]) Insert(subject []byte, value T) (*T, bool) {
if t == nil {
return nil, false
}
old, updated := t.insert(&t.root, subject, value, 0)
if !updated {
t.size++
@@ -60,6 +64,10 @@ func (t *SubjectTree[T]) Insert(subject []byte, value T) (*T, bool) {
// Find will find the value and return it or false if it was not found.
func (t *SubjectTree[T]) Find(subject []byte) (*T, bool) {
if t == nil {
return nil, false
}
var si int
for n := t.root; n != nil; {
if n.isLeaf() {
@@ -88,6 +96,10 @@ func (t *SubjectTree[T]) Find(subject []byte) (*T, bool) {
// Delete will delete the item and return its value, or not found if it did not exist.
func (t *SubjectTree[T]) Delete(subject []byte) (*T, bool) {
if t == nil {
return nil, false
}
val, deleted := t.delete(&t.root, subject, 0)
if deleted {
t.size--
@@ -97,7 +109,7 @@ func (t *SubjectTree[T]) Delete(subject []byte) (*T, bool) {
// Match will match against a subject that can have wildcards and invoke the callback func for each matched value.
func (t *SubjectTree[T]) Match(filter []byte, cb func(subject []byte, val *T)) {
if len(filter) == 0 || cb == nil {
if t == nil || t.root == nil || len(filter) == 0 || cb == nil {
return
}
// We need to break this up into chunks based on wildcards, either pwc '*' or fwc '>'.
@@ -340,6 +352,7 @@ func (t *SubjectTree[T]) match(n node, parts [][]byte, pre []byte, cb func(subje
t.match(cn, nparts, pre, cb)
}
}
return
}
// Here we have normal traversal, so find the next child.
nn := n.findChild(p)