eggsmigrate: initial version with basic functionality (#175)

This commit is contained in:
Saulius Grusnys
2024-01-31 14:10:13 +00:00
committed by GitHub Enterprise
parent 9c477ffa40
commit 10b432d002
11 changed files with 108 additions and 44 deletions

1
.gitignore vendored
View File

@@ -5,3 +5,4 @@ build/
vgcore.*
compile_commands.json
.idea
.rsync-rules

View File

@@ -330,21 +330,22 @@ func main() {
}
}
stats := lib.MigrateStats{}
progressReportAlert := log.NewNCAlert(10 * time.Second)
for failureDomain, bss := range blockServicesToMigrate {
for _, blockServiceId := range *bss {
log.Info("migrating block service %v, %v", blockServiceId, failureDomain)
if *migrateFileIdU64 == 0 && *migrateShard < 0 {
if err := lib.MigrateBlocksInAllShards(log, client, &stats, blockServiceId); err != nil {
if err := lib.MigrateBlocksInAllShards(log, client, &stats, progressReportAlert, blockServiceId); err != nil {
panic(err)
}
} else if *migrateFileIdU64 != 0 {
fileId := msgs.InodeId(*migrateFileIdU64)
if err := lib.MigrateBlocksInFile(log, client, &stats, blockServiceId, fileId); err != nil {
if err := lib.MigrateBlocksInFile(log, client, &stats, progressReportAlert, blockServiceId, fileId); err != nil {
panic(fmt.Errorf("error while migrating file %v away from block service %v: %v", fileId, blockServiceId, err))
}
} else {
shid := msgs.ShardId(*migrateShard)
if err := lib.MigrateBlocks(log, client, &stats, shid, blockServiceId); err != nil {
if err := lib.MigrateBlocks(log, client, &stats, progressReportAlert, shid, blockServiceId); err != nil {
panic(err)
}
}
@@ -352,6 +353,7 @@ func main() {
}
}
log.Info("finished migrating away from all block services, stats: %+v", stats)
log.ClearNC(progressReportAlert)
}
commands["migrate"] = commandSpec{
flags: migrateCmd,
@@ -918,13 +920,13 @@ func main() {
flags: scrubFileCmd,
run: scrubFileRun,
}
scrubCmd := flag.NewFlagSet("scrub", flag.ExitOnError)
scrubRun := func() {
stats := lib.ScrubState{}
if err := lib.ScrubFilesInAllShards(log, client, &lib.ScrubOptions{NumWorkersPerShard: 10}, nil, &stats); err != nil {
panic(err)
}
}
commands["scrub"] = commandSpec{
flags: scrubCmd,

View File

@@ -85,6 +85,7 @@ func main() {
zeroBlockServices := flag.Bool("zero-block-services", false, "")
metrics := flag.Bool("metrics", false, "Send metrics")
countMetrics := flag.Bool("count-metrics", false, "Compute and send count metrics")
migrate := flag.Bool("migrate", false, "migrate")
scrub := flag.Bool("scrub", false, "scrub")
scrubWorkersPerShard := flag.Int("scrub-workers-per-shard", 10, "")
scrubWorkersQueueSize := flag.Int("scrub-workers-queue-size", 50, "")
@@ -96,7 +97,7 @@ func main() {
os.Exit(2)
}
if !*destructFiles && !*collectDirectories && !*zeroBlockServices && !*countMetrics && !*scrub {
if !*destructFiles && !*collectDirectories && !*zeroBlockServices && !*countMetrics && !*scrub && !*migrate {
fmt.Fprintf(os.Stderr, "Nothing to do!\n")
os.Exit(2)
}
@@ -190,6 +191,8 @@ func main() {
panic(err)
}
migrateState := &lib.MigrateState{}
// store the state
go func() {
defer func() { lib.HandleRecoverChan(log, terminateChan, recover()) }()
@@ -302,6 +305,48 @@ func main() {
}()
}
}
// Background migration loop: once a minute, ask shuckle for every block
// service, collect the ones flagged as decommissioned (grouped by failure
// domain), and migrate their blocks away in all shards.
if *migrate {
	go func() {
		defer func() { lib.HandleRecoverChan(log, terminateChan, recover()) }()
		for {
			log.Info("requesting block services")
			blockServicesResp, err := lib.ShuckleRequest(log, nil, *shuckleAddress, &msgs.AllBlockServicesReq{})
			if err != nil {
				terminateChan <- err
				return
			}
			blockServices := blockServicesResp.(*msgs.AllBlockServicesResp)
			blockServicesToMigrate := make(map[string]*[]msgs.BlockServiceId) // by failure domain
			for _, bs := range blockServices.BlockServices {
				if bs.Flags.HasAny(msgs.EGGSFS_BLOCK_SERVICE_DECOMMISSIONED) {
					bss := blockServicesToMigrate[bs.FailureDomain.String()]
					if bss == nil {
						bss = &[]msgs.BlockServiceId{}
						blockServicesToMigrate[bs.FailureDomain.String()] = bss
					}
					*bss = append(*bss, bs.Id)
				}
			}
			progressReportAlert := log.NewNCAlert(10 * time.Second)
			for failureDomain, bss := range blockServicesToMigrate {
				for _, blockServiceId := range *bss {
					log.RaiseNCInfo(progressReportAlert, "migrating block service %v, %v", blockServiceId, failureDomain)
					if err := lib.MigrateBlocksInAllShards(log, client, &migrateState.Stats, progressReportAlert, blockServiceId); err != nil {
						// Handle this like the failed shuckle request above:
						// signal termination and stop. Falling through would
						// log a misleading "finished migrating" line for a
						// block service that failed, and then carry on with
						// the remaining services after asking to terminate.
						terminateChan <- err
						return
					}
					log.Info("finished migrating blocks away from block service %v, stats so far: %+v", blockServiceId, migrateState.Stats)
					log.ClearNC(progressReportAlert)
				}
			}
			// Stats accumulate across iterations, so this line is printed on
			// every pass once anything has ever been migrated.
			if migrateState.Stats.MigratedBlocks > 0 {
				log.Info("finished migrating away from all block services, stats: %+v", migrateState.Stats)
			}
			time.Sleep(time.Minute)
		}
	}()
}
if *metrics && (*destructFiles || *collectDirectories || *zeroBlockServices || *scrub) {
// one thing just pushing the stats every minute
go func() {

View File

@@ -863,8 +863,9 @@ func fsTestInternal[Id comparable](
defer client.Close()
blockServiceToPurge := findBlockServiceToPurge(log, client)
log.Info("will migrate block service %v", blockServiceToPurge)
progressReportAlert := log.NewNCAlert(10 * time.Second)
migrateStats := lib.MigrateStats{}
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, blockServiceToPurge)
err = lib.MigrateBlocksInAllShards(log, client, &migrateStats, progressReportAlert, blockServiceToPurge)
if err != nil {
panic(fmt.Errorf("could not migrate: %w", err))
}

View File

@@ -1236,10 +1236,10 @@ func (client *Client) singleBlockReq(log *Logger, timeouts *ReqTimeouts, process
if RetriableBlockError(err) {
next := timeouts.Next(startedAt)
if next == 0 {
log.RaiseNCStack(timeoutAlert, 2, "block request to %v:%v %v:%v failed with retriable error, will not retry since time is up: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
log.RaiseNCStack(timeoutAlert, ERROR, 2, "block request to %v:%v %v:%v failed with retriable error, will not retry since time is up: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
return nil, err
}
log.RaiseNCStack(timeoutAlert, 2, "block request to %v:%v %v:%v failed with retriable error, might retry: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
log.RaiseNCStack(timeoutAlert, ERROR, 2, "block request to %v:%v %v:%v failed with retriable error, might retry: %v", net.IP(args.ip1[:]), args.port1, net.IP(args.ip2[:]), args.port2, err)
time.Sleep(next)
} else {
return nil, err

View File

@@ -233,19 +233,23 @@ func (l *Logger) NewNCAlert(quietTime time.Duration) *XmonNCAlert {
}
// RaiseAlertStack forwards format and its arguments to the logger's xmon
// handle through RaiseStack. The call that takes a log level passes ERROR.
// calldepth is forwarded incremented by one (presumably so the reported
// file:line skips this wrapper frame; confirm against getFileLine).
func (l *Logger) RaiseAlertStack(calldepth int, format string, v ...any) {
l.xmon.RaiseStack(l, l.xmon, 1+calldepth, format, v...)
l.xmon.RaiseStack(l, l.xmon, ERROR, 1+calldepth, format, v...)
}
// RaiseAlert raises an alert with the given format and arguments by calling
// RaiseAlertStack with a call depth of 1.
func (l *Logger) RaiseAlert(format string, v ...any) {
l.RaiseAlertStack(1, format, v...)
}
// RaiseNCStack raises the given alert by calling alert.RaiseStack with this
// logger and its xmon handle. In the signature that takes logLevel, the
// caller chooses the level the message is raised at. calldepth is forwarded
// incremented by one.
func (l *Logger) RaiseNCStack(alert *XmonNCAlert, calldepth int, format string, v ...any) {
alert.RaiseStack(l, l.xmon, 1+calldepth, format, v...)
func (l *Logger) RaiseNCStack(alert *XmonNCAlert, logLevel LogLevel, calldepth int, format string, v ...any) {
alert.RaiseStack(l, l.xmon, logLevel, 1+calldepth, format, v...)
}
// RaiseNC raises the given alert through RaiseNCStack with a call depth of 1.
// The call that takes a log level passes ERROR.
func (l *Logger) RaiseNC(alert *XmonNCAlert, format string, v ...any) {
l.RaiseNCStack(alert, 1, format, v...)
l.RaiseNCStack(alert, ERROR, 1, format, v...)
}
// RaiseNCInfo raises the given alert through RaiseNCStack at INFO level with
// a call depth of 1. It mirrors RaiseNC, which uses ERROR.
func (l *Logger) RaiseNCInfo(alert *XmonNCAlert, format string, v ...any) {
l.RaiseNCStack(alert, INFO, 1, format, v...)
}
func (l *Logger) ClearNC(alert *XmonNCAlert) {

View File

@@ -25,6 +25,16 @@ import (
"xtx/eggsfs/rs"
)
// MigrateStats holds running totals for a migration. The progress report
// prints MigratedBytes as megabytes (bytes / 1e6) alongside the block and
// file counts.
type MigrateStats struct {
// Number of files counted as migrated.
MigratedFiles uint64
// Number of blocks counted as migrated.
MigratedBlocks uint64
// Number of bytes counted as migrated.
MigratedBytes uint64
}
// MigrateState is the state kept for a migration run; at present it only
// wraps the MigrateStats counters.
type MigrateState struct {
// Running migration counters.
Stats MigrateStats
}
func fetchBlock(
log *Logger,
client *Client,
@@ -215,20 +225,20 @@ func newTimeStats() *timeStats {
return &timeStats{startedAt: now, lastReportAt: now}
}
// printStatsLastReport reports migration progress: total megabytes, blocks
// and files so far, plus the recent and overall throughput in MB/s.
//
// lastReport and now are nanosecond timestamps (callers pass
// time.Now().UnixNano() values). what is the verb that prefixes the message.
// In the signature that takes progressReportAlert, the report is raised on
// that alert through RaiseNCInfo instead of being written with log.Info.
// client is not used in this body.
//
// After reporting, timeStats.lastReportAt and timeStats.lastReportBytes are
// updated so the next call measures its "recent" rate from this point.
func printStatsLastReport(log *Logger, what string, client *Client, stats *MigrateStats, timeStats *timeStats, lastReport int64, now int64) {
func printStatsLastReport(log *Logger, what string, client *Client, stats *MigrateStats, timeStats *timeStats, progressReportAlert *XmonNCAlert, lastReport int64, now int64) {
timeSinceLastReport := time.Duration(now - lastReport)
timeSinceStart := time.Duration(now - atomic.LoadInt64(&timeStats.startedAt))
// Megabytes here are 1e6 bytes; rates are MB per second derived from
// millisecond durations.
overallMB := float64(stats.MigratedBytes) / 1e6
overallMBs := 1000.0 * overallMB / float64(timeSinceStart.Milliseconds())
// "Recent" covers only the bytes migrated since the previous report.
recentMB := float64(stats.MigratedBytes-timeStats.lastReportBytes) / 1e6
recentMBs := 1000.0 * recentMB / float64(timeSinceLastReport.Milliseconds())
log.Info("%s %0.2fMB in %v blocks in %v files, at %.2fMB/s (recent), %0.2fMB/s (overall)", what, overallMB, stats.MigratedBlocks, stats.MigratedFiles, recentMBs, overallMBs)
log.RaiseNCInfo(progressReportAlert, "%s %0.2fMB in %v blocks in %v files, at %.2fMB/s (recent), %0.2fMB/s (overall)", what, overallMB, stats.MigratedBlocks, stats.MigratedFiles, recentMBs, overallMBs)
timeStats.lastReportAt = now
timeStats.lastReportBytes = stats.MigratedBytes
}
// printMigrateStats reports progress immediately by calling
// printStatsLastReport with the atomically loaded timeStats.lastReportAt as
// the previous report time and the current time in nanoseconds as now.
func printMigrateStats(log *Logger, what string, client *Client, stats *MigrateStats, timeStats *timeStats) {
printStatsLastReport(log, what, client, stats, timeStats, atomic.LoadInt64(&timeStats.lastReportAt), time.Now().UnixNano())
func printMigrateStats(log *Logger, what string, client *Client, stats *MigrateStats, timeStats *timeStats, progressReportAlert *XmonNCAlert) {
printStatsLastReport(log, what, client, stats, timeStats, progressReportAlert, atomic.LoadInt64(&timeStats.lastReportAt), time.Now().UnixNano())
}
// We reuse this functionality for scrubbing, they're basically doing the same
@@ -239,6 +249,7 @@ func migrateBlocksInFileGeneric(
bufPool *BufPool,
stats *MigrateStats,
timeStats *timeStats,
progressReportAlert *XmonNCAlert,
what string,
badBlock func(blockService *msgs.BlockService, blockSize uint32, block *msgs.FetchedBlock) (bool, error),
scratchFile *scratchFile,
@@ -250,7 +261,7 @@ func migrateBlocksInFileGeneric(
now := time.Now().UnixNano()
if (now - lastReportAt) > time.Minute.Nanoseconds() {
if atomic.CompareAndSwapInt64(&timeStats.lastReportAt, lastReportAt, now) {
printStatsLastReport(log, what, client, stats, timeStats, lastReportAt, now)
printStatsLastReport(log, what, client, stats, timeStats, progressReportAlert, lastReportAt, now)
}
}
}()
@@ -406,12 +417,6 @@ func migrateBlocksInFileGeneric(
return nil
}
// MigrateStats holds running totals for a migration: files, blocks and bytes
// counted as migrated.
type MigrateStats struct {
MigratedFiles uint64
MigratedBlocks uint64
MigratedBytes uint64
}
// Migrates the blocks in that block service, in that file.
//
// If the source block service is still healthy, it'll just copy the block over, otherwise
@@ -420,6 +425,7 @@ func MigrateBlocksInFile(
log *Logger,
client *Client,
stats *MigrateStats,
progressReportAlert *XmonNCAlert,
blockServiceId msgs.BlockServiceId,
fileId msgs.InodeId,
) error {
@@ -429,7 +435,7 @@ func MigrateBlocksInFile(
badBlock := func(blockService *msgs.BlockService, blockSize uint32, block *msgs.FetchedBlock) (bool, error) {
return blockService.Id == blockServiceId, nil
}
return migrateBlocksInFileGeneric(log, client, NewBufPool(), stats, newTimeStats(), "migrated", badBlock, &scratchFile, fileId)
return migrateBlocksInFileGeneric(log, client, NewBufPool(), stats, newTimeStats(), progressReportAlert, "migrated", badBlock, &scratchFile, fileId)
}
// Tries to migrate as many blocks as possible from that block service in a certain
@@ -440,6 +446,7 @@ func migrateBlocksInternal(
bufPool *BufPool,
stats *MigrateStats,
timeStats *timeStats,
progressReportAlert *XmonNCAlert,
shid msgs.ShardId,
blockServiceId msgs.BlockServiceId,
) error {
@@ -467,7 +474,7 @@ func migrateBlocksInternal(
continue
}
for attempts := 1; ; attempts++ {
if err := migrateBlocksInFileGeneric(log, client, bufPool, stats, timeStats, "migrated", badBlock, &scratchFile, file); err != nil {
if err := migrateBlocksInFileGeneric(log, client, bufPool, stats, timeStats, progressReportAlert, "migrated", badBlock, &scratchFile, file); err != nil {
if err == msgs.BLOCK_NOT_FOUND {
log.RaiseNC(blockNotFoundAlert, "could not migrate blocks in file %v after %v attempts because a block was not found in it. this is probably due to conflicts with other migrations or scrubbing. will retry in one second.", file, attempts)
time.Sleep(time.Second)
@@ -487,15 +494,16 @@ func MigrateBlocks(
log *Logger,
client *Client,
stats *MigrateStats,
progressReportAlert *XmonNCAlert,
shid msgs.ShardId,
blockServiceId msgs.BlockServiceId,
) error {
timeStats := newTimeStats()
bufPool := NewBufPool()
if err := migrateBlocksInternal(log, client, bufPool, stats, timeStats, shid, blockServiceId); err != nil {
if err := migrateBlocksInternal(log, client, bufPool, stats, timeStats, progressReportAlert, shid, blockServiceId); err != nil {
return err
}
printMigrateStats(log, "migrated", client, stats, timeStats)
printMigrateStats(log, "migrated", client, stats, timeStats, progressReportAlert)
log.Info("finished migrating blocks out of %v in shard %v, stats: %+v", blockServiceId, shid, stats)
return nil
}
@@ -504,6 +512,7 @@ func MigrateBlocksInAllShards(
log *Logger,
client *Client,
stats *MigrateStats,
progressReportAlert *XmonNCAlert,
blockServiceId msgs.BlockServiceId,
) error {
timeStats := newTimeStats()
@@ -514,7 +523,7 @@ func MigrateBlocksInAllShards(
for i := 0; i < 256; i++ {
shid := msgs.ShardId(i)
go func() {
if err := migrateBlocksInternal(log, client, bufPool, stats, timeStats, shid, blockServiceId); err != nil {
if err := migrateBlocksInternal(log, client, bufPool, stats, timeStats, progressReportAlert, shid, blockServiceId); err != nil {
log.Info("could not migrate block service %v in shard %v: %v", blockServiceId, shid, err)
atomic.StoreInt32(&failed, 1)
}
@@ -523,7 +532,7 @@ func MigrateBlocksInAllShards(
}()
}
wg.Wait()
printMigrateStats(log, "migrated", client, stats, timeStats)
printMigrateStats(log, "migrated", client, stats, timeStats, progressReportAlert)
log.Info("finished migrating blocks out of %v in all shards, stats: %+v", blockServiceId, stats)
if atomic.LoadInt32(&failed) == 1 {
return fmt.Errorf("some shards failed to migrate, check logs")

View File

@@ -34,6 +34,7 @@ func scrubFileInternal(
bufPool *BufPool,
stats *ScrubState,
timeStats *timeStats,
progressReportAlert *XmonNCAlert,
scratchFile *scratchFile,
scrubbingMu *sync.Mutex,
file msgs.InodeId,
@@ -50,7 +51,7 @@ func scrubFileInternal(
}
return false, err
}
return migrateBlocksInFileGeneric(log, client, bufPool, &stats.Migrate, timeStats, "scrubbed", badBlock, scratchFile, file)
return migrateBlocksInFileGeneric(log, client, bufPool, &stats.Migrate, timeStats, progressReportAlert, "scrubbed", badBlock, scratchFile, file)
}
type scrubRequest struct {
@@ -99,7 +100,7 @@ func scrubWorker(
atomic.AddUint64(&stats.CheckedBytes, uint64(req.size))
}
for attempts := 1; ; attempts++ {
if err := scrubFileInternal(log, client, bufPool, stats, nil, scratchFile, scrubbingMu, req.file); err != nil {
if err := scrubFileInternal(log, client, bufPool, stats, nil, nil, scratchFile, scrubbingMu, req.file); err != nil {
if err == msgs.BLOCK_NOT_FOUND {
log.RaiseNC(blockNotFoundAlert, "could not migrate blocks in file %v after %v attempts because a block was not found in it. this is probably due to conflicts with other migrations or scrubbing. will retry in one second.", req.file, attempts)
time.Sleep(time.Second)
@@ -217,7 +218,7 @@ func ScrubFile(
keepAlive := startToKeepScratchFileAlive(log, client, &scratchFile)
defer keepAlive.stop()
var scrubbingMu sync.Mutex
return scrubFileInternal(log, client, bufPool, stats, nil, &scratchFile, &scrubbingMu, file)
return scrubFileInternal(log, client, bufPool, stats, nil, nil, &scratchFile, &scrubbingMu, file)
}
func ScrubFiles(

View File

@@ -234,7 +234,7 @@ Reconnect:
log.Info("could not connect to shuckle and we're out of attempts: %v", err)
return nil, err
}
log.RaiseNCStack(alert, 1, "could not connect to shuckle, will retry in %v: %v", delay, err)
log.RaiseNCStack(alert, ERROR, 1, "could not connect to shuckle, will retry in %v: %v", delay, err)
time.Sleep(delay)
ReconnectBegin:

View File

@@ -398,11 +398,11 @@ const tooManyAlertsAlertId = int64(0)
var alertIdCount = int64(1)
func xmonRaiseStack(log *Logger, xmon *Xmon, calldepth int, alertId *int64, binnable bool, quietPeriod time.Duration, format string, v ...any) string {
func xmonRaiseStack(log *Logger, xmon *Xmon, logLevel LogLevel, calldepth int, alertId *int64, binnable bool, quietPeriod time.Duration, format string, v ...any) string {
file, line := getFileLine(1 + calldepth)
message := fmt.Sprintf("%s:%d "+format, append([]any{file, line}, v...)...)
if binnable || quietPeriod == 0 {
log.LogLocation(ERROR, file, line, message)
if binnable || quietPeriod == 0 || xmon.onlyLogging {
log.LogLocation(logLevel, file, line, message)
}
if *alertId < 0 {
*alertId = atomic.AddInt64(&alertIdCount, 1)
@@ -429,22 +429,22 @@ func xmonRaiseStack(log *Logger, xmon *Xmon, calldepth int, alertId *int64, binn
return message
}
// RaiseStack raises a one-off alert through xmonRaiseStack. It starts from a
// fresh alert id of -1, sets binnable to true and passes a quiet period of 0.
// calldepth is forwarded incremented by one. In the signature that takes
// logLevel, that level is passed through to xmonRaiseStack.
//
// NOTE(review): the xmon parameter is not used here; the receiver x is what
// gets passed on.
func (x *Xmon) RaiseStack(log *Logger, xmon *Xmon, calldepth int, format string, v ...any) {
func (x *Xmon) RaiseStack(log *Logger, xmon *Xmon, logLevel LogLevel, calldepth int, format string, v ...any) {
alertId := int64(-1)
xmonRaiseStack(log, x, 1+calldepth, &alertId, true, 0, format, v...)
xmonRaiseStack(log, x, logLevel, 1+calldepth, &alertId, true, 0, format, v...)
}
// Raise raises a one-off alert through xmonRaiseStack with a fixed call depth
// of 1, a fresh alert id of -1, binnable set to true and a quiet period of 0.
// In the signature that takes logLevel, that level is passed through.
//
// NOTE(review): the xmon parameter is not used here; the receiver x is what
// gets passed on.
func (x *Xmon) Raise(log *Logger, xmon *Xmon, format string, v ...any) {
func (x *Xmon) Raise(log *Logger, xmon *Xmon, logLevel LogLevel, format string, v ...any) {
alertId := int64(-1)
xmonRaiseStack(log, x, 1, &alertId, true, 0, format, v...)
xmonRaiseStack(log, x, logLevel, 1, &alertId, true, 0, format, v...)
}
// RaiseStack raises this alert through xmonRaiseStack using the alert's own
// id and quiet period, with binnable set to false. The returned message is
// stored in a.lastMessage. calldepth is forwarded incremented by one. In the
// signature that takes logLevel, that level is passed through.
func (a *XmonNCAlert) RaiseStack(log *Logger, xmon *Xmon, calldepth int, format string, v ...any) {
a.lastMessage = xmonRaiseStack(log, xmon, 1+calldepth, &a.alertId, false, a.quietPeriod, format, v...)
func (a *XmonNCAlert) RaiseStack(log *Logger, xmon *Xmon, logLevel LogLevel, calldepth int, format string, v ...any) {
a.lastMessage = xmonRaiseStack(log, xmon, logLevel, 1+calldepth, &a.alertId, false, a.quietPeriod, format, v...)
}
// Raise raises this alert through xmonRaiseStack with a fixed call depth of
// 1, using the alert's own id and quiet period, with binnable set to false.
// The returned message is stored in a.lastMessage. In the signature that
// takes logLevel, that level is passed through.
func (a *XmonNCAlert) Raise(log *Logger, xmon *Xmon, format string, v ...any) {
a.lastMessage = xmonRaiseStack(log, xmon, 1, &a.alertId, false, a.quietPeriod, format, v...)
func (a *XmonNCAlert) Raise(log *Logger, xmon *Xmon, logLevel LogLevel, format string, v ...any) {
a.lastMessage = xmonRaiseStack(log, xmon, logLevel, 1, &a.alertId, false, a.quietPeriod, format, v...)
}
func (a *XmonNCAlert) Clear(log *Logger, xmon *Xmon) {

1
kmod/.gitignore vendored
View File

@@ -12,3 +12,4 @@ eggsfs-client-*
linux*
*.img
revision.c
html