Skip to content

fdbkubernetesmonitor: Add a check for new binaries in the shared binary directory and report them back in an annotation #12230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions fdbkubernetesmonitor/api/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,34 @@
package api

const (
// FoundationDBPrefix represents the prefix for all FoundationDB related annotations.
FoundationDBPrefix = "foundationdb.org/"

// CurrentConfigurationAnnotation is the annotation we use to store the
// latest configuration.
CurrentConfigurationAnnotation = "foundationdb.org/launcher-current-configuration"
CurrentConfigurationAnnotation = FoundationDBPrefix + "launcher-current-configuration"

// EnvironmentAnnotation is the annotation we use to store the environment
// variables.
EnvironmentAnnotation = "foundationdb.org/launcher-environment"
EnvironmentAnnotation = FoundationDBPrefix + "launcher-environment"

// OutdatedConfigMapAnnotation is the annotation we read to get notified of
// outdated configuration.
OutdatedConfigMapAnnotation = "foundationdb.org/outdated-config-map-seen"
OutdatedConfigMapAnnotation = FoundationDBPrefix + "outdated-config-map-seen"

// DelayShutdownAnnotation defines how long the FDB Kubernetes monitor process should sleep before shutting itself down.
// The FDB Kubernetes monitor will always shutdown all fdbserver processes, independent of this setting.
// The value of this annotation must be a duration like "60s".
DelayShutdownAnnotation = "foundationdb.org/delay-shutdown"
DelayShutdownAnnotation = FoundationDBPrefix + "delay-shutdown"

// ClusterFileChangeDetectedAnnotation is the annotation that will be updated if the fdb.cluster file is updated.
ClusterFileChangeDetectedAnnotation = "foundationdb.org/cluster-file-change"
ClusterFileChangeDetectedAnnotation = FoundationDBPrefix + "cluster-file-change"

// IsolateProcessGroupAnnotation is the annotation that defines if the current Pod should be isolated. Isolated
// process groups will shutdown the fdbserver instance but keep the Pod and other Kubernetes resources running
// for debugging purpose.
IsolateProcessGroupAnnotation = "foundationdb.org/isolate-process-group"
IsolateProcessGroupAnnotation = FoundationDBPrefix + "isolate-process-group"

// AvailableBinariesAnnotation is the annotation we use to store the available binaries on this Pod.
AvailableBinariesAnnotation = FoundationDBPrefix + "available-binaries"
)
2 changes: 2 additions & 0 deletions fdbkubernetesmonitor/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ require (
github.com/go-openapi/swag v0.23.1 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.9 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/pprof v0.0.0-20250607225305-033d6d78b36a // indirect
Expand All @@ -72,6 +73,7 @@ require (
go.yaml.in/yaml/v2 v2.4.2 // indirect
golang.org/x/net v0.41.0 // indirect
golang.org/x/oauth2 v0.30.0 // indirect
golang.org/x/sync v0.15.0 // indirect
golang.org/x/sys v0.33.0 // indirect
golang.org/x/term v0.32.0 // indirect
golang.org/x/text v0.26.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions fdbkubernetesmonitor/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU=
github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k=
github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ=
github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU=
github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM=
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
Expand All @@ -29,6 +31,8 @@ github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1v
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg=
github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4=
github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw=
github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
Expand Down Expand Up @@ -124,6 +128,8 @@ golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKl
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8=
golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
Expand Down
19 changes: 19 additions & 0 deletions fdbkubernetesmonitor/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,28 @@ func (podClient *kubernetesClient) updateAnnotations(monitor *monitor) error {
return err
}

availableBinaries, err := json.Marshal(monitor.availableBinaries)
if err != nil {
return err
}

return podClient.updateAnnotationsOnPod(map[string]string{
api.CurrentConfigurationAnnotation: string(monitor.activeConfigurationBytes),
api.EnvironmentAnnotation: string(jsonEnvironment),
api.AvailableBinariesAnnotation: string(availableBinaries),
})
}

// updateAvailableBinariesAnnotation updates the api.AvailableBinariesAnnotation annotation on the pod
// after a new fdbserver binary was copied into the shared directory.
func (podClient *kubernetesClient) updateAvailableBinariesAnnotation(currentBinaries map[string]struct{}) error {
availableBinaries, err := json.Marshal(currentBinaries)
if err != nil {
return err
}

return podClient.updateAnnotationsOnPod(map[string]string{
api.AvailableBinariesAnnotation: string(availableBinaries),
})
}

Expand Down
5 changes: 4 additions & 1 deletion fdbkubernetesmonitor/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ import (
"syscall"

"github.com/apple/foundationdb/fdbkubernetesmonitor/api"

"github.com/go-logr/logr"
"github.com/go-logr/zapr"
"github.com/spf13/pflag"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"gopkg.in/natefinch/lumberjack.v2"
ctrl "sigs.k8s.io/controller-runtime"
)

var (
Expand Down Expand Up @@ -174,6 +174,9 @@ func main() {
logger.Error(err, "Error parsing container version", "currentContainerVersion", currentContainerVersion)
os.Exit(1)
}

// Update the logger for the controller-runtime.
ctrl.SetLogger(logger)
startMonitor(context.Background(), logger, path.Join(inputDir, monitorConfFile), customEnvironment, processCount, promConfig, enablePprof, parsedVersion, enableNodeWatch)
case executionModeInit:
err = copyFiles(logger, outputDir, copyDetails, requiredCopies)
Expand Down
103 changes: 100 additions & 3 deletions fdbkubernetesmonitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"os/exec"
"os/signal"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -107,6 +108,15 @@ type monitor struct {

// metrics represents the prometheus monitor metrics.
metrics *metrics

// runVersionCommand when set to false the monitor will not try to run the fdbserver --version command to ensure
// that the binary is executable.
runVersionCommand bool

// availableBinaries represents all available binaries in the sharedBinaryDir. Most of the time this map will be
// empty but during version incompatible upgrades, this information will be used to signal the operator that
// the new fdbserver binary is present and executable in the sharedBinaryDir.
availableBinaries map[string]struct{}
}

type httpConfig struct {
Expand All @@ -129,6 +139,8 @@ func startMonitor(ctx context.Context, logger logr.Logger, configFile string, cu
processCount: processCount,
processIDs: make([]int, processCount+1),
currentContainerVersion: currentContainerVersion,
runVersionCommand: true,
availableBinaries: map[string]struct{}{},
}

go func() { mon.watchPodTimestamps() }()
Expand Down Expand Up @@ -253,7 +265,8 @@ func (monitor *monitor) readConfiguration() (*api.ProcessConfiguration, []byte)
configuration.BinaryPath = path.Join(sharedBinaryDir, configuration.Version.String(), "fdbserver")
}

err = checkOwnerExecutable(configuration.BinaryPath)
// TODO (johscheuer): Should we run this check every time?
err = checkOwnerExecutable(configuration.BinaryPath, monitor.runVersionCommand)
if err != nil {
monitor.logger.Error(err, "Error with binary path for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath)
return nil, nil
Expand Down Expand Up @@ -295,14 +308,26 @@ func (monitor *monitor) loadConfiguration() {

// checkOwnerExecutable validates that a path is a file that exists and is
// executable by its owner.
func checkOwnerExecutable(path string) error {
func checkOwnerExecutable(path string, runVersionCommand bool) error {
binaryStat, err := os.Stat(path)
if err != nil {
return err
}
if binaryStat.Mode()&0o100 == 0 {
return fmt.Errorf("binary is not executable")
}

if !runVersionCommand {
return nil
}

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, path, "--version")
if err = cmd.Run(); err != nil {
return fmt.Errorf("could not run the version command with binary: %s, error: %w", path, err)
}

return nil
}

Expand Down Expand Up @@ -523,7 +548,7 @@ func (monitor *monitor) watchConfiguration(watcher *fsnotify.Watcher) {
return
}

monitor.logger.Info("Detected event on monitor conf file or cluster file", "event", event)
monitor.logger.Info("Detected event on monitor conf file, cluster file or shared binaries", "event", event)
if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
monitor.handleFileChange(event.Name)
} else if event.Op&fsnotify.Remove == fsnotify.Remove {
Expand All @@ -542,6 +567,59 @@ func (monitor *monitor) watchConfiguration(watcher *fsnotify.Watcher) {
}
}

// getBinariesFromSharedBinaryDir returns all fdbserver binaries that are found in the shared binary directory.
func (monitor *monitor) waitForSharedBinariesAndUpdateAnnotation(dir string) error {
var fdbserverBinaries []string

startTime := time.Now()
for len(fdbserverBinaries) == 0 {
// If after 5 minutes the new fdbserver binary was not copied, something is probably wrong.
if time.Since(startTime) > 5*time.Minute {
return fmt.Errorf("could not find fdbserver binary in shared binary dir after more than 2 minutes")
}

monitor.logger.Info("Checking shared binary dir for new fdbserver binary", "sharedBinaryDir", sharedBinaryDir, "fdbserverBinaries", fdbserverBinaries)
err := filepath.Walk(dir,
func(currentPath string, info os.FileInfo, err error) error {
if err != nil {
return err
}

if info.IsDir() {
return nil
}

if path.Base(currentPath) != "fdbserver" {
return nil
}

monitor.logger.Info("found new fdbserver binary in shared binary dir", "sharedBinaryDir", sharedBinaryDir, "currentPath", currentPath)
fdbserverBinaries = append(fdbserverBinaries, currentPath)

return nil
})

if err != nil {
monitor.logger.Error(err, "Error getting binaries from sharedBinaryDir", "sharedBinaryDir", sharedBinaryDir)
}

time.Sleep(1 * time.Second)
}

for _, binary := range fdbserverBinaries {
err := checkOwnerExecutable(binary, monitor.runVersionCommand)
if err != nil {
monitor.logger.Error(err, "Error with binary in shared binary directory", "sharedBinaryDir", sharedBinaryDir, "binary", binary)
continue
}

monitor.availableBinaries[binary] = struct{}{}
monitor.logger.Info("Adding new binary to available binaries", "sharedBinaryDir", sharedBinaryDir, "binary", binary)
}

return monitor.podClient.updateAvailableBinariesAnnotation(monitor.availableBinaries)
}

// handleFileChange will perform the required action based on the changed/modified file.
func (monitor *monitor) handleFileChange(changedFile string) {
if changedFile == fdbClusterFilePath {
Expand All @@ -552,6 +630,18 @@ func (monitor *monitor) handleFileChange(changedFile string) {
return
}

// If the changed file is in the shared binary path, check if the binary can be executed. If the binary is
// executable then we can add it to the available binaries.
if strings.HasPrefix(changedFile, sharedBinaryDir) {
go func(dir string) {
err := monitor.waitForSharedBinariesAndUpdateAnnotation(sharedBinaryDir)
if err != nil {
monitor.logger.Error(err, "Error getting binaries from sharedBinaryDir", "sharedBinaryDir", sharedBinaryDir, "changedFile", changedFile)
return
}
}(sharedBinaryDir)
}

monitor.loadConfiguration()
}

Expand Down Expand Up @@ -615,6 +705,13 @@ func (monitor *monitor) run() {
panic(err)
}

// Create a watcher for the sharedBinaryDir, this watcher will update the available binaries during an upgrade.
monitor.logger.Info("adding watch for shared binary path", "path", path.Dir(sharedBinaryDir))
err = watcher.Add(path.Dir(sharedBinaryDir))
if err != nil {
panic(err)
}

defer func(watcher *fsnotify.Watcher) {
err := watcher.Close()
if err != nil {
Expand Down