diff options
author | Julien Dessaux | 2020-01-08 11:22:14 +0100 |
---|---|---|
committer | Julien Dessaux | 2020-01-08 15:09:31 +0100 |
commit | 4724979262cbbb6792412881fc812ab2101631ef (patch) | |
tree | 067e304d64d447ddf013b246905253f3d444f532 | |
parent | Initial import (diff) | |
download | bareos-zabbix-check-4724979262cbbb6792412881fc812ab2101631ef.tar.gz bareos-zabbix-check-4724979262cbbb6792412881fc812ab2101631ef.tar.bz2 bareos-zabbix-check-4724979262cbbb6792412881fc812ab2101631ef.zip |
Improved error messages and added a readme
-rw-r--r-- | README.md | 74 | ||||
-rw-r--r-- | main.go | 2 | ||||
-rw-r--r-- | spool.go | 12 | ||||
-rw-r--r-- | state.go | 31 |
4 files changed, 100 insertions, 19 deletions
diff --git a/README.md b/README.md new file mode 100644 index 0000000..2f5c98d --- /dev/null +++ b/README.md @@ -0,0 +1,74 @@ +# Zabbix check for bareos backups + +This repository contains code for a go program that can inspect a bareos status file to check the last run jobs. It outputs errors if a job's last run did not end successfully, or if a job is missing (i.e. it did not run). It should also be compatible with bacula. + +This program was born from a need to query the status of the backups from the client machine and report it in zabbix at my workplace. Being a zabbix check, it must exit with code 0 even when reporting errors; be warned if you intend to use it with something other than zabbix. Changing this behaviour to suit your needs should not be hard at all though. + +## Dependencies + +go is required. Only go version >= 1.13.5 on linux amd64 has been tested. + +## Building + +For a debug build, use : +``` +go build +``` + +For a release build, use : +``` +go build -ldflags="-s -w" +``` + +## Usage + +The common way to run this check is without any argument : +``` +./bareos-zabbix-check +``` + +There are several flags available if you need to override the defaults : + - -f string : Force the state file to use, defaults to bareos-fd.9102.state if it exists else bacula-fd.9102.state. + - -q bool : Suppress all output, suitable to force a silent update of the spool file. + - -v bool : Activates verbose debugging output, defaults to false. + - -w string : Force the work directory to use, defaults to /var/lib/bareos if it exists else /var/lib/bacula. + +## Output + +Like all zabbix checks, the program will exit 0 whatever happens. You will use the output in your triggers. + +If there were no errors and there are no missing jobs, the program simply outputs : `OK`. The program outputs an `INFO <message>` if no backup has ever run (mainly a bootstrap situation) or if any special error occurred. 
The program outputs an `AVERAGE <message>` if there was an error during the last run of a job, or if a job didn't run successfully in the last 24 hours. + +Here is a list of the possible error messages and their meaning : + - `AVERAGE: errors:%s missing:%s additionnal errors: %s` : there are backup errors or missing jobs. + - `AVERAGE: Couldn't save spool : %s` : the program could not save its spool file in the work directory. + - `INFO Invalid work directory %s : it does not exist or is not a directory.` : you manually specified a work directory with the `-w` flag and it is invalid. + - `INFO Could not find a suitable work directory. Is bareos or bacula installed?` : neither /var/lib/bareos nor /var/lib/bacula seem to exist. + - `INFO The state file %s does not exist.\n` : you manually specified a state file with the `-f` flag and it is invalid or does not exist in the working directory. + - `INFO Could not find a suitable state file. Has a job ever run?` : neither bareos-fd.9102.state nor bacula-fd.9102.state seem to exist in the working directory. + - `INFO Couldn't open state file : %s` : the bacula or bareos state file could not be opened. + - `INFO Invalid state file : This script only supports bareos state file version 4, got %d` : The bacula or bareos version installed is not supported (yet!). + - `INFO Corrupted state file : %s` : the bacula or bareos state file could not be parsed. + - `INFO No jobs exist in the state file` : no jobs were found in the state file. + - `INFO Couldn't parse job name, this shouldn't happen : %s` : the program uses a regex to strip time and date from a job entry and it did not work. This is a bug in this program! Please open an issue. + +## Limitations + +### No alerts if a job fails to start on its first run + +The Bareos file daemon holds no status reference for a job that never started properly. Therefore any director misconfiguration will not be caught by this program unless the job ran successfully at least once. 
If it did, the job will be reported as missing. + +### False positives + +The Bareos status file only holds the last 10 jobs that ran on the host. This should be enough for nearly all use cases, but if a host has many jobs it won't do. + +The solution to this is to have a `Client Run After Job` entry that runs this program after each job in order to have the program record that successful run in its spool. + +### Missing job alert when you legitimately remove a job in the director's configuration + +Because of the way we record jobs in a spool file in order to track missing jobs, if you remove a job in the director's configuration you will get a missing job alert the next day. To avoid this you just need to : + - stop the bareos file daemon + - delete the bareos file daemon status file (/var/lib/bareos/bareos-fd.9102.state by default) + - start the bareos file daemon again + - run any job in order to have the file daemon recreate a valid status file + - delete the line referencing this job in the spool file (/var/lib/bareos/bareos-zabbix-check.spool by default)
\ No newline at end of file @@ -118,7 +118,7 @@ func main() { } // we write this new spool if err2 := saveSpool(successfulJobs); err2 != nil { - fmt.Printf("AVERAGE: Couldn't save spool : %s\n", err2) + fmt.Printf("AVERAGE: Error saving the spool file : %s\n", err2) os.Exit(0) } @@ -20,12 +20,12 @@ func loadSpool() (entries jobs, err error) { // We read the spool file, err = os.Open(path.Join(workDir, spoolFile)) if err != nil { - return nil, fmt.Errorf("INFO Couldn't open spool file: %s", err) + return nil, fmt.Errorf("Couldn't open spool file, starting from scratch: %s", err) } defer file.Close() lines, err = csv.NewReader(file).ReadAll() if err != nil { - return nil, fmt.Errorf("INFO Corrupted spool file : %s", err) + return nil, fmt.Errorf("Corrupted spool file, starting from scratch : %s", err) } if verbose { log.Printf("Spool file content : %v\n", lines) @@ -33,12 +33,10 @@ func loadSpool() (entries jobs, err error) { entries = make(map[string]uint64) for _, line := range lines { - var ( - i int - ) + var i int i, err = strconv.Atoi(line[1]) if err != nil { - return nil, fmt.Errorf("INFO Corrupted spool file : couldn't parse timestamp entry") + return nil, fmt.Errorf("Corrupted spool file : couldn't parse timestamp entry") } entries[line[0]] = uint64(i) } @@ -55,7 +53,7 @@ func saveSpool(entries jobs) (err error) { ) file, err = os.Create(path.Join(workDir, spoolFile)) if err != nil { - return fmt.Errorf("INFO Couldn't open spool file for writing : %s", err) + return } defer file.Close() @@ -7,6 +7,7 @@ import ( "log" "os" "regexp" + "time" ) // stateFileHeader : A structure to hold the header of the state file. 
It is statically aligned for amd64 architecture @@ -47,8 +48,8 @@ func (je jobEntry) String() string { if len(matches) >= 4 { jobNameLen = matches[3] } - return fmt.Sprintf("Errors: %d, JobType: %c, JobStatus: %c, JobLevel: %c, JobID: %d, VolSessionID: %d, VolSessionTime: %d, JobFiles: %d, JobBytes: %d, StartTime: %d, EndTime: %d, Job: %s", - je.Errors, je.JobType, je.JobStatus, je.JobLevel, je.JobID, je.VolSessionID, je.VolSessionTime, je.JobFiles, je.JobBytes, je.StartTime, je.EndTime, je.Job[:jobNameLen]) + return fmt.Sprintf("Errors: %d, JobType: %c, JobStatus: %c, JobLevel: %c, JobID: %d, VolSessionID: %d, VolSessionTime: %d, JobFiles: %d, JobBytes: %d, StartTime: %s, EndTime: %s, Job: %s", + je.Errors, je.JobType, je.JobStatus, je.JobLevel, je.JobID, je.VolSessionID, je.VolSessionTime, je.JobFiles, je.JobBytes, time.Unix(int64(je.StartTime), 0), time.Unix(int64(je.EndTime), 0), je.Job[:jobNameLen]) } const ( @@ -63,15 +64,14 @@ const ( var jobNameRegex = regexp.MustCompilePOSIX(`^([-A-Za-z0-9_]+)\.[0-9]{4}-[0-9]{2}-[0-9]{2}.*`) // readNextBytes : Reads the next "number" bytes from a "file", returns the number of bytes actually read as well as the bytes read -func readNextBytes(file *os.File, number int) (int, []byte) { - var bytes = make([]byte, number) - - var n, err = file.Read(bytes) +func readNextBytes(file *os.File, number int) (n int, bytes []byte, err error) { + bytes = make([]byte, number) + n, err = file.Read(bytes) if err != nil { - fmt.Printf("INFO Corrupted state file : file.Read failed in %s : %s\n", stateFile, err) + return 0, nil, fmt.Errorf("file.Read failed in %s : %s", stateFile, err) } - return n, bytes + return } func parseStateFile() (successfulJobs jobs, errorJobs jobs, err error) { @@ -92,7 +92,10 @@ func parseStateFile() (successfulJobs jobs, errorJobs jobs, err error) { // Parsing the state file header var header stateFileHeader - n, data = readNextBytes(stateFileHandle, stateFileHeaderLength) + n, data, err = 
readNextBytes(stateFileHandle, stateFileHeaderLength) + if err != nil { + return nil, nil, fmt.Errorf("INFO Corrupted state file : %s", err) + } if n != stateFileHeaderLength { return nil, nil, fmt.Errorf("INFO Corrupted state file : invalid header length in %s", stateFile) } @@ -118,7 +121,10 @@ func parseStateFile() (successfulJobs jobs, errorJobs jobs, err error) { stateFileHandle.Seek(int64(header.LastJobsAddr), 0) // We read how many jobs there are in the state file - n, data = readNextBytes(stateFileHandle, 4) + n, data, err = readNextBytes(stateFileHandle, 4) + if err != nil { + return nil, nil, fmt.Errorf("INFO Corrupted state file : %s", err) + } if n != 4 { return nil, nil, fmt.Errorf("INFO Corrupted state file : invalid numberOfJobs read length in %s", stateFile) } @@ -139,7 +145,10 @@ func parseStateFile() (successfulJobs jobs, errorJobs jobs, err error) { jobResult jobEntry jobName string ) - n, data = readNextBytes(stateFileHandle, jobResultLength) + n, data, err = readNextBytes(stateFileHandle, jobResultLength) + if err != nil { + return nil, nil, fmt.Errorf("INFO Corrupted state file : %s", err) + } if n != jobResultLength { return nil, nil, fmt.Errorf("INFO Corrupted state file : invalid job entry in %s", stateFile) } |