00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <config.h>
00021
00022 #include <stdio.h>
00023 #include <errno.h>
00024 #include <signal.h>
00025 #include <stdarg.h>
00026 #include <stdlib.h>
00027 #include <setjmp.h>
00028 #include <sys/socket.h>
00029
00030 #ifdef HAVE_STRING_H
00031 #include <string.h>
00032 #endif
00033
00034 #ifdef HAVE_UNISTD_H
00035 #include <unistd.h>
00036 #endif
00037
00038 #include "monitor.h"
00039 #include "alert.h"
00040 #include "net.h"
00041 #include "monit_process.h"
00042
00043
00044 static sigjmp_buf timeout;
00045
00046
00047 static void do_validate(Process_T);
00048 static void do_restart(Process_T, char *, ...) ;
00049 static void do_start(Process_T, char *, ...) ;
00050 static void do_stop(Process_T, char *, ...);
00051 static int do_not_validate(Process_T);
00052 static int check_connection(Process_T, Port_T, char *);
00053 static int check_resources(Process_T, Resource_T, char *);
00054 static int check_process_state(Process_T, char *);
00055 static int check_skip(Process_T);
00056 static int check_timeout(Process_T);
00057 static int check_checksum(Process_T);
00058 static int checksum_helper(Process_T, char *, char *);
00059 static void connection_timeout(int);
00060 static void reset_resource_counter(Process_T);
00061 static void vlog(char * report, int n, Process_T p, char *m,...);
00062
00080
00081
00082
00086 void validate() {
00087
00088 Process_T p;
00089
00090 if ( ! update_loadavg() ) {
00091
00092 log("Update of loadavg has failed!\n");
00093
00094 }
00095
00096 for (p= processlist; p; p= p->next) {
00097
00098 do_validate(p);
00099
00100 }
00101
00102 }
00103
00104
00105
00106
00107
00116 static void do_validate(Process_T p) {
00117
00118 Port_T pp;
00119 Resource_T pr;
00120 pid_t pid= -1;
00121 sigset_t ns,os;
00122 char report[STRLEN];
00123
00124
00125
00126 if (do_not_validate(p)) return;
00127
00128
00129 sigemptyset(&ns);
00130 sigaddset(&ns, SIGTERM);
00131 pthread_sigmask(SIG_BLOCK, &ns, &os);
00132
00133
00134 if (!(pid= is_process_running(p))) {
00135
00136 log("'%s' is not running.\n", p->name);
00137 do_start(p, "Reason: Process is not running.");
00138 goto reinstall;
00139
00140 } else {
00141
00142 if (Run.debug) {
00143
00144 log("'%s' is running with pid %d\n", p->name, (int)pid);
00145
00146 }
00147
00148 }
00149
00150 if ( Run.doprocess ) {
00151
00152 if ( update_process_data(p, pid) ) {
00153
00154 if ( ! check_process_state(p, report)) {
00155
00156 smtp_alert_resource(p, "Reason: %s\n", report);
00157
00158 } else {
00159
00160 if ( Run.debug ) {
00161
00162 log("'%s' check_process_state() passed.\n", p->name);
00163
00164 }
00165
00166 }
00167
00168 for( pr= p->resourcelist; pr; pr= pr->next ) {
00169
00170 if ( !check_resources(p, pr, report) ) {
00171
00172 switch (pr->action) {
00173 case RESOURCE_ACTION_ALERT:
00174 smtp_alert_resource(p, "Reason: %s\n", report);
00175
00176
00177 pr->cycle=0;
00178 break;
00179
00180 case RESOURCE_ACTION_STOP:
00181
00182 do_stop(p, "Reason: %s\n", report);
00183 reset_resource_counter(p);
00184 goto reinstall;
00185
00186 case RESOURCE_ACTION_RESTART:
00187
00188 do_restart(p, "Reason: %s\n", report);
00189 reset_resource_counter(p);
00190 goto reinstall;
00191
00192 default:
00193
00194 log("'%s' Unknow resource failure action.\n", p->name);
00195 break;
00196
00197
00198
00199 }
00200
00201 }
00202
00203 }
00204
00205 } else {
00206
00207 log("'%s' failed to get process data\n", p->name);
00208
00209
00210
00211 }
00212
00213 }
00214
00215
00216 for(pp= p->portlist; pp; pp= pp->next) {
00217
00218 if (!check_connection(p, pp, report)) {
00219
00220 do_restart(p, "Reason: %s\n", report);
00221 goto reinstall;
00222
00223 }
00224
00225 }
00226
00227 reinstall:
00228
00229
00230 pthread_sigmask(SIG_SETMASK, &os, NULL);
00231
00232 }
00233
00234
00239 static void do_restart(Process_T p, char *m, ...) {
00240
00241 va_list ap;
00242 char *tmp = NULL;
00243
00244 if(!p->start || !p->stop || !p->do_validate) {
00245
00246 return;
00247
00248 }
00249
00250 log("Trying to restart '%s'\n", p->name);
00251
00252 stop_process(p);
00253 sleep(10);
00254
00255 va_start(ap, m);
00256
00257 if (m) {
00258
00259 tmp=format(m,ap);
00260
00261 }
00262
00263 va_end(ap);
00264
00265
00266 do_start(p, "%s", tmp );
00267 free(tmp);
00268
00269 }
00270
00271
00276 static void do_start(Process_T p, char *m, ...) {
00277
00278 va_list ap;
00279 char *tmp = NULL;
00280
00281 if (!p->start || !p->do_validate) {
00282
00283 return;
00284
00285 }
00286
00287 if (p->def_timeout) {
00288
00289 p->nstart++;
00290
00291 }
00292
00293 start_process(p);
00294
00295
00296
00297
00298 sleep(3);
00299
00300 va_start(ap, m);
00301
00302 if (m) {
00303
00304 tmp=format(m,ap);
00305
00306 }
00307
00308 va_end(ap);
00309
00310 smtp_alert_restart(p, "%s", tmp);
00311
00312 free(tmp);
00313
00314 }
00315
00316
00321 static void do_stop(Process_T p, char *m, ...) {
00322
00323 va_list ap;
00324 char *tmp = NULL;
00325
00326 if ( !p->stop || !p->do_validate ) {
00327
00328 return;
00329
00330 }
00331
00332 LOCK(Run.mutex)
00333 p->do_validate= FALSE;
00334 END_LOCK;
00335
00336
00337 va_start(ap, m);
00338
00339 if (m) {
00340
00341 tmp=format(m,ap);
00342
00343 }
00344
00345 va_end(ap);
00346
00347
00348 smtp_alert_stop(p, "%s", tmp);
00349
00350 free(tmp);
00351
00352 sleep(3);
00353
00354 stop_process(p);
00355
00356 }
00357
00358
00363 static int do_not_validate(Process_T p) {
00364
00365 return (!p->do_validate || check_skip(p) || check_timeout(p) ||
00366 check_checksum(p));
00367
00368 }
00369
00370
00375 static int check_connection(Process_T p, Port_T pp, char *report) {
00376
00377 volatile int rv= TRUE;
00378
00379
00380 if (sigsetjmp(timeout, TRUE)) {
00381
00382 snprintf(report, STRLEN,
00383 "timed out when testing %s [%s]",
00384 pp->address, pp->protocol->name);
00385
00386 log("'%s' %s\n", p->name, report);
00387
00388 rv= FALSE;
00389 goto error;
00390
00391 }
00392
00393
00394 set_alarm_handler(connection_timeout);
00395 alarm(CHECK_TIMEOUT);
00396
00397
00398
00399 if ((pp->socket= create_generic_socket(pp)) < 0) {
00400
00401 snprintf(report, STRLEN,
00402 "does not accept connection at %s.",
00403 pp->address);
00404
00405 log("'%s' %s\n", p->name, report);
00406
00407 rv= FALSE;
00408 goto error;
00409
00410 } else {
00411
00412 if (Run.debug) {
00413
00414 log("'%s' succeeded connecting to %s\n",
00415 p->name, pp->address);
00416
00417 }
00418
00419 }
00420
00421
00422 if (!check_connection_io(pp)) {
00423
00424 snprintf(report, STRLEN,
00425 "socket at %s is not ready for i|o -- %s",
00426 pp->address, STRERROR);
00427
00428 log("'%s' %s\n", p->name, report);
00429
00430 rv= FALSE;
00431 goto error;
00432
00433 }
00434
00435
00436 if (! pp->protocol->check(pp)) {
00437
00438 snprintf(report, STRLEN,
00439 "test with protocol [%s] failed at %s.",
00440 pp->protocol->name, pp->address);
00441
00442 log("'%s' %s\n", p->name, report);
00443
00444 rv= FALSE;
00445 goto error;
00446
00447 } else {
00448
00449 if (Run.debug) {
00450
00451 log("'%s' succeeded testing protocol [%s] at %s\n",
00452 p->name, pp->protocol->name, pp->address);
00453
00454 }
00455 }
00456
00457 error:
00458 alarm(0);
00459 close_socket(pp->socket);
00460
00461 return rv;
00462
00463 }
00464
00465
00470 static int check_process_state(Process_T p, char *report) {
00471
00472 ProcInfo_T pi= p->procinfo;
00473
00474 if ( pi->status_flag & PROCESS_ZOMBIE ) {
00475
00476 snprintf(report, STRLEN,
00477 "process with pid %d is a zombie\n", pi->pid);
00478
00479 log("'%s' %s\n", p->name, report);
00480
00481 return FALSE;
00482
00483 } else {
00484
00485 if ( Run.debug ) {
00486
00487 log("'%s' zombie check passed [status_flag=%04x]\n",
00488 p->name, pi->status_flag);
00489
00490 }
00491
00492 }
00493
00494 return TRUE;
00495
00496 }
00497
00498
00502 static void reset_resource_counter(Process_T p) {
00503 Resource_T pr;
00504
00505 for( pr= p->resourcelist; pr; pr= pr->next ) {
00506
00507 pr->cycle=0;
00508
00509 }
00510
00511 }
00512
00513
00518 static int check_resources(Process_T p, Resource_T pr, char *report) {
00519
00520 ProcInfo_T pi= p->procinfo;
00521 int okay= TRUE;
00522
00523 switch (pr->resource_id) {
00524 case RESOURCE_ID_CPU_PERCENT:
00525 if ( compare_resource(pi->cpu_percent, pr) ) {
00526
00527 vlog(report, STRLEN, p,
00528 "cpu usage of %.1f%% matches resource limit [cpu usage%s%.1f%%]",
00529 pi->cpu_percent/10.0, operatorshortnames[pr->operator],
00530 pr->limit/10.0);
00531
00532 okay= FALSE;
00533
00534 } else {
00535
00536 if ( Run.debug ) {
00537
00538 log("'%s' cpu usage check passed [current cpu usage=%.1f%%]\n",
00539 p->name, pi->cpu_percent/10.0);
00540
00541 }
00542 }
00543 break;
00544
00545 case RESOURCE_ID_MEM_PERCENT:
00546 if ( compare_resource(pi->mem_percent, pr) ) {
00547
00548 vlog(report, STRLEN, p,
00549 "mem usage of %.1f%% matches resource limit [mem usage%s%.1f%%]",
00550 pi->mem_percent/10.0, operatorshortnames[pr->operator],
00551 pr->limit/10.0);
00552
00553 okay= FALSE;
00554
00555 } else {
00556
00557 if ( Run.debug ) {
00558
00559 log("'%s' mem usage check passed [current mem usage=%.1f%%]\n",
00560 p->name, pi->mem_percent/10.0);
00561
00562 }
00563 }
00564 break;
00565
00566 case RESOURCE_ID_MEM_KBYTE:
00567
00568 if ( compare_resource(pi->mem_kbyte, pr) ) {
00569
00570 vlog(report, STRLEN, p,
00571 "mem amount of %ldkB matches resource limit [mem amount%s%ldkB]",
00572 pi->mem_kbyte, operatorshortnames[pr->operator],
00573 pr->limit);
00574
00575 okay= FALSE;
00576
00577 } else {
00578
00579 if ( Run.debug ) {
00580
00581 log("'%s' mem amount check passed [current mem amount=%ldkB]\n",
00582 p->name, pi->mem_kbyte);
00583
00584 }
00585 }
00586 break;
00587
00588 case RESOURCE_ID_LOAD1:
00589
00590 if ( compare_resource((int) (Run.loadavg[0]*10.0), pr) ) {
00591
00592 vlog(report, STRLEN, p,
00593 "loadavg(1min) of %.1f matches resource limit "
00594 "[loadavg(1min)%s%.1f]",
00595 Run.loadavg[0], operatorshortnames[pr->operator],
00596 pr->limit/10.0);
00597
00598 okay= FALSE;
00599
00600 } else {
00601
00602 if ( Run.debug ) {
00603
00604 log("'%s' loadavg(1min) check passed [current loadavg(1min)=%.1f]\n",
00605 p->name, Run.loadavg[0] );
00606
00607 }
00608 }
00609 break;
00610
00611 case RESOURCE_ID_LOAD5:
00612
00613 if ( compare_resource((int) (Run.loadavg[1]*10.0), pr) ) {
00614
00615 vlog(report, STRLEN, p,
00616 "loadavg(5min) of %.1f matches resource limit "
00617 "[loadavg(5min)%s%.1f]",
00618 Run.loadavg[1], operatorshortnames[pr->operator],
00619 pr->limit/10.0);
00620
00621 okay= FALSE;
00622
00623 } else {
00624
00625 if ( Run.debug ) {
00626
00627 log("'%s' loadavg(5min) check passed [current loadavg(5min)=%.1f]\n",
00628 p->name, Run.loadavg[1]);
00629
00630 }
00631 }
00632 break;
00633
00634 case RESOURCE_ID_LOAD15:
00635
00636 if ( compare_resource((int) (Run.loadavg[2]*10.0), pr) ) {
00637
00638 vlog(report, STRLEN, p,
00639 "loadavg(15min) of %.1f matches resource limit "
00640 "[loadavg(15min)%s%.1f]",
00641 Run.loadavg[2], operatorshortnames[pr->operator],
00642 pr->limit/10.0);
00643
00644 okay= FALSE;
00645
00646 } else {
00647
00648 if ( Run.debug ) {
00649
00650 log("'%s' loadavg(15min) check passed "
00651 "[current loadavg(15min)=%.1f]\n",
00652 p->name, Run.loadavg[2]);
00653
00654 }
00655 }
00656 break;
00657
00658 default:
00659
00660 log("'%s' error: unknow resource ID: [%d]\n", p->name, pr->resource_id);
00661
00662
00663
00664 }
00665
00666 if ( okay && pr->cycle > 0 ) {
00667
00668 pr->cycle--;
00669
00670 } else if ( ! okay ) {
00671
00672 pr->cycle++;
00673
00674 }
00675
00676 if ( pr->cycle >= pr->max_cycle ) {
00677
00678 return FALSE;
00679
00680 }
00681
00682 return TRUE;
00683 }
00684
00685
00689 static int check_timeout(Process_T p) {
00690
00691 if (!p->def_timeout) {
00692
00693 return FALSE;
00694
00695 }
00696
00697
00698
00699
00700 if (p->nstart > 0) {
00701
00702 p->ncycle++;
00703
00704 }
00705
00706
00707
00708
00709 if (p->nstart >= p->to_start && p->ncycle <= p->to_cycle) {
00710
00711
00712
00713
00714
00715
00716 LOCK(Run.mutex)
00717 p->do_validate= FALSE;
00718 END_LOCK;
00719
00720
00721
00722
00723 log("**Alert** process '%s' timed out and will not be checked anymore.\n",
00724 p->name);
00725
00726 smtp_alert_timeout(p, NULL);
00727
00728 return TRUE;
00729
00730 }
00731
00732
00733
00734
00735
00736 if (p->ncycle > p->to_cycle) {
00737
00738 p->ncycle= 0;
00739 p->nstart= 0;
00740
00741 }
00742
00743 return FALSE;
00744
00745 }
00746
00747
00752 static int check_skip(Process_T p) {
00753
00754 if (!p->def_every) {
00755
00756 return FALSE;
00757
00758 }
00759
00760 if (++p->nevery < p->every) {
00761
00762 return TRUE;
00763
00764 }
00765
00766 p->nevery= 0;
00767
00768 return FALSE;
00769
00770 }
00771
00772
00780 static int check_checksum(Process_T p) {
00781
00782 Checksum_T c;
00783
00784 if (!p->def_checksum) {
00785
00786 return FALSE;
00787
00788 }
00789
00790 for (c= p->checksumlist; c; c= c->next) {
00791
00792 if (! checksum_helper(p, c->file, c->md5)) {
00793
00794 return TRUE;
00795
00796 }
00797
00798 }
00799
00800 if (Run.debug) {
00801
00802 log("'%s' have valid checksums\n", p->name);
00803
00804 }
00805
00806 return FALSE;
00807
00808 }
00809
00810
00817 static int checksum_helper(Process_T p, char *program, char *sum) {
00818
00819 int rv= TRUE;
00820
00821 if(program && sum) {
00822
00823 if(!check_md5(program, sum)) {
00824
00825 log("'%s' **Alert** checksum error for %s\n", p->name, program);
00826
00827 smtp_alert_checksum(p, NULL);
00828
00829 LOCK(Run.mutex)
00830 p->do_validate= FALSE;
00831 p->has_checksum_error= TRUE;
00832 END_LOCK;
00833
00834 rv= FALSE;
00835
00836 }
00837
00838 }
00839
00840 return rv;
00841
00842 }
00843
00844
00848 static void connection_timeout(int sig) {
00849
00850 siglongjmp(timeout, TRUE);
00851
00852 }
00853
00854
00858 static void vlog(char * report, int n, Process_T p, char *m,...) {
00859
00860 va_list ap;
00861 char *tmp = NULL;
00862
00863 va_start(ap, m);
00864
00865 if (m) {
00866
00867 tmp=format(m,ap);
00868
00869 }
00870
00871 va_end(ap);
00872
00873 strncpy(report, tmp, n);
00874 log("'%s' %s\n", p->name, report);
00875
00876 free(tmp);
00877 }