00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <config.h>
00021
00022 #include <stdio.h>
00023 #include <errno.h>
00024 #include <signal.h>
00025 #include <stdarg.h>
00026 #include <stdlib.h>
00027 #include <setjmp.h>
00028 #include <sys/socket.h>
00029
00030 #ifdef HAVE_STRING_H
00031 #include <string.h>
00032 #endif
00033
00034 #ifdef HAVE_UNISTD_H
00035 #include <unistd.h>
00036 #endif
00037
00038 #include "monitor.h"
00039 #include "alert.h"
00040 #include "net.h"
00041 #include "monit_process.h"
00042
00043
00044 static sigjmp_buf timeout;
00045
00046
00047 static void do_validate(Process_T);
00048 static void do_restart(Process_T, char *, ...) ;
00049 static void do_start(Process_T, char *, ...) ;
00050 static void do_stop(Process_T, char *, ...);
00051 static int do_not_validate(Process_T);
00052 static int check_connection(Process_T, Port_T, char *);
00053 static int check_resources(Process_T, Resource_T, char *);
00054 static int check_process_state(Process_T, char *);
00055 static int check_skip(Process_T);
00056 static int check_timeout(Process_T);
00057 static int check_checksum(Process_T);
00058 static int checksum_helper(Process_T, char *, char *);
00059 static int check_timestamp(Process_T, Timestamp_T, char *);
00060 static void connection_timeout(int);
00061 static void reset_resource_counter(Process_T);
00062 static void vlog(char * report, int n, Process_T p, char *m,...);
00063 static int compare_value(int, int, int);
00064
00084
00085
00086
00090 void validate() {
00091
00092 Process_T p;
00093
00094 if(! update_loadavg()) {
00095
00096 log("Update of loadavg has failed!\n");
00097
00098 }
00099
00100 for(p= processlist; p; p= p->next) {
00101 if(p->visited)
00102 continue;
00103 do_validate(p);
00104 }
00105
00106 reset_depend();
00107
00108 }
00109
00110
00111
00112
00113
00122 static void do_validate(Process_T p) {
00123
00124 Port_T pp;
00125 Resource_T pr;
00126 Timestamp_T tl;
00127 pid_t pid= -1;
00128 sigset_t ns,os;
00129 char report[STRLEN];
00130
00131 ASSERT(p);
00132
00133
00134 if(do_not_validate(p)) return;
00135
00136
00137 sigemptyset(&ns);
00138 sigaddset(&ns, SIGTERM);
00139 pthread_sigmask(SIG_BLOCK, &ns, &os);
00140
00141
00142 if(!(pid= is_process_running(p))) {
00143
00144 do_start(p, "Reason: Process is not running.");
00145 goto reinstall;
00146
00147 } else {
00148
00149 if(Run.debug) {
00150
00151 log("'%s' is running with pid %d\n", p->name, (int)pid);
00152
00153 }
00154
00155 }
00156
00157 if(Run.doprocess) {
00158
00159 if(update_process_data(p, pid)) {
00160
00161 if(! check_process_state(p, report)) {
00162
00163 smtp_alert_resource(p, "Reason: %s\n", report);
00164
00165 } else {
00166
00167 if(Run.debug) {
00168
00169 log("'%s' check_process_state() passed.\n", p->name);
00170
00171 }
00172
00173 }
00174
00175 for(pr= p->resourcelist; pr; pr= pr->next) {
00176
00177 if(!check_resources(p, pr, report)) {
00178
00179 switch(pr->action) {
00180 case ACTION_ALERT:
00181 smtp_alert_resource(p, "Reason: %s\n", report);
00182
00183 pr->cycle=0;
00184 break;
00185
00186 case ACTION_STOP:
00187 do_stop(p, "Reason: %s\n", report);
00188 reset_resource_counter(p);
00189 goto reinstall;
00190
00191 case ACTION_RESTART:
00192 do_restart(p, "Reason: %s\n", report);
00193 reset_resource_counter(p);
00194 goto reinstall;
00195
00196 default:
00197 log("'%s' Unknow resource failure action.\n", p->name);
00198 break;
00199 }
00200
00201 }
00202
00203 }
00204
00205 } else {
00206
00207 log("'%s' failed to get process data\n", p->name);
00208
00209 }
00210
00211 }
00212
00213 for(tl= p->timestamplist; tl; tl= tl->next) {
00214
00215 if(!check_timestamp(p, tl, report)) {
00216
00217 switch(tl->action) {
00218 case ACTION_ALERT:
00219 smtp_alert_timestamp(p, "Reason: %s\n", report);
00220 break;
00221
00222 case ACTION_STOP:
00223 do_stop(p, "Reason: %s\n", report);
00224 goto reinstall;
00225
00226 case ACTION_RESTART:
00227 do_restart(p, "Reason: %s\n", report);
00228 goto reinstall;
00229
00230 default:
00231 log("'%s' Unknow timestamp failure action.\n", p->name);
00232 break;
00233
00234 }
00235
00236 }
00237
00238 }
00239
00240
00241 for(pp= p->portlist; pp; pp= pp->next) {
00242
00243 if(!check_connection(p, pp, report)) {
00244
00245 do_restart(p, "Reason: %s\n", report);
00246 goto reinstall;
00247
00248 }
00249
00250 }
00251
00252 reinstall:
00253
00254
00255 pthread_sigmask(SIG_SETMASK, &os, NULL);
00256
00257 }
00258
00259
00264 static void do_restart(Process_T p, char *m, ...) {
00265
00266 va_list ap;
00267 char *tmp = NULL;
00268
00269 ASSERT(p);
00270
00271 if(!p->do_validate)
00272 return;
00273
00274 if(p->def_timeout)
00275 p->nstart++;
00276
00277 va_start(ap, m);
00278 if(m)
00279 tmp=format(m, ap);
00280 va_end(ap);
00281
00282 if(p->mode!=MODE_PASSIVE && p->start && p->stop) {
00283
00284 log("Trying to restart '%s'\n", p->name);
00285
00286 check_process(p->name, "stop", FALSE);
00287 check_process(p->name, "start", FALSE);
00288 smtp_alert_restart(p, "%s", tmp);
00289
00290 } else {
00291
00292 smtp_alert_failed(p, "%s", tmp);
00293
00294 }
00295
00296 free(tmp);
00297
00298 }
00299
00300
00305 static void do_start(Process_T p, char *m, ...) {
00306
00307 va_list ap;
00308 char *tmp = NULL;
00309
00310 ASSERT(p);
00311
00312 if(!p->do_validate)
00313 return;
00314
00315 if(p->def_timeout)
00316 p->nstart++;
00317
00318 va_start(ap, m);
00319 if(m)
00320 tmp=format(m, ap);
00321 va_end(ap);
00322
00323 if(p->mode!= MODE_PASSIVE && p->start) {
00324
00325 check_process(p->name, "start", FALSE);
00326 smtp_alert_restart(p, "%s", tmp);
00327
00328 } else {
00329
00330 smtp_alert_failed(p, "%s", tmp);
00331
00332 }
00333
00334 free(tmp);
00335
00336 }
00337
00338
00343 static void do_stop(Process_T p, char *m, ...) {
00344
00345 va_list ap;
00346 char *tmp = NULL;
00347
00348 ASSERT(p);
00349
00350 if(!p->do_validate)
00351 return;
00352
00353 va_start(ap, m);
00354 if(m)
00355 tmp=format(m, ap);
00356 va_end(ap);
00357
00358 LOCK(Run.mutex)
00359 p->do_validate= FALSE;
00360 END_LOCK;
00361
00362 if(p->mode!= MODE_PASSIVE && p->stop) {
00363
00364 smtp_alert_stop(p, "%s", tmp);
00365 check_process(p->name, "stop", FALSE);
00366
00367 } else {
00368
00369 if(p->def_timeout)
00370 p->nstart++;
00371
00372 smtp_alert_failed(p, "%s", tmp);
00373
00374 }
00375
00376 free(tmp);
00377
00378 }
00379
00380
00385 static int do_not_validate(Process_T p) {
00386
00387 ASSERT(p);
00388
00389 return(!p->do_validate ||
00390 check_skip(p) ||
00391 check_timeout(p) ||
00392 check_checksum(p));
00393
00394 }
00395
00396
00401 static int check_connection(Process_T p, Port_T pp, char *report) {
00402
00403 volatile int rv= TRUE;
00404
00405 ASSERT(p && pp);
00406
00407
00408 if(sigsetjmp(timeout, TRUE)) {
00409
00410 snprintf(report, STRLEN,
00411 "timed out when testing %s [%s]",
00412 pp->address, pp->protocol->name);
00413
00414 log("'%s' %s\n", p->name, report);
00415
00416 rv= FALSE;
00417 goto error;
00418
00419 }
00420
00421
00422 set_alarm_handler(connection_timeout);
00423 alarm(CHECK_TIMEOUT);
00424
00425
00426 if((pp->socket= create_generic_socket(pp)) < 0) {
00427
00428 snprintf(report, STRLEN,
00429 "does not accept connection at %s.",
00430 pp->address);
00431
00432 log("'%s' %s\n", p->name, report);
00433
00434 rv= FALSE;
00435 goto error;
00436
00437 } else {
00438
00439 if(Run.debug) {
00440
00441 log("'%s' succeeded connecting to %s\n",
00442 p->name, pp->address);
00443
00444 }
00445
00446 }
00447
00448
00449 if(!check_connection_io(pp)) {
00450
00451 snprintf(report, STRLEN,
00452 "socket at %s is not ready for i|o -- %s",
00453 pp->address, STRERROR);
00454
00455 log("'%s' %s\n", p->name, report);
00456
00457 rv= FALSE;
00458 goto error;
00459
00460 }
00461
00462 if(pp->ssl != NULL) {
00463
00464 if(!embed_ssl_socket(pp->ssl, pp->socket)) {
00465
00466 snprintf(report, STRLEN,
00467 "failed establish SSL communication on socket at %s",
00468 pp->address);
00469
00470 log("'%s' %s\n", p->name, report);
00471 rv= FALSE;
00472 goto error;
00473
00474 }
00475
00476 if(pp->certmd5 != NULL) {
00477
00478 if(! check_ssl_md5sum(pp->ssl, pp->certmd5)) {
00479
00480 snprintf(report, STRLEN,
00481 "md5sums of SSL certificates do not match at %s",
00482 pp->address);
00483
00484 log("'%s' %s\n", p->name, report);
00485 rv= FALSE;
00486 goto error;
00487
00488 }
00489
00490 }
00491
00492 }
00493
00494
00495 if(! pp->protocol->check(pp)) {
00496
00497 snprintf(report, STRLEN,
00498 "test with protocol [%s] failed at %s.",
00499 pp->protocol->name, pp->address);
00500
00501 log("'%s' %s\n", p->name, report);
00502
00503 rv= FALSE;
00504 goto error;
00505
00506 } else {
00507
00508 if(Run.debug) {
00509
00510 log("'%s' succeeded testing protocol [%s] at %s\n",
00511 p->name, pp->protocol->name, pp->address);
00512
00513 }
00514 }
00515
00516 error:
00517
00518 alarm(0);
00519
00520 if(pp->ssl != NULL) {
00521
00522 cleanup_ssl_socket(pp->ssl);
00523
00524 }
00525
00526 close_socket(pp->socket);
00527
00528 return rv;
00529
00530 }
00531
00532
00537 static int check_process_state(Process_T p, char *report) {
00538
00539 ProcInfo_T pi;
00540
00541 ASSERT(p);
00542
00543 pi= p->procinfo;
00544
00545 if(pi->status_flag & PROCESS_ZOMBIE) {
00546
00547 snprintf(report, STRLEN,
00548 "process with pid %d is a zombie\n", pi->pid);
00549
00550 log("'%s' %s\n", p->name, report);
00551
00552
00553
00554 LOCK(Run.mutex)
00555 p->do_validate= FALSE;
00556 END_LOCK;
00557
00558 return FALSE;
00559
00560 } else {
00561
00562 if(Run.debug) {
00563
00564 log("'%s' zombie check passed [status_flag=%04x]\n",
00565 p->name, pi->status_flag);
00566
00567 }
00568
00569 }
00570
00571 return TRUE;
00572
00573 }
00574
00575
00579 static void reset_resource_counter(Process_T p) {
00580
00581 Resource_T pr;
00582
00583 ASSERT(p);
00584
00585 for(pr= p->resourcelist; pr; pr= pr->next) {
00586
00587 pr->cycle=0;
00588
00589 }
00590
00591 }
00592
00593
00598 static int check_resources(Process_T p, Resource_T pr, char *report) {
00599
00600 ProcInfo_T pi;
00601 int okay= TRUE;
00602
00603 ASSERT(p);
00604 ASSERT(pr);
00605
00606 pi= p->procinfo;
00607
00608 switch(pr->resource_id) {
00609 case RESOURCE_ID_CPU_PERCENT:
00610 if(compare_value(pr->operator, pi->cpu_percent, pr->limit)) {
00611
00612 vlog(report, STRLEN, p,
00613 "cpu usage of %.1f%% matches resource limit [cpu usage%s%.1f%%]",
00614 pi->cpu_percent/10.0, operatorshortnames[pr->operator],
00615 pr->limit/10.0);
00616
00617 okay= FALSE;
00618
00619 } else {
00620
00621 if(Run.debug) {
00622
00623 log("'%s' cpu usage check passed [current cpu usage=%.1f%%]\n",
00624 p->name, pi->cpu_percent/10.0);
00625
00626 }
00627 }
00628 break;
00629
00630 case RESOURCE_ID_MEM_PERCENT:
00631 if(compare_value(pr->operator, pi->mem_percent, pr->limit)) {
00632
00633 vlog(report, STRLEN, p,
00634 "mem usage of %.1f%% matches resource limit [mem usage%s%.1f%%]",
00635 pi->mem_percent/10.0, operatorshortnames[pr->operator],
00636 pr->limit/10.0);
00637
00638 okay= FALSE;
00639
00640 } else {
00641
00642 if(Run.debug) {
00643
00644 log("'%s' mem usage check passed [current mem usage=%.1f%%]\n",
00645 p->name, pi->mem_percent/10.0);
00646
00647 }
00648 }
00649 break;
00650
00651 case RESOURCE_ID_MEM_KBYTE:
00652
00653 if(compare_value(pr->operator, pi->mem_kbyte, pr->limit)) {
00654
00655 vlog(report, STRLEN, p,
00656 "mem amount of %ldkB matches resource limit [mem amount%s%ldkB]",
00657 pi->mem_kbyte, operatorshortnames[pr->operator],
00658 pr->limit);
00659
00660 okay= FALSE;
00661
00662 } else {
00663
00664 if(Run.debug) {
00665
00666 log("'%s' mem amount check passed [current mem amount=%ldkB]\n",
00667 p->name, pi->mem_kbyte);
00668
00669 }
00670 }
00671 break;
00672
00673 case RESOURCE_ID_LOAD1:
00674
00675 if(compare_value(pr->operator, (int)(Run.loadavg[0]*10.0), pr->limit)) {
00676
00677 vlog(report, STRLEN, p,
00678 "loadavg(1min) of %.1f matches resource limit "
00679 "[loadavg(1min)%s%.1f]",
00680 Run.loadavg[0], operatorshortnames[pr->operator],
00681 pr->limit/10.0);
00682
00683 okay= FALSE;
00684
00685 } else {
00686
00687 if(Run.debug) {
00688
00689 log("'%s' loadavg(1min) check passed [current loadavg(1min)=%.1f]\n",
00690 p->name, Run.loadavg[0]);
00691
00692 }
00693 }
00694 break;
00695
00696 case RESOURCE_ID_LOAD5:
00697
00698 if(compare_value(pr->operator, (int)(Run.loadavg[1]*10.0), pr->limit)) {
00699
00700 vlog(report, STRLEN, p,
00701 "loadavg(5min) of %.1f matches resource limit "
00702 "[loadavg(5min)%s%.1f]",
00703 Run.loadavg[1], operatorshortnames[pr->operator],
00704 pr->limit/10.0);
00705
00706 okay= FALSE;
00707
00708 } else {
00709
00710 if(Run.debug) {
00711
00712 log("'%s' loadavg(5min) check passed [current loadavg(5min)=%.1f]\n",
00713 p->name, Run.loadavg[1]);
00714
00715 }
00716 }
00717 break;
00718
00719 case RESOURCE_ID_LOAD15:
00720
00721 if(compare_value(pr->operator, (int)(Run.loadavg[2]*10.0), pr->limit)) {
00722
00723 vlog(report, STRLEN, p,
00724 "loadavg(15min) of %.1f matches resource limit "
00725 "[loadavg(15min)%s%.1f]",
00726 Run.loadavg[2], operatorshortnames[pr->operator],
00727 pr->limit/10.0);
00728
00729 okay= FALSE;
00730
00731 } else {
00732
00733 if(Run.debug) {
00734
00735 log("'%s' loadavg(15min) check passed "
00736 "[current loadavg(15min)=%.1f]\n",
00737 p->name, Run.loadavg[2]);
00738
00739 }
00740 }
00741 break;
00742
00743 default:
00744
00745 log("'%s' error: unknow resource ID: [%d]\n", p->name, pr->resource_id);
00746
00747 }
00748
00749 if(okay && pr->cycle > 0) {
00750
00751 pr->cycle--;
00752
00753 } else if(! okay) {
00754
00755 pr->cycle++;
00756
00757 }
00758
00759 if(pr->cycle >= pr->max_cycle) {
00760
00761 return FALSE;
00762
00763 }
00764
00765 return TRUE;
00766
00767 }
00768
00769
00773 static int check_timeout(Process_T p) {
00774
00775 ASSERT(p);
00776
00777 if(!p->def_timeout) {
00778
00779 return FALSE;
00780
00781 }
00782
00783
00784
00785
00786 if(p->nstart > 0) {
00787
00788 p->ncycle++;
00789
00790 }
00791
00792
00793
00794
00795 if(p->nstart >= p->to_start && p->ncycle <= p->to_cycle) {
00796
00797
00798
00799
00800
00801
00802 LOCK(Run.mutex)
00803 p->do_validate= FALSE;
00804 END_LOCK;
00805
00806
00807
00808
00809 log("**Alert** process '%s' timed out and will not be checked anymore.\n",
00810 p->name);
00811
00812 smtp_alert_timeout(p, NULL);
00813
00814 return TRUE;
00815
00816 }
00817
00818
00819
00820
00821
00822 if(p->ncycle > p->to_cycle) {
00823
00824 p->ncycle= 0;
00825 p->nstart= 0;
00826
00827 }
00828
00829 return FALSE;
00830
00831 }
00832
00833
00838 static int check_skip(Process_T p) {
00839
00840 ASSERT(p);
00841
00842 if(!p->def_every) {
00843
00844 return FALSE;
00845
00846 }
00847
00848 if(++p->nevery < p->every) {
00849
00850 return TRUE;
00851
00852 }
00853
00854 p->nevery= 0;
00855
00856 return FALSE;
00857
00858 }
00859
00860
00868 static int check_checksum(Process_T p) {
00869
00870 Checksum_T c;
00871
00872 ASSERT(p);
00873
00874 if(!p->def_checksum) {
00875
00876 return FALSE;
00877
00878 }
00879
00880 for(c= p->checksumlist; c; c= c->next) {
00881
00882 if(! checksum_helper(p, c->file, c->md5)) {
00883
00884 return TRUE;
00885
00886 }
00887
00888 }
00889
00890 if(Run.debug) {
00891
00892 log("'%s' have valid checksums\n", p->name);
00893
00894 }
00895
00896 return FALSE;
00897
00898 }
00899
00900
00907 static int checksum_helper(Process_T p, char *program, char *sum) {
00908
00909 int rv= TRUE;
00910
00911 ASSERT(p);
00912
00913 if(program && sum) {
00914
00915 if(!check_md5(program, sum)) {
00916
00917 log("'%s' **Alert** checksum error for %s\n", p->name, program);
00918
00919 smtp_alert_checksum(p, NULL);
00920
00921 LOCK(Run.mutex)
00922 p->do_validate= FALSE;
00923 p->has_checksum_error= TRUE;
00924 END_LOCK;
00925
00926 rv= FALSE;
00927
00928 }
00929
00930 }
00931
00932 return rv;
00933
00934 }
00935
00936
00940 static int check_timestamp(Process_T p, Timestamp_T t, char *report) {
00941
00942 time_t now;
00943 time_t timestamp;
00944
00945 ASSERT(p);
00946 ASSERT(t);
00947
00948 if((int)time(&now) == -1) {
00949 vlog(report, STRLEN, p, "can't get actual time");
00950 return FALSE;
00951 }
00952
00953 if(!(timestamp= get_timestamp(t->pathname, S_IFDIR|S_IFREG))) {
00954 vlog(report, STRLEN, p, "can't get timestamp for %s", t->pathname);
00955 return FALSE;
00956 }
00957
00958 if(compare_value(t->operator, (int)(now - timestamp), t->time)) {
00959 vlog(report, STRLEN, p, "timestamp test failed for %s", t->pathname);
00960 return FALSE;
00961 }
00962
00963 if(Run.debug)
00964 log("'%s' timestamp test passed for %s\n", p->name, t->pathname);
00965
00966 return TRUE;
00967
00968 }
00969
00970
00974 static void connection_timeout(int sig) {
00975
00976 siglongjmp(timeout, TRUE);
00977
00978 }
00979
00980
00984 static void vlog(char * report, int n, Process_T p, char *m,...) {
00985
00986 va_list ap;
00987 char *tmp = NULL;
00988
00989 va_start(ap, m);
00990
00991 if(m) {
00992
00993 tmp=format(m,ap);
00994
00995 }
00996
00997 va_end(ap);
00998
00999 strncpy(report, tmp, n);
01000 log("'%s' %s\n", p->name, report);
01001
01002 free(tmp);
01003 }
01004
01005
01010 static int compare_value(int operator, int left, int right) {
01011
01012 switch(operator) {
01013
01014 case OPERATOR_GREATER:
01015
01016 if(left > right)
01017 return TRUE;
01018 break;
01019
01020 case OPERATOR_LESS:
01021
01022 if(left < right)
01023 return TRUE;
01024 break;
01025
01026 case OPERATOR_EQUAL:
01027
01028 if(left == right)
01029 return TRUE;
01030 break;
01031
01032 case OPERATOR_NOTEQUAL:
01033
01034 if(left != right)
01035 return TRUE;
01036 break;
01037
01038 default:
01039 error("Unknow comparison operator\n");
01040 return FALSE;
01041
01042 }
01043
01044 return FALSE;
01045
01046 }
01047