validate.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C), 2000-2002 by Contributors to the monit codebase. 
00003  * All Rights Reserved.
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License as
00007  * published by the Free Software Foundation; either version 2 of the
00008  * License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful, but
00011  * WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  * General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software Foundation,
00017  * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00018  */
00019 
00020 #include <config.h>
00021 
00022 #include <stdio.h>
00023 #include <errno.h>
00024 #include <signal.h>
00025 #include <stdarg.h>
00026 #include <stdlib.h>
00027 #include <setjmp.h>
00028 #include <sys/socket.h>
00029 
00030 #ifdef HAVE_STRING_H
00031 #include <string.h>
00032 #endif
00033 
00034 #ifdef HAVE_UNISTD_H
00035 #include <unistd.h>
00036 #endif
00037 
00038 #include "monitor.h"
00039 #include "alert.h"
00040 #include "net.h"
00041 #include "monit_process.h"
00042 
00043 /* Private variables */
00044 static sigjmp_buf timeout;
00045 
00046 /* Private Prototypes */
00047 static void do_validate(Process_T);
00048 static void do_restart(Process_T, char *, ...) ;
00049 static void do_start(Process_T, char *, ...) ;
00050 static void do_stop(Process_T, char *, ...);
00051 static int  do_not_validate(Process_T);
00052 static int  check_connection(Process_T, Port_T, char *);
00053 static int  check_resources(Process_T, Resource_T, char *);
00054 static int  check_process_state(Process_T, char *);
00055 static int  check_skip(Process_T);
00056 static int  check_timeout(Process_T);
00057 static int  check_checksum(Process_T);
00058 static int  checksum_helper(Process_T, char *, char *);
00059 static void connection_timeout(int);
00060 static void reset_resource_counter(Process_T);
00061 static void vlog(char * report, int n, Process_T p, char *m,...);
00062 
00080 /* ---------------------------------------------------------------- Public */
00081 
00082 
00086 void validate() {
00087   
00088   Process_T p;
00089 
00090   if ( ! update_loadavg() ) {
00091 
00092     log("Update of loadavg has failed!\n");
00093 
00094   }
00095  
00096   for (p= processlist; p; p= p->next) {
00097     
00098     do_validate(p);
00099     
00100   }
00101   
00102 }
00103 
00104 
00105 /* --------------------------------------------------------------- Private */
00106 
00107 
00116 static void do_validate(Process_T p) {
00117   
00118   Port_T pp;
00119   Resource_T pr;
00120   pid_t  pid= -1;
00121   sigset_t ns,os;
00122   char report[STRLEN];
00123 
00124 
00125   /* First, check for pre-conditions */
00126   if (do_not_validate(p)) return;
00127   
00128   /* Set a SIGTERM block */
00129   sigemptyset(&ns);
00130   sigaddset(&ns, SIGTERM);
00131   pthread_sigmask(SIG_BLOCK, &ns, &os);
00132 
00133   /* Test for running process */
00134   if (!(pid= is_process_running(p))) {
00135     
00136     log("'%s' is not running.\n", p->name);
00137     do_start(p, "Reason: Process is not running.");
00138     goto reinstall;
00139     
00140   } else {
00141     
00142     if (Run.debug) {
00143       
00144       log("'%s' is running with pid %d\n", p->name, (int)pid);
00145 
00146     }
00147     
00148   }
00149 
00150   if ( Run.doprocess ) {
00151 
00152     if ( update_process_data(p, pid) ) {
00153   
00154       if ( ! check_process_state(p, report)) {
00155 
00156     smtp_alert_resource(p, "Reason: %s\n", report);
00157     
00158       } else {
00159       
00160     if ( Run.debug ) {
00161     
00162       log("'%s' check_process_state() passed.\n", p->name);
00163     
00164     }
00165       
00166       }
00167     
00168       for( pr= p->resourcelist; pr; pr= pr->next ) {
00169       
00170     if ( !check_resources(p, pr, report) ) {
00171     
00172       switch (pr->action) {
00173       case RESOURCE_ACTION_ALERT:
00174         smtp_alert_resource(p, "Reason: %s\n", report);
00175 
00176         /* We are also interested in other alerts/stops/restarts! */
00177         pr->cycle=0;
00178         break;
00179 
00180       case RESOURCE_ACTION_STOP:
00181 
00182         do_stop(p, "Reason: %s\n", report);
00183         reset_resource_counter(p);
00184         goto reinstall;
00185 
00186       case RESOURCE_ACTION_RESTART:
00187 
00188         do_restart(p, "Reason: %s\n", report);
00189         reset_resource_counter(p);
00190         goto reinstall;
00191 
00192       default:
00193 
00194         log("'%s' Unknow resource failure action.\n", p->name);
00195         break;
00196 
00197         /* Internal Alert ??? */
00198 
00199       }
00200 
00201     }   
00202       
00203       }
00204     
00205     } else {
00206 
00207       log("'%s' failed to get process data\n", p->name);
00208 
00209       /* Internal Alert ??? */
00210 
00211     }
00212 
00213   }
00214 
00215   /* Test each host:port and protocol in the process's portlist */
00216   for(pp= p->portlist; pp; pp= pp->next) {
00217     
00218     if (!check_connection(p, pp, report)) {
00219       
00220       do_restart(p, "Reason: %s\n", report);
00221       goto reinstall;
00222       
00223     }
00224     
00225   }
00226 
00227   reinstall:
00228   
00229   /* Remove the SIGTERM block */
00230   pthread_sigmask(SIG_SETMASK, &os, NULL);
00231  
00232 }
00233 
00234 
00239 static void do_restart(Process_T p, char *m, ...) {
00240 
00241   va_list ap;
00242   char *tmp = NULL;
00243 
00244   if(!p->start || !p->stop || !p->do_validate) {
00245   
00246     return;
00247     
00248   }
00249 
00250   log("Trying to restart '%s'\n", p->name);
00251   
00252   stop_process(p);
00253   sleep(10); /* Wait for process termination */
00254 
00255   va_start(ap, m);
00256 
00257   if (m) { 
00258 
00259     tmp=format(m,ap);
00260 
00261   }
00262 
00263   va_end(ap);
00264   
00265   
00266   do_start(p, "%s", tmp );
00267   free(tmp);
00268 
00269 }
00270 
00271 
00276 static void do_start(Process_T p, char *m, ...) {
00277 
00278   va_list ap;
00279   char *tmp = NULL;
00280 
00281   if (!p->start || !p->do_validate) {
00282     
00283     return;
00284     
00285   }
00286 
00287   if (p->def_timeout) {
00288     
00289       p->nstart++;
00290 
00291   }
00292   
00293   start_process(p);
00294   
00295 
00296   /* (Re)start alert */
00297     
00298   sleep(3); /* In case we just started our own smtp server */
00299 
00300   va_start(ap, m);
00301 
00302   if (m) { 
00303 
00304     tmp=format(m,ap);
00305 
00306   }
00307 
00308   va_end(ap);
00309   
00310   smtp_alert_restart(p, "%s", tmp);
00311 
00312   free(tmp);
00313   
00314 }
00315 
00316 
00321 static void do_stop(Process_T p, char *m, ...) {
00322 
00323   va_list ap;
00324   char *tmp = NULL;
00325 
00326   if ( !p->stop || !p->do_validate ) {
00327     
00328     return;
00329     
00330   }
00331 
00332   LOCK(Run.mutex)
00333     p->do_validate= FALSE;
00334   END_LOCK;
00335 
00336   /* Stop alert */
00337   va_start(ap, m);
00338 
00339   if (m) { 
00340 
00341     tmp=format(m,ap);
00342 
00343   }
00344 
00345   va_end(ap);
00346   
00347   
00348   smtp_alert_stop(p, "%s", tmp);
00349 
00350   free(tmp);
00351 
00352   sleep(3); /* In case we gonna stop our own smtp server */
00353 
00354   stop_process(p);
00355     
00356 }
00357 
00358 
00363 static int do_not_validate(Process_T p) {
00364   
00365   return (!p->do_validate || check_skip(p) || check_timeout(p) ||
00366        check_checksum(p));
00367 
00368 }
00369 
00370 
00375 static int check_connection(Process_T p, Port_T pp, char *report) {
00376 
00377   volatile int rv= TRUE;
00378 
00379   /* Control comes here if a timeout occures */
00380   if (sigsetjmp(timeout, TRUE)) {
00381     
00382     snprintf(report, STRLEN, 
00383          "timed out when testing %s [%s]",
00384          pp->address, pp->protocol->name);
00385     
00386     log("'%s' %s\n", p->name, report); 
00387 
00388     rv= FALSE;
00389     goto error;
00390     
00391   }
00392 
00393   /* Set a timeout handler and activate the timer */
00394   set_alarm_handler(connection_timeout);
00395   alarm(CHECK_TIMEOUT);
00396 
00397   /* Open a socket to the destination INET[hostname:port] or UNIX[pathname] */
00398 
00399   if ((pp->socket= create_generic_socket(pp)) < 0) {
00400       
00401     snprintf(report, STRLEN, 
00402          "does not accept connection at %s.",
00403          pp->address);
00404 
00405     log("'%s' %s\n", p->name, report); 
00406 
00407     rv= FALSE;
00408     goto error;
00409     
00410   } else {
00411     
00412     if (Run.debug) {
00413       
00414       log("'%s' succeeded connecting to %s\n",
00415           p->name, pp->address);
00416       
00417     }
00418     
00419   }
00420 
00421   /* Verify that the socket is ready for i|o */
00422   if (!check_connection_io(pp)) {
00423     
00424     snprintf(report, STRLEN, 
00425          "socket at %s is not ready for i|o -- %s",
00426          pp->address, STRERROR);
00427 
00428     log("'%s' %s\n", p->name, report); 
00429 
00430     rv= FALSE;
00431     goto error;
00432 
00433   }
00434 
00435   /* Run the protocol verification routine through the socket */
00436   if (! pp->protocol->check(pp)) {
00437     
00438     snprintf(report, STRLEN, 
00439          "test with protocol [%s] failed at %s.",
00440          pp->protocol->name, pp->address);
00441 
00442     log("'%s' %s\n", p->name, report); 
00443     
00444     rv= FALSE;
00445     goto error;
00446     
00447   } else {
00448     
00449     if (Run.debug) {
00450       
00451       log("'%s' succeeded testing protocol [%s] at %s\n",
00452           p->name, pp->protocol->name, pp->address);
00453 
00454     }
00455   }
00456 
00457   error:
00458   alarm(0);
00459   close_socket(pp->socket);
00460   
00461   return rv;
00462       
00463 }
00464 
00465 
00470 static int check_process_state(Process_T p, char *report) {
00471 
00472   ProcInfo_T pi= p->procinfo;
00473 
00474   if ( pi->status_flag & PROCESS_ZOMBIE ) {
00475 
00476     snprintf(report, STRLEN, 
00477          "process with pid %d is a zombie\n", pi->pid);
00478 
00479     log("'%s' %s\n", p->name, report); 
00480 
00481     return FALSE;
00482 
00483   } else {
00484 
00485     if ( Run.debug ) {
00486       
00487       log("'%s' zombie check passed [status_flag=%04x]\n",
00488       p->name,  pi->status_flag);
00489       
00490     }
00491 
00492   }
00493 
00494   return TRUE;
00495 
00496 }
00497 
00498 
00502 static void reset_resource_counter(Process_T p) {
00503   Resource_T pr;
00504   
00505   for( pr= p->resourcelist; pr; pr= pr->next ) {
00506 
00507     pr->cycle=0;
00508 
00509   }
00510 
00511 }
00512 
00513 
00518 static int check_resources(Process_T p, Resource_T pr, char *report) {
00519 
00520   ProcInfo_T pi= p->procinfo;
00521   int okay= TRUE;
00522 
00523   switch (pr->resource_id) {
00524   case RESOURCE_ID_CPU_PERCENT: 
00525     if ( compare_resource(pi->cpu_percent, pr) ) {
00526 
00527       vlog(report, STRLEN, p, 
00528        "cpu usage of %.1f%% matches resource limit [cpu usage%s%.1f%%]",
00529        pi->cpu_percent/10.0, operatorshortnames[pr->operator],
00530        pr->limit/10.0);
00531 
00532       okay= FALSE;
00533 
00534     } else {
00535 
00536       if ( Run.debug ) {
00537       
00538     log("'%s' cpu usage check passed [current cpu usage=%.1f%%]\n", 
00539         p->name, pi->cpu_percent/10.0);
00540 
00541       }
00542     }
00543     break;
00544 
00545   case RESOURCE_ID_MEM_PERCENT: 
00546     if ( compare_resource(pi->mem_percent, pr) ) {
00547 
00548       vlog(report, STRLEN, p, 
00549        "mem usage of %.1f%% matches resource limit [mem usage%s%.1f%%]",
00550        pi->mem_percent/10.0, operatorshortnames[pr->operator],
00551        pr->limit/10.0);      
00552 
00553       okay= FALSE;
00554 
00555     } else {
00556       
00557       if ( Run.debug ) {
00558     
00559     log("'%s' mem usage check passed [current mem usage=%.1f%%]\n", 
00560         p->name, pi->mem_percent/10.0);
00561     
00562       }
00563     }
00564     break;
00565 
00566   case RESOURCE_ID_MEM_KBYTE: 
00567 
00568     if ( compare_resource(pi->mem_kbyte, pr) ) {
00569 
00570       vlog(report, STRLEN, p, 
00571        "mem amount of %ldkB matches resource limit [mem amount%s%ldkB]", 
00572        pi->mem_kbyte, operatorshortnames[pr->operator],
00573        pr->limit);      
00574       
00575       okay= FALSE;
00576     
00577     } else {
00578 
00579       if ( Run.debug ) {
00580     
00581     log("'%s' mem amount check passed [current mem amount=%ldkB]\n", 
00582         p->name, pi->mem_kbyte);
00583       
00584       }
00585     }
00586     break;
00587 
00588   case RESOURCE_ID_LOAD1: 
00589 
00590     if ( compare_resource((int) (Run.loadavg[0]*10.0), pr) ) {
00591 
00592       vlog(report, STRLEN, p, 
00593        "loadavg(1min) of %.1f matches resource limit "
00594        "[loadavg(1min)%s%.1f]", 
00595        Run.loadavg[0], operatorshortnames[pr->operator],
00596        pr->limit/10.0);      
00597       
00598       okay= FALSE;
00599     
00600     } else {
00601 
00602       if ( Run.debug ) {
00603     
00604     log("'%s' loadavg(1min) check passed [current loadavg(1min)=%.1f]\n", 
00605         p->name, Run.loadavg[0] );
00606       
00607       }
00608     }
00609     break;
00610 
00611   case RESOURCE_ID_LOAD5: 
00612 
00613     if ( compare_resource((int) (Run.loadavg[1]*10.0), pr) ) {
00614 
00615       vlog(report, STRLEN, p, 
00616        "loadavg(5min) of %.1f matches resource limit "
00617        "[loadavg(5min)%s%.1f]", 
00618        Run.loadavg[1], operatorshortnames[pr->operator],
00619        pr->limit/10.0);      
00620       
00621       okay= FALSE;
00622     
00623     } else {
00624 
00625       if ( Run.debug ) {
00626     
00627     log("'%s' loadavg(5min) check passed [current loadavg(5min)=%.1f]\n", 
00628         p->name, Run.loadavg[1]);
00629       
00630       }
00631     }
00632     break;
00633 
00634   case RESOURCE_ID_LOAD15: 
00635 
00636     if ( compare_resource((int) (Run.loadavg[2]*10.0), pr) ) {
00637 
00638       vlog(report, STRLEN, p, 
00639        "loadavg(15min) of %.1f matches resource limit "
00640        "[loadavg(15min)%s%.1f]", 
00641        Run.loadavg[2], operatorshortnames[pr->operator],
00642        pr->limit/10.0);      
00643       
00644       okay= FALSE;
00645     
00646     } else {
00647 
00648       if ( Run.debug ) {
00649     
00650     log("'%s' loadavg(15min) check passed "
00651         "[current loadavg(15min)=%.1f]\n", 
00652         p->name, Run.loadavg[2]);
00653       
00654       }
00655     }
00656     break;
00657 
00658   default:
00659 
00660     log("'%s' error: unknow resource ID: [%d]\n", p->name, pr->resource_id);
00661 
00662     /* Internal Alert ??? */
00663 
00664   }
00665 
00666   if ( okay && pr->cycle > 0 ) {
00667 
00668     pr->cycle--;
00669 
00670   } else if ( ! okay ) {
00671 
00672     pr->cycle++;
00673 
00674   }
00675 
00676   if ( pr->cycle >= pr->max_cycle ) {
00677 
00678     return FALSE;
00679 
00680   }
00681     
00682   return TRUE;
00683 }
00684 
00685 
00689 static int check_timeout(Process_T p) {
00690   
00691   if (!p->def_timeout) {
00692 
00693   return FALSE;
00694 
00695   }
00696   
00697   /*
00698    * Start counting cycles
00699    */
00700   if (p->nstart > 0) {
00701     
00702     p->ncycle++;
00703     
00704   }
00705   
00706   /*
00707    * Check timeout
00708    */
00709   if (p->nstart >= p->to_start && p->ncycle <= p->to_cycle) {
00710     
00711     /*
00712      * Got timeout - Set do not validate flag. This statement is
00713      * synchronized since the same data can be set from the http
00714      * thread
00715      */
00716     LOCK(Run.mutex)
00717     p->do_validate= FALSE;
00718     END_LOCK;
00719     
00720     /*
00721      * Log and notify the user that a timeout occured
00722      */
00723     log("**Alert** process '%s' timed out and will not be checked anymore.\n",
00724     p->name);
00725       
00726     smtp_alert_timeout(p, NULL);
00727     
00728     return TRUE;
00729     
00730   }
00731   
00732   /*
00733    * Stop counting and reset if the
00734    * cycle interval is passed
00735    */
00736   if (p->ncycle > p->to_cycle) {
00737     
00738     p->ncycle= 0;
00739     p->nstart= 0;
00740     
00741   }
00742 
00743   return FALSE;
00744   
00745 }
00746 
00747 
00752 static int check_skip(Process_T p) {
00753 
00754   if (!p->def_every) {
00755 
00756     return FALSE;
00757 
00758   }
00759   
00760   if (++p->nevery < p->every) {
00761     
00762       return TRUE;
00763 
00764   }
00765   
00766   p->nevery= 0;
00767 
00768   return FALSE;
00769 
00770 }
00771 
00772 
00780 static int check_checksum(Process_T p) {
00781 
00782   Checksum_T c;
00783 
00784   if (!p->def_checksum) {
00785 
00786     return FALSE;
00787 
00788   }
00789 
00790   for (c= p->checksumlist; c; c= c->next) {
00791     
00792     if (! checksum_helper(p, c->file, c->md5)) {
00793     
00794       return TRUE;
00795 
00796     }
00797     
00798   }
00799   
00800   if (Run.debug) {
00801     
00802     log("'%s' have valid checksums\n", p->name);
00803 
00804   }
00805   
00806   return FALSE;
00807 
00808 }
00809 
00810 
00817 static int checksum_helper(Process_T p, char *program, char *sum) {
00818 
00819   int rv= TRUE;
00820 
00821   if(program && sum) {
00822     
00823     if(!check_md5(program, sum)) {
00824       
00825       log("'%s' **Alert** checksum error for %s\n", p->name, program);
00826       
00827       smtp_alert_checksum(p, NULL);
00828       
00829       LOCK(Run.mutex)
00830       p->do_validate= FALSE;
00831       p->has_checksum_error= TRUE;
00832       END_LOCK;
00833       
00834       rv= FALSE;
00835       
00836     }
00837     
00838   }
00839     
00840   return rv;
00841 
00842 }
00843 
00844 
00848 static void connection_timeout(int sig) {
00849   
00850   siglongjmp(timeout, TRUE);
00851   
00852 }
00853 
00854 
00858 static void vlog(char * report, int n, Process_T p, char *m,...) {
00859 
00860   va_list ap;
00861   char *tmp = NULL;
00862 
00863   va_start(ap, m);
00864 
00865   if (m) { 
00866 
00867     tmp=format(m,ap);
00868 
00869   }
00870 
00871   va_end(ap);
00872 
00873   strncpy(report, tmp, n);
00874   log("'%s' %s\n", p->name, report); 
00875 
00876   free(tmp);
00877 }