%CODE{"sh" num="1"}%
# SCRIPT: Cluster Monitoring Script
# Date..: Mon 20 Oct 2008 02:48:13 PM BRST
# Author: Jadir Marra da Silva<jadir.silva13@gmail.com>
#
#####################################################
# Jadir Silva:
# Mon 02 Nov 2008
# + acrescentado o teste para verificar o espaco no /scratch
# do nodes conforme sugerido por Sergio Lietti.
# Jadir Silva:
# Mon 03 Nov 2008 09:47:20 AM BRST
# + acrescentado uma definicão condicional para MAILTO,
# MAILSUBJECT e CARBON_COPY para evitar o envio de emails
# quando estiver depurando o script
# Jadir Silva:
# Mon 10 Nov 2008 11:21:26 AM BRST
# + acrescentado alerta via email para o site verify,
# Jadir Silva:
# Tue 11 Nov 2008 09:09:13 AM BRST
# + acrescentado comando para remover o arquivo temporario com
# o conteudo do email de alerta.
# Jadir Silva:
# Tue 11 Nov 2008 12:43:04 PM BRST
# + alterado o IDLE_THRESHOLD de 699 para 999
# seguindo orientacao do Lietti.
# Jadir Silva:
# Thu 27 Nov 2008 10:22:34 AM BRST
# + correcao de pequeno problema que impedia
# o envio de email com numero de nodes com o condor down.
# ---------------------------------------------------------------------
# Global settings used by the whole monitoring run.
# ---------------------------------------------------------------------
TIER="T2_BR_SPRACE"
SEND_MAIL="N"
GRID_USER="mdias"

# Temporary file that accumulates the report body.
MAIL_BODY=$(mktemp /tmp/site_verify.XXXXXXX)

# Mail envelope.
MAILFROM="root@osg-ce.sprace.org.br"
MAILTO="sprace_ops@googlegroups.com"
CARBON_COPY="jadir.silva13@gmail.com"
MAILSUBJECT="SPRACE - Monitoramento Automatico - $(date)"

STATUS_PAGE="/var/www/html/spracemon.html"
MAIN_SERVERS="acs.grid osgce.grid osgse.grid storage01.grid storage02.grid"

# Alert thresholds (each check alerts when the value is strictly greater).
LOAD_THRESHOLD=9
SERVERS_LOAD_THRESHOLD=14
POOL_THRESHOLD=90
IDLE_THRESHOLD=999

#GANGLIA_LINK='http://prod-frontend.hepgrid.uerj.br/ganglia/'
GANGLIA_LINK='http://osg-ce.sprace.org.br/ganglia'
DCACHE_URL="http://osg-se.sprace.org.br:2288"
#DCACHE_URL="http://cdfdca.fnal.gov:2288/cellInfo"

# Build a list with every node of the cluster from the Ganglia page
# (strip tags, the .grid suffix and the main servers) ...
NODE_LIST=$(links -source $GANGLIA_LINK | grep 'OPTION.*\.grid' | sed 's/<[^>]*>/ /g')
NODE_LIST=$(echo $NODE_LIST \
  | sed -e 's/\.grid//g' \
        -e 's/osgce//g;s/storage01//g;s/storage02//g;s/osgse//g;s/acs//g')
# ... which is then replaced by the static list kept on disk.
NODE_LIST=$(cat /root/bin/cluster.list)

PHEDEX_PROD_URL="http://cmsweb.cern.ch/phedex/prod/Components::Status"
PHEDEX_DEBG_URL="http://cmsweb.cern.ch/phedex/debug/Components::Status"

# Alert text accumulated by ReportToTeam; empty means "no alert".
MSG=""
FOOTER=" This is an automatic email, please do not reply Message send in $(date)"
# Header: append the fixed report preamble (credits and contact note) to the
# file named by the global MAIL_BODY.
# The body as it stands does not read its positional arguments.
function Header(){
  printf '%s\n' \
    '' \
    '' \
    ' ' \
    'Report generated by monitor.sh script, developed by Jadir Silva with support of Allan Szu ' \
    'and some suggestions from Sergio Lietti following steps defined by Marco Dias in ' \
    '[1].' \
    '' \
    'Obs.: This script still under development, if you have any opinion, ' \
    'contact me at jadir.silva13@gmail.com' \
    ' ' \
    '' >> $MAIL_BODY
}
# SendMail BODY_FILE FROM TO CC SUBJECT
# Pipes a set of mail headers followed by the contents of BODY_FILE into
# /usr/sbin/sendmail (-t: recipients are taken from the headers).
# The message is declared as text/html, 7bit, us-ascii.
# The heredoc ends right after the last header, so whatever BODY_FILE starts
# with directly follows "MIME-Version".
function SendMail(){
  local body_file=$1
  local sender=$2
  local recipient=$3
  local copy_to=$4
  local subject=$5

  cat - $body_file <<HERE | /usr/sbin/sendmail -oi -t
From: $sender
To: $recipient
Cc: $copy_to
Subject: $subject
Content-Type: text/html; charset=us-ascii
Content-Transfer-Encoding: 7bit
MIME-Version: 1.0
HERE
}
# SendMsgToTeam: mail the accumulated alert text to the operations team.
# Globals read:    MSG, MAILFROM, MAILTO, CARBON_COPY, MAILSUBJECT
# Globals written: ReportMail (path of the temporary message file);
#                  MAILSUBJECT, CARBON_COPY and MAILTO are overridden when the
#                  script runs under the name monitor_debug.sh, so debugging
#                  runs do not mail the team.
# Returns: the exit status of SendMail (1 if the temp file cannot be created).
SendMsgToTeam(){
  local send_status

  if [ "$(basename -- "$0")" = "monitor_debug.sh" ]; then
    MAILSUBJECT="SPRACE - Debug"
    CARBON_COPY="jadirmarra@yahoo.com.br"
    MAILTO="jadir.silva13@gmail.com"
  fi

  ReportMail=$(mktemp /tmp/ReportMail.XXXXXX) || return 1
  printf '%s\n' "$MSG" > "$ReportMail"

  SendMail "$ReportMail" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
  send_status=$?

  # SendMail has returned, so the file is no longer needed; without this
  # every run leaves one more file behind in /tmp.
  rm -f -- "$ReportMail"

  return "$send_status"
}
# ReportToTeam ALERT_TYPE DETAIL
# Appends an alert paragraph to the global MSG. The paragraph is selected by
# ALERT_TYPE ($1) and DETAIL ($2) is interpolated into its text.
# Arms visible here: idle, sam-error, dcache, dpool, server-load.
#
# NOTE(review): the text appears truncated inside the server-load arm: the
# string opened there is never closed, the case statement has no esac, and
# what follows looks like the tail of a different routine that extracts a
# JobRobot efficiency figure and uses $1 as a "days ago" count (presumably
# the JobRobotTest helper invoked later in the script -- TODO confirm and
# restore the missing lines from the original file).
# NOTE(review): in that tail, the test on POSITION compares with = "" and
# then uses POSITION as a line number; this looks like a != that lost its
# "!" -- confirm against the original.
# NOTE(review): callers elsewhere in the script pass alert types that have
# no arm in the visible part (node_down, node-load, site_verify,
# condor_down, old_jobs, low_disk, phedex_down, job_robot).
function ReportToTeam(){
case "$1" in
idle)
MSG="$MSG
Lot of Jobs in idle state
There are $2 in idle state on the farm. "
;;
sam-error)
MSG="$MSG
Error on SAM test
The SAM test presents some errors $2. "
;;
dcache)
MSG="$MSG
Error in some services of dcache
DCache have $2 stopped services. "
;;
dpool)
MSG="$MSG
Low space on some pools in dcache
DCache has some pools with $2 of space used. "
;;
server-load)
MSG="$MSG
T2_BR_SPRACE' | awk -F: '{print $1}'`
if [ "$POSITION" = "" ];then
SED_DATA="$POSITION,$((POSITION+5))p"
JOBROBOT1=`echo "$JOBROBOT1" | sed -n $SED_DATA | sed 's/<[^>]*>/ /g'`
EFICIENCIA=`echo $JOBROBOT1 | awk '{print $6}'`
if [ "$EFICIENCIA" == "" ];then
WriteLn "Efficiency : -- -- --"
else
if [ $EFICIENCIA -lt 60 ];then
if [ "$1" == "1" ];then
ReportToTeam "job_robot" "$EFICIENCIA"
fi
EFI="Efficiency : $EFICIENCIA% (below expected) (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
WriteLn "$EFI"
else
EFI="Efficiency : $EFICIENCIA% Ok (Test done at "`date '+%d/%m/%Y' -d "$1 day ago"`")."
WriteLn "$EFI"
fi
fi
else
WriteLn "Efficiency : -- -- --"
fi
}
# Initialise the OSG environment and start the report.
source /OSG/setup.sh
InitMail
#####################################################
# space reserved for tests -- do not type anything here
######################################################
# STEP 1. Find out which nodes are down.
# HDOWN becomes a "name | name |" list scraped from the Ganglia page;
# UP_LIST starts as the full node list and has every down node removed, so
# the later per-node checks only contact hosts that are up.
Header "Hosts down"
HDOWN=$(links -source $GANGLIA_LINK | grep 'class=down' | sed 's/<[^>]*>/ /g' | awk '{ print $1,"|" }')
HDOWN=$(echo $HDOWN | sed 's/\.grid//g')
UP_LIST=$NODE_LIST
OLD_IFS=$IFS
if [ -z "$HDOWN" ]; then
  WriteLn "No hosts down."
else
  IFS='|'
  for host in $HDOWN; do
    # Default IFS while the entry is trimmed and UP_LIST is filtered.
    IFS=$OLD_IFS
    NODE_NAME=$(echo $host | awk '{print $1}')
    if [ -z "$NODE_NAME" ]; then
      continue
    fi
    # Drop the node by exact name. A substring substitution would also
    # damage longer names that merely contain this one (node1 vs node10).
    still_up=""
    for candidate in $UP_LIST; do
      if [ "$candidate" != "$NODE_NAME" ]; then
        still_up="$still_up $candidate"
      fi
    done
    UP_LIST=$still_up
    # The output helpers are called with IFS='|', as before.
    IFS='|'
    Write "$NODE_NAME "
    Link "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$NODE_NAME.grid"
    Write " "
  done
  ReportToTeam "node_down" "$HDOWN"
fi
IFS=$OLD_IFS
# STEP 2. List the worker nodes whose integer load is above LOAD_THRESHOLD.
# The load is scraped from lines 82-84 of each node's Ganglia page; the first
# field is taken and its two-digit fraction removed.
Header "Hosts with load equal/above $((LOAD_THRESHOLD+1))"
NADA=0
LOADS=""
for a in $UP_LIST; do
  node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
  LOAD=$(echo $node_load | awk '{print $1}')
  LOAD=$(echo $LOAD | sed 's/\.[0-9][0-9]//g')
  # Skip nodes whose page gave no usable number instead of letting the
  # numeric test fail with a syntax error.
  case "$LOAD" in
    ''|*[!0-9]*) continue ;;
  esac
  if [ "$LOAD" -gt "$LOAD_THRESHOLD" ]; then
    WriteLn "$a load : $LOAD"
    LOADS="$LOADS $a(load=$LOAD)"
    NADA=1
  fi
done
if [ "$NADA" = 0 ]; then
  WriteLn "No host with load equal/above $((LOAD_THRESHOLD+1))."
else
  # Report the list of overloaded nodes collected above (LOADS), not the
  # alert buffer itself.
  ReportToTeam "node-load" "$LOADS"
fi
NADA=0
Header "Load of main servers $MAIN_SERVERS"
# STEP 3. List the main servers whose integer load is above
# SERVERS_LOAD_THRESHOLD. NADA is expected to be 0 on entry.
LOAD_NODE=""
for a in $MAIN_SERVERS; do
  # node_load=`links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g' | grep '[1-9][5-9]\.'`
  node_load=$(links -source "http://osg-ce.sprace.org.br/ganglia/?p=2&c=OSG-CE%20Cluster&h=$a" | sed -n '85q;82,84p' | sed 's/<[^>]*>/ /g')
  LOAD=$(echo $node_load | awk '{print $1}')
  LOAD=$(echo $LOAD | sed 's/\.[0-9][0-9]//g')
  # Skip servers whose page gave no usable number instead of letting the
  # numeric test fail with a syntax error.
  case "$LOAD" in
    ''|*[!0-9]*) continue ;;
  esac
  if [ "$LOAD" -gt "$SERVERS_LOAD_THRESHOLD" ]; then
    # Accumulate, so every overloaded server reaches the alert and not only
    # the last one found.
    LOAD_NODE="$LOAD_NODE $a(load=$LOAD)"
    WriteLn "$a load : $LOAD"
    NADA=1
  fi
done
if [ "$NADA" = 0 ]; then
  WriteLn "No host with load equal/above $((SERVERS_LOAD_THRESHOLD+1))."
  NADA=0
else
  ReportToTeam "server-load" "$LOAD_NODE"
fi
# STEP 4. Site verify.
# Runs OSG's site_verify.pl as the grid user and keeps only the output lines
# that contain " FAIL" (case-insensitive).
SITE_VERIFY=$(su - $GRID_USER -c "source /opt/osg-1.0.0/setup.sh;/opt/osg-1.0.0/verify/site_verify.pl" | grep -i " FAIL")
Header "Siteverify.pl status"
if [ -n "$SITE_VERIFY" ]; then
  WriteLn "Errors founded:"
  WriteLn "$SITE_VERIFY"
  ReportToTeam "site_verify" "$SITE_VERIFY"
else
  WriteLn "Site verify test: SUCCESS"
fi
# Check that condor is running on every node of UP_LIST.
# A node counts as stopped when "ps -fu condor" over ssh yields nothing
# besides the header line.
Header "Condor status"
CRUNNING=0
CSTOPPED=0
CONDOR_MSG=""
for node in $UP_LIST; do
  IS_RUNNING=$(ssh $node ps -fu condor | grep -v UID)
  if [ -n "$IS_RUNNING" ]; then
    CRUNNING=$((CRUNNING+1))
    continue
  fi
  WriteLn "$node with condor stopped."
  CSTOPPED=$((CSTOPPED+1))
  CONDOR_MSG="$CONDOR_MSG $node"
done
if [ "$CSTOPPED" != "0" ]; then
  ReportToTeam "condor_down" "$CONDOR_MSG"
  WriteLn "Condor running on $CRUNNING nodes and stopped on $CSTOPPED nodes."
else
  WriteLn "Condor running on all active nodes"
fi
# Job status: summarise the condor queue and alert on too many idle jobs.
Header "Job status"
# Keep the condor_q line(s) containing "running"; the counters are then
# picked by position: field 1 = total, 3 = idle, 5 = running, 7 = held.
JOBS_STATS=`condor_q | grep running`
IDLE_JOBS=`echo $JOBS_STATS | awk '{print $3}'`
TOTAL_JOBS=`echo $JOBS_STATS | awk '{print $1}'`
RUN_JOBS=`echo $JOBS_STATS | awk '{print $5}'`
HELD_JOBS=`echo $JOBS_STATS | awk '{print $7}'`
# NOTE(review): the next command name looks like several helper calls fused
# into one word (presumably OpenTable, OpenTbLine, OpenCell, WriteLn);
# the same applies to the CloseCell... lines below -- confirm against the
# original file.
OpenTableOpenTbLineOpenCellWriteLn "Running: $RUN_JOBS"
# NOTE(review): IDLE_JOBS is unquoted here; if condor_q returned nothing the
# test is malformed and the else branch is taken.
if [ $IDLE_JOBS -gt $IDLE_THRESHOLD ];then
WriteLn "Idle.......: $IDLE_JOBS Warning!!!"
ReportToTeam "idle" "$IDLE_JOBS"
else
WriteLn "Idle.......: $IDLE_JOBS"
fi
WriteLn "Held.......: $HELD_JOBS"
WriteLn "Total......: $TOTAL_JOBS"
CloseCellOpenCellWriteLn "  "
# NOTE(review): the address below differs from the MAILTO list configured at
# the top of the script -- confirm which one is current.
CloseCellOpenCellWriteLn "If has any job held or more than $((IDLE_THRESHOLD+1)) jobs in idle Please report to sprace_ops@yahoo.com.br"
CloseCellCloseTbLineCloseTable
# List running jobs whose run time has a day count of 2 or more.
# NOTE(review): assumes condor_q -run prints the run time as
# "<days>+HH:MM:SS" in a whitespace-separated column -- confirm.
Header "Jobs with more than 2 days on the farm"
# The pattern is quoted so the shell cannot glob-expand it, and anchored to
# the start of a field so that multi-digit day counts (10+, 11+, 20+ ...)
# match as well as 2+ .. 9+.
JOBS_RUNNING=$(condor_q -run | grep -E '(^|[[:space:]])([2-9]|[1-9][0-9]+)\+')
if [ -z "$JOBS_RUNNING" ]; then
  WriteLn "No jobs more than 2 days on the farm"
else
  MORE2DAYS=$(echo $JOBS_RUNNING | sed 's/\.grid/\.grid /g')
  WriteLn "$MORE2DAYS"
  ReportToTeam "old_jobs" "$MORE2DAYS"
fi
# Farm occupation: count the running jobs per value of the second column of
# condor_q -run, ignoring the lines that mention OWNER or Submitter, and drop
# the first line of the resulting tally.
Header "Farm occupation"
FarmOcupation=$(condor_q -run \
  | awk '!/OWNER/ && !/Submitter/ {print $2}' \
  | sort \
  | uniq -c \
  | sed 1d)
WriteLn "
$FarmOcupation
"
# SAM test: fetch the latest SAM results for the site, alert on error
# colours and append the results table to the report.
Header "SAM test"
# First request: the summary page. Keep the line(s) mentioning
# "latestresultssmrytable", cut everything before that word and drop the
# target="_blank"> fragment, leaving the relative link to the table.
SAM=`links -source "http://dashb-cms-sam.cern.ch/dashboard/request.py/latestresultssmry?siteSelect3=T2T1T0&serviceTypeSelect3=vo&sites=T2_BR_SPRACE&services=CE&services=SRMv2&tests=1301&tests=133&tests=111&tests=6&tests=1261&tests=76&tests=64&tests=20&tests=281&tests=882&exitStatus=all" | sed -e '/latestresultssmrytable/!d' | awk '{ print substr($0,index($0,"latestresultssmrytable")) }' | sed 's/target=\"\_blank\">//g'`
# Turn it into an absolute URL and strip the double quotes.
SAM_LINK='"http://dashb-cms-sam.cern.ch/dashboard/request.py/'$SAM
SAM_LINK=`echo $SAM_LINK | sed 's/\"//g'`
# Second request: the table itself, with /dashboard links made absolute.
SAM_TABLE=`links -source $SAM_LINK | sed 's/\/dashboard/http\:\/\/dashb-cms-sam\.cern\.ch\/dashboard/g'`
# Trim the page down to the fragment starting at the first "<div"
# (title/head, closing body/html and anything before "<link rel" removed).
# The unquoted echo flattens the page onto a single line.
SAM_TABLE=`echo $SAM_TABLE | sed 's/SAM-Latest Results<\/title><\/head>//g'`
SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<link rel")) }'`
SAM_TABLE=`echo $SAM_TABLE | sed 's/<\/body><\/html>'//g`
SAM_TABLE=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"<div"))}'`
#SAM_TABLE=`echo $SAM_TABLE | sed 's/Service Type/Tipo de Servico/g;s/Service Name/Nome do Servico/g;s/Sitename/Sitio/g'`
# Colour probes: each variable receives the 24 characters found at the
# position of the given CSS colour; it equals the probe text only when that
# colour occurs in the table.
SAM_RED1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF0000"),24) }' `
SAM_RED2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF6666"),24) }' `
SAM_RED3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF9999"),24) }' `
SAM_CRIT1_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#CC00CC"),24) }' `
SAM_CRIT2_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF33FF"),24) }' `
SAM_CRIT3_ERROR=`echo $SAM_TABLE | awk '{ print substr($0,index($0,"background-color:#FF99FF"),24) }' `
# One alert per colour that is present.
if [ "$SAM_RED1_ERROR" == "background-color:#FF0000" ];then
ReportToTeam "sam-error" "Dark red code"
fi
if [ "$SAM_RED2_ERROR" == "background-color:#FF6666" ];then
ReportToTeam "sam-error" "Light red code"
fi
if [ "$SAM_RED3_ERROR" == "background-color:#FF9999" ];then
ReportToTeam "sam-error" "Lightest red code"
fi
if [ "$SAM_CRIT1_ERROR" == "background-color:#CC00CC" ];then
ReportToTeam "sam-error" "Dark critical code"
fi
if [ "$SAM_CRIT2_ERROR" == "background-color:#FF33FF" ];then
ReportToTeam "sam-error" "Light critical code"
fi
if [ "$SAM_CRIT3_ERROR" == "background-color:#FF99FF" ];then
ReportToTeam "sam-error" "Lightest critical code"
fi
# The trimmed HTML table goes into the report as is.
echo "$SAM_TABLE" >> $MAIL_BODY
# Check the dCache services and the occupation of each pool.
Header "DCache status"
CELL_INFO=$(links -source $DCACHE_URL/cellinfo)
USAG_INFO=$(links -source $DCACHE_URL/usageInfo)
# Number of cell-info lines that mention "offline" once the tags are removed.
CELL_STAT=$(echo "$CELL_INFO" | sed 's/<[^>]*>/ /g' | grep -i -c offline)
USAG_STAT=$(echo "$USAG_INFO" | sed 's/<[^>]*>/ /g')
if [ "$CELL_STAT" -gt 0 ]; then
  WriteLn "$CELL_STAT dcache services are stopped"
  ReportToTeam "dcache" "$CELL_STAT"
else
  WriteLn "All dcache services(daemons) ok. "
fi
LINE=""
TOTAL=0
FREE=0
REPORTAR=""
# Each input line is expected as "<kind> <value>" with kind one of
# cell / total / free / precious; a "precious" line closes one pool.
# The loop reads from a process substitution instead of sitting at the end
# of a pipeline, so REPORTAR, set inside the loop, is still set afterwards.
# NOTE(review): the first sed substitution below has a line break where its
# pattern should be -- the original pattern was probably lost; sed is likely
# to reject the expression as written. Restore it from the original file.
while read -r celula valor; do
  case "$celula" in
    cell)
      LINE="$LINE $valor"
      ;;
    total)
      TOTAL="$valor"
      ;;
    free)
      FREE="$valor"
      ;;
    precious)
      # Used percentage, integer division (bc scale=0).
      PERCENT=$(echo "scale=0; 100-((100*$FREE)/$TOTAL)" | bc)
      if [ "$PERCENT" -gt "$POOL_THRESHOLD" ]; then
        REPORTAR="$REPORTAR $LINE with $PERCENT occupation"
      fi
      PERCENT="$PERCENT%"
      WriteLn "$LINE with $PERCENT ocuppation"
      LINE=""
      ;;
  esac
done < <(echo "$USAG_INFO" | egrep '(cell|total|free|precious)' | grep '<td' | grep -v 'layout' | sed 's/
/ /g;s/<\/td>//g')
# Alert only when at least one pool is above POOL_THRESHOLD.
if [ -n "$REPORTAR" ]; then
  ReportToTeam "dpool" "$REPORTAR"
fi
#######################
# Lietti suggested adding the space used by /scratch on the nodes to the
# report, so the code below was written to do that.
# Nodes with less than 8388608 1K-blocks (8 GB) available are listed;
# the local host is only shown through "df -h".
x=0
Header "Ocuppation of /scratch on nodes"
WriteLn "
Only nodes less than 8Gb.
"
IFS=$OLD_IFS
LOW_DISK_NODES=""
nodes_to_save=""
this_host=$(/bin/hostname -s)
for node in $UP_LIST; do
  if [ "$node" = "$this_host" ]; then
    df -h /scratch
    continue
  fi
  saida=$(ssh $node "df /scratch")
  # The df output is flattened onto one line: after the 7-word header,
  # word 11 is the "Available" column and word 12 the "Use%" column.
  size=$(echo $saida | awk '{print $11}')
  perc=$(echo $saida | awk '{print $12}')
  # An unreachable node or unexpected df layout gives no number: skip it
  # instead of letting the numeric tests fail.
  case "$size" in
    ''|*[!0-9]*) continue ;;
  esac
  if [ "$size" -ge 8388608 ]; then
    continue
  fi
  x=$((x+1))
  if [ "$size" -lt 1048576 ]; then
    node_disk_space="${node}($((size/1024))Mb) "
  else
    node_disk_space="${node}($((size/1048576))Gb) "
  fi
  WriteLn "$node_disk_space"
  LOW_DISK_NODES="$LOW_DISK_NODES $node_disk_space "
  nodes_to_save="$nodes_to_save $node"
done
# Alert and save the node list only when some node is actually low on space.
if [ -n "$LOW_DISK_NODES" ]; then
  ReportToTeam "low_disk" "$LOW_DISK_NODES"
  echo "$nodes_to_save" > /tmp/nodes_full.txt
else
  WriteLn "No node with low space on /scratch"
fi
WriteLn ""
# JobRobot section: run JobRobotTest once for each of the arguments 1 to 6,
# in ascending order.
Header "JobRobot Status"
for days_back in 1 2 3 4 5 6; do
  JobRobotTest $days_back
done
# Experimental section, executed only when DEBUG equals "monitor_debug.sh".
# NOTE(review): DEBUG is not assigned anywhere in the visible part of the
# script -- confirm where it is expected to come from.
if [ "$DEBUG" == "monitor_debug.sh" ];then
# test or unstable code must stay here ####################
Header "CEMon Status"
# Query the information system over LDAP and the ReSS condor pool (via
# node34) for the entries of this site.
ldap=`ldapsearch -x -LLL -p 2170 -h is.grid.iu.edu -b mds-vo-name=SPRACE,mds-vo-name=local,o=grid`
cods=`ssh node34 "source /OSG/setup.sh;condor_status -pool osg-ress-1.fnal.gov -l -constraint 'GlueCEInfoHostName == \"osg-ce.sprace.org.br\"'"`
# NOTE(review): grep is called without a pattern here -- the line looks
# unfinished or truncated.
LDAP_STATUS=`echo $ldap | grep `
# end of the test area ###########################################
fi
# Status of the PhEDEx agents (production and debug instances).
# For each instance: find the first line of the status page that mentions
# TIER, take the two lines that follow it, and alert when they contain DOWN.
Header "Phedex Agents Status"
PRODUCTION_STATUS=$(links -source $PHEDEX_PROD_URL)
DEBUG_STATUS=$(links -source $PHEDEX_DEBG_URL)
# NR==1: keep a single line number even if TIER appears more than once,
# otherwise the arithmetic below would fail.
PROD_POSITION=$(echo "$PRODUCTION_STATUS" | grep -n "$TIER" | awk -F: 'NR==1 {print $1}')
DEBG_POSITION=$(echo "$DEBUG_STATUS" | grep -n "$TIER" | awk -F: 'NR==1 {print $1}')
TBL_CODE=""
TBL_CODE2=""
# Only look at the page when the site was found in it.
if [ -n "$PROD_POSITION" ]; then
  SED_DATA="$((PROD_POSITION+1)),$((PROD_POSITION+2))p"
  PROD_CODE=$(echo "$PRODUCTION_STATUS" | sed -n "$SED_DATA" | sed 's/<[^>]*>/ /g')
  TBL_CODE=$(echo "$PRODUCTION_STATUS" | sed -n "$SED_DATA")
  IS_AGENT_DOWN=$(echo "$PROD_CODE" | grep DOWN)
  # A non-empty match means at least one agent is reported as DOWN.
  if [ -n "$IS_AGENT_DOWN" ]; then
    ReportToTeam "phedex_down" "Phedex(production) agent down"
    WriteLn "Phedex(production) agents down."
  else
    WriteLn "Phedex(production) agents are OK."
  fi
fi
if [ -n "$DEBG_POSITION" ]; then
  SED_DATA="$((DEBG_POSITION+1)),$((DEBG_POSITION+2))p"
  DEBG_CODE=$(echo "$DEBUG_STATUS" | sed -n "$SED_DATA" | sed 's/<[^>]*>/ /g')
  TBL_CODE2=$(echo "$DEBUG_STATUS" | sed -n "$SED_DATA")
  IS_AGENT_DOWN=$(echo "$DEBG_CODE" | grep DOWN)
  if [ -n "$IS_AGENT_DOWN" ]; then
    ReportToTeam "phedex_down" "Phedex(debug) agent down"
    WriteLn "Phedex(debug) agents down."
  else
    WriteLn "Phedex(debug) agents are OK."
  fi
fi
# Raw HTML rows of both instances go into the report.
Write "
Production Agents
$TBL_CODE
Debug Agents
$TBL_CODE2
"
# NOTE(review): the next command name looks like two helper calls fused into
# one word (presumably CloseMail and WriteStatusPage) -- confirm against the
# original file.
CloseMailWriteStatusPage
# If any alert was collected, send it to the cluster administrators list.
# MSG is non-empty exactly when ReportToTeam recorded something.
if [ -n "$MSG" ]; then
  MSG="$MSG $FOOTER"
  SendMsgToTeam
fi
# Optionally mail the full report as well.
if [ "$SEND_MAIL" = "Y" ]; then
  SendMail "$MAIL_BODY" "$MAILFROM" "$MAILTO" "$CARBON_COPY" "$MAILSUBJECT"
fi
# Remove the temporary report body.
rm -fv -- "$MAIL_BODY"
%ENDCODE%
Teste
-- JadirSilva - 26 Sep 2008
Outra pagina para teste