PWGPP/QA/scripts/runQA.sh

   1 #!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig $@; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init
  24   fi
  25
  26   updateQA $@
  27 }
  28
  29 updateQA()
  30 {
  31   umask 0002
  32   parseConfig $@
  33
  34   #be paranoid and make some full paths
  35   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  36   inputList=$(get_realpath ${inputList})
  37   mkdir -p ${workingDirectory}
  38   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  39   if [[ ! -d ${workingDirectory} ]]; then
  40     echo "working dir $workingDirectory does not exist and cannot be created"
  41     return 1
  42   fi
  43   cd ${workingDirectory}
  44
  45   echo JOB config:
  46   echo inputList=$inputList
  47   echo outputDirectory=$outputDirectory
  48   echo
  49
  50   dateString=$(date +%Y-%m-%d-%H-%M)
  51   echo "Start time QA process: $dateString"
  52
  53   #logging
  54   mkdir -p $logDirectory
  55   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  56   logFile="$logDirectory/${0##*/}.${dateString}.log"
  57   touch ${logFile}
  58   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  59   echo "logFile = $logFile"
  60
  61   #check lock
  62   lockFile=${logDirectory}/runQA.lock
  63   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee ${logFile} && return 1
  64   touch ${lockFile}
  65   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" | tee ${logFile} && return 1
  66
  67   exec &>${logFile}
  68
  69   ################################################################
  70   #ze detector loop
  71   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  72     echo
  73     echo "##############################################"
  74     unset planB
  75     [[ ! ${detectorScript} =~ .*\.sh$ ]] && continue
  76     detector=${detectorScript%.sh}
  77     detector=${detector##*/}
  78
  79     #skip if excluded
  80     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  81       echo "${detector} is excluded in config, skipping..."
  82       continue
  83     fi
  84
  85     #if includeDetectors set, only process thoe detectors specified there
  86     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  87       echo "${detector} not included in includeDetectors, skipping..."
  88       continue
  89     fi
  90
  91     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  92     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  93     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
  94     if ! mkdir -p ${tmpDetectorRunDir}; then
  95       echo "cannot create the temp dir $tmpDetectorRunDir"
  96       continue
  97     fi
  98     cd ${tmpDetectorRunDir}
  99
 100     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
 101     echo "running QA for ${detector}"
 102     echo "  outputDir=$outputDir"
 103     echo "  tmpPrefix=$tmpPrefix"
 104
 105     unset -f runLevelQA
 106     unset -f periodLevelQA
 107     unset -f runLevelHighPtTreeQA
 108     unset -f periodLevelHighPtTreeQA
 109     source ${detectorScript}
 110
 111     #################################################################
 112     #produce the QA and trending tree for each file (run)
 113     unset arrOfTouchedProductions
 114     declare -A arrOfTouchedProductions
 115     while read qaFile; do
 116       echo
 117
 118       #first check if input file exists
 119       [[ ! -f ${qaFile%\#*} ]] && echo "file ${qaFile%\#*} not accessible" && continue
 120
 121       if ! guessRunData ${qaFile}; then
 122         echo "could not guess run data from ${qaFile}"
 123         continue
 124       fi
 125
 126       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 127       tmpRunDir=${tmpProductionDir}/000${runNumber}
 128       mkdir -p ${tmpRunDir}
 129       cd ${tmpRunDir}
 130
 131       #by default we expect to have everything in the same archive
 132       highPtTree=${qaFile}
 133
 134       #maybe the input is not an archive, but a file
 135       [[ "${qaFile}" =~ QAresults.root$ ]] && highPtTree=""
 136       [[ "${qaFile}" =~ FilterEvents_Trees.root$ ]] && qaFile=""
 137
 138       #it is possible we get the highPt trees from somewhere else
 139       #search the list of high pt trees for the proper run number
 140       if [[ -n ${inputListHighPtTrees} ]]; then
 141         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 142         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 143       fi
 144
 145       echo qaFile=$qaFile
 146       echo highPtTree=$highPtTree
 147
 148       #what if we have a zip archive?
 149       if [[ "$qaFile" =~ .*.zip$ ]]; then
 150         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 151           qaFile="${qaFile}#QAresults.root"
 152         else
 153           qaFile=""
 154         fi
 155       fi
 156       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 157         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 158           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 159         else
 160           highPtTree=""
 161         fi
 162       fi
 163
 164       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 165         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 166         runLevelQA "${qaFile}" &> runLevelQA.log
 167         #perform some default actions:
 168         #if trending.root not created, create a default one
 169         if [[ ! -f trending.root ]]; then
 170           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" 2>&1 | tee -a runLevelQA.log
 171         fi
 172         if [[ -f trending.root ]]; then
 173           arrOfTouchedProductions[${tmpProductionDir}]=1
 174         else
 175           echo "trending.root not created"
 176         fi
 177       fi
 178       #expert QA based on high pt trees
 179       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 180         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 181         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 182         arrOfTouchedProductions[${tmpProductionDir}]=1
 183       fi
 184
 185       cd ${tmpDetectorRunDir}
 186
 187     done < ${inputList}
 188
 189     #################################################################
 190     #cache which productions were (re)done
 191     echo "list of processed productions:"
 192     echo "    ${!arrOfTouchedProductions[@]}"
 193     echo
 194
 195     #################################################################
 196     #(re)do the merging/trending
 197     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 198       cd ${tmpProductionDir}
 199       echo
 200       echo "running period level stuff in ${tmpProductionDir}"
 201
 202       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 203       echo productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 204
 205       mkdir -p ${productionDir}
 206       if [[ ! -d ${productionDir} ]]; then
 207         echo "cannot make productionDir $productionDir" && continue
 208       fi
 209
 210       #move runs to final destination
 211       for dir in ${tmpProductionDir}/000*; do
 212         echo
 213         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 214         if ! guessRunData "${dir}/dummyName"; then
 215           echo "could not guess run data from ${dir}"
 216           continue
 217         fi
 218
 219         #before moving - VALIDATE!!!
 220         if ! validate ${dir}; then
 221           continue
 222         fi
 223
 224         if [[ -d ${oldRunDir} ]]; then
 225           echo "removing old ${oldRunDir}"
 226           rm -rf ${oldRunDir}
 227         fi
 228         echo "moving new ${runNumber} to ${productionDir}"
 229         mv -f ${dir} ${productionDir}
 230       done
 231
 232       #go to a temp dir to do the period level stuff in a completely clean dir
 233       tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 234       echo
 235       echo tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 236       if ! mkdir -p ${tmpPeriodLevelQAdir}; then continue; fi
 237       cd ${tmpPeriodLevelQAdir}
 238
 239       #link the final list of per-run dirs here, just the dirs
 240       #to have a clean working directory
 241       unset linkedStuff
 242       declare -a linkedStuff
 243       for x in ${productionDir}/000*; do [[ -d $x ]] && ln -s $x && linkedStuff+=(${x##*/}); done
 244
 245       #merge trending files if any
 246       if /bin/ls 000*/trending.root &>/dev/null; then
 247         hadd trending.root 000*/trending.root &> periodLevelQA.log
 248       fi
 249
 250       #run the period level trending/QA
 251       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 252         echo running ${detector} periodLevelQA for production ${period}/${pass}
 253         periodLevelQA trending.root &>> periodLevelQA.log
 254       else
 255         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 256       fi
 257
 258       if ! validate ${PWD}; then continue; fi
 259
 260       #here we are validated so move the produced QA to the final place
 261       #clean up linked stuff first
 262       [[ -n ${linkedStuff[@]} ]] && rm ${linkedStuff[@]}
 263       #some of the output could be a directory, so handle that
 264       #TODO: maybe use rsync?
 265       for x in ${tmpPeriodLevelQAdir}/*; do
 266         if [[ -d ${x} ]]; then
 267           echo "removing ${productionDir}/${x##*/}"
 268           rm -rf ${productionDir}/${x##*/}
 269           echo "moving ${x} to ${productionDir}"
 270           mv ${x} ${productionDir}
 271         fi
 272         if [[ -f ${x} ]]; then
 273           echo "moving ${x} to ${productionDir}"
 274           mv -f ${x} ${productionDir}
 275         fi
 276       done
 277
 278       #remove the temp dir
 279       rm -rf ${tmpPeriodLevelQAdir}
 280
 281     done
 282
 283     cd ${workingDirectory}
 284
 285     if [[ -z ${planB} ]]; then
 286       echo
 287       echo removing ${tmpDetectorRunDir}
 288       rm -rf ${tmpDetectorRunDir}
 289     else
 290       executePlanB
 291     fi
 292   done #end of detector loop
 293
 294   #remove lock
 295   rm -f ${lockFile}
 296 }
 297
 298 executePlanB()
 299 {
 300   #in case of emergency
 301   if [[ -n ${MAILTO} ]]; then
 302     echo
 303     echo "trouble detected, sending email to ${MAILTO}"
 304
 305     cat ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 306   fi
 307 }
 308
 309 validate()
 310 {
 311   summarizeLogs ${1} >> ${logSummary}
 312   logStatus=$?
 313   if [[ ${logStatus} -ne 0 ]]; then
 314     echo "WARNING not validated: ${1}"
 315     planB=1
 316     return 1
 317   fi
 318   return 0
 319 }
 320
 321 summarizeLogs()
 322 {
 323   local dir=$1
 324   [[ ! -d ${dir} ]] && dir=${PWD}
 325
 326   #print a summary of logs
 327   logFiles=(
 328       "*.log"
 329       "stdout"
 330       "stderr"
 331   )
 332
 333   #check logs
 334   local logstatus=0
 335   for log in ${dir}/${logFiles[*]}; do
 336     finallog=${PWD%/}/${log}
 337     [[ ! -f ${log} ]] && continue
 338     errorSummary=$(validateLog ${log})
 339     validationStatus=$?
 340     [[ validationStatus -ne 0 ]] && logstatus=1
 341     if [[ ${validationStatus} -eq 0 ]]; then
 342       #in pretend mode randomly report an error in rec.log some cases
 343       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 344         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 345       else
 346         echo "${finallog} OK"
 347       fi
 348     elif [[ ${validationStatus} -eq 1 ]]; then
 349       echo "${finallog} BAD ${errorSummary}"
 350     elif [[ ${validationStatus} -eq 2 ]]; then
 351       echo "${finallog} OK MWAH ${errorSummary}"
 352     fi
 353   done
 354
 355   #report core files
 356   while read x; do
 357     echo ${x}
 358     chmod 644 ${x}
 359     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 360   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 361
 362   return ${logstatus}
 363 }
 364
 365 validateLog()
 366 {
 367   log=${1}
 368   errorConditions=(
 369             'There was a crash'
 370             'floating'
 371             'error while loading shared libraries'
 372             'std::bad_alloc'
 373             's_err_syswatch_'
 374             'Thread [0-9]* (Thread'
 375             'AliFatal'
 376             'core dumped'
 377             '\.C.*error:.*\.h: No such file'
 378             'segmentation'
 379             'Interpreter error recovered'
 380   )
 381
 382   warningConditions=(
 383             'This is serious'
 384   )
 385
 386   local logstatus=0
 387   local errorSummary=""
 388   local warningSummary=""
 389
 390   for ((i=0; i<${#errorConditions[@]};i++)); do
 391     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 392     [[ -n ${tmp} ]] && tmp+=" : "
 393     errorSummary+=${tmp}
 394   done
 395
 396   for ((i=0; i<${#warningConditions[@]};i++)); do
 397     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 398     [[ -n ${tmp} ]] && tmp+=" : "
 399     warningSummary+=${tmp}
 400   done
 401
 402   if [[ -n ${errorSummary} ]]; then
 403     echo "${errorSummary}"
 404     return 1
 405   fi
 406
 407   if [[ -n ${warningSummary} ]]; then
 408     echo "${warningSummary}"
 409     return 2
 410   fi
 411
 412   return 0
 413 }
 414
 415 parseConfig()
 416 {
 417   #config file
 418   configFile=""
 419   #where to search for qa files
 420   inputList=file.list
 421   #working directory
 422   workingDirectory="${PWD}"
 423   #where to place the final qa plots
 424   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 425   outputDirectory="${workingDirectory}/%DET"
 426   #filter out detector option
 427   excludeDetectors="EXAMPLE"
 428   #logs
 429   logDirectory=${workingDirectory}/logs
 430   #OCDB storage
 431   #ocdbStorage="raw://"
 432   #email to
 433   #MAILTO="fbellini@cern.ch"
 434
 435   #first, check if the config file is configured
 436   #is yes - source it so that other options can override it
 437   #if any
 438   for opt in $@; do
 439     if [[ ${opt} =~ configFile=.* ]]; then
 440       eval "${opt}"
 441       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 442       source "${configFile}"
 443       break
 444     fi
 445   done
 446
 447   #then, parse the options as they override the options from file
 448   while [[ -n ${1} ]]; do
 449     local var=${1#--}
 450     if [[ ${var} =~ .*=.* ]]; then
 451       eval "${var}"
 452     else
 453       echo "badly formatted option ${var}, should be: option=value, stopping..."
 454       return 1
 455     fi
 456     shift
 457   done
 458 }
 459
 460 guessRunData()
 461 {
 462   #guess the period from the path, pick the rightmost one
 463   period=""
 464   runNumber=""
 465   year=""
 466   pass=""
 467   legoTrainRunNumber=""
 468   dataType=""
 469
 470   local shortRunNumber=""
 471   local IFS="/"
 472   declare -a path=( $1 )
 473   local dirDepth=$(( ${#path[*]}-1 ))
 474   i=0
 475   for ((x=${dirDepth};x>=0;x--)); do
 476
 477     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 478     local field=${path[${x}]}
 479     local fieldNext=${path[$((x+1))]}
 480
 481     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 482     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 483     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
 484     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 485     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 486     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 487     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 488     (( i++ ))
 489   done
 490   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 491   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}
 492
 493   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 494   if [[ -z ${runNumber}} ]];
 495   then
 496     #error condition
 497     return 1
 498   else
 499     #ALL OK
 500     return 0
 501   fi
 502 }
 503
 504 substituteDetectorName()
 505 {
 506   local det=$1
 507   local dir=$2
 508   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 509   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 510 }
 511
 512 get_realpath()
 513 {
 514   if [[ -f "$1" ]]
 515   then
 516     # file *must* exist
 517     if cd "$(echo "${1%/*}")" &>/dev/null
 518     then
 519       # file *may* not be local
 520       # exception is ./file.ext
 521       # try 'cd .; cd -;' *works!*
 522       local tmppwd="$PWD"
 523       cd - &>/dev/null
 524     else
 525       # file *must* be local
 526       local tmppwd="$PWD"
 527     fi
 528   else
 529     # file *cannot* exist
 530     return 1 # failure
 531   fi
 532   # reassemble realpath
 533   echo "$tmppwd"/"${1##*/}"
 534   return 0 # success
 535 }
 536
 537 main $@