PWGPP/QA/scripts/runQA.sh

   1 #!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig $@; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init
  24   fi
  25
  26   updateQA $@
  27 }
  28
  29 updateQA()
  30 {
  31   umask 0002
  32   parseConfig $@
  33
  34   #be paranoid and make some full paths
  35   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  36   inputList=$(get_realpath ${inputList})
  37   mkdir -p ${workingDirectory}
  38   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  39   if [[ ! -d ${workingDirectory} ]]; then
  40     echo "working dir $workingDirectory does not exist and cannot be created"
  41     return 1
  42   fi
  43   cd ${workingDirectory}
  44
  45   echo JOB config:
  46   echo inputList=$inputList
  47   echo outputDirectory=$outputDirectory
  48   echo
  49
  50   dateString=$(date +%Y-%m-%d-%H-%M)
  51   echo "Start time QA process: $dateString"
  52
  53   #logging
  54   mkdir -p $logDirectory
  55   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  56   logFile="$logDirectory/${0##*/}.${dateString}.log"
  57   touch ${logFile}
  58   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  59   echo "logFile = $logFile"
  60   exec &>${logFile}
  61
  62   #check lock
  63   lockFile=${logDirectory}/runQA.lock
  64   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" && return 1
  65   touch ${lockFile}
  66   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" && return 1
  67
  68   ################################################################
  69   #ze detector loop
  70   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  71
  72     [[ ! ${detectorScript} =~ .*\.sh ]] && continue
  73     detector=${detectorScript%.sh}
  74     detector=${detector##*/}
  75
  76     #skip if excluded
  77     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  78       echo "${detector} is excluded in config, skipping..."
  79       continue
  80     fi
  81
  82     #if includeDetectors set, only process thoe detectors specified there
  83     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  84       echo "${detector} not included in includeDetectors, skipping..."
  85       continue
  86     fi
  87
  88     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  89     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  90     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}
  91     if ! mkdir -p ${tmpDetectorRunDir}; then
  92       echo "cannot create the temp dir $tmpDetectorRunDir"
  93       continue
  94     fi
  95     cd ${tmpDetectorRunDir}
  96
  97     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
  98     echo
  99     echo "##############################################"
 100     echo "running QA for ${detector}"
 101     echo "  outputDir=$outputDir"
 102     echo "  tmpPrefix=$tmpPrefix"
 103
 104     unset -f runLevelQA
 105     unset -f periodLevelQA
 106     unset -f runLevelHighPtTreeQA
 107     unset -f periodLevelHighPtTreeQA
 108     source ${detectorScript}
 109
 110     #################################################################
 111     #produce the QA and trending tree for each file (run)
 112     unset arrOfTouchedProductions
 113     declare -A arrOfTouchedProductions
 114     while read qaFile; do
 115       echo
 116
 117       if ! guessRunData ${qaFile}; then
 118         echo "could not guess run data from ${qaFile}"
 119         continue
 120       fi
 121
 122       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 123       tmpRunDir=${tmpProductionDir}/000${runNumber}
 124       mkdir -p ${tmpRunDir}
 125       cd ${tmpRunDir}
 126
 127       #by default we expect to have everything in the same archive
 128       highPtTree=${qaFile}
 129
 130       #maybe the input is not an archive, but a file
 131       [[ "${qaFile}" =~ "QAresults.root" ]] && highPtTree=""
 132       [[ "${qaFile}" =~ "FilterEvents_Trees.root" ]] && qaFile=""
 133
 134       #it is possible we get the highPt trees from somewhere else
 135       #search the list of high pt trees for the proper run number
 136       if [[ -n ${inputListHighPtTrees} ]]; then
 137         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 138         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 139       fi
 140
 141       echo qaFile=$qaFile
 142       echo highPtTree=$highPtTree
 143
 144       #what if we have a zip archive?
 145       if [[ "$qaFile" =~ .*.zip$ ]]; then
 146         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 147           qaFile="${qaFile}#QAresults.root"
 148         else
 149           qaFile=""
 150         fi
 151       fi
 152       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 153         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 154           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 155         else
 156           highPtTree=""
 157         fi
 158       fi
 159
 160       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 161         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 162         runLevelQA "${qaFile}" &> runLevelQA.log
 163         #perform some default actions:
 164         #if trending.root not created, create a default one
 165         if [[ ! -f trending.root ]]; then
 166           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" &>> runLevelQA.log
 167         fi
 168         arrOfTouchedProductions[${tmpProductionDir}]=1
 169       fi
 170       #expert QA based on high pt trees
 171       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 172         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 173         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 174         arrOfTouchedProductions[${tmpProductionDir}]=1
 175       fi
 176
 177       cd ${tmpDetectorRunDir}
 178
 179     done < ${inputList}
 180
 181     #################################################################
 182     #cache which productions were (re)done
 183     echo "list of processed productions:"
 184     echo "    ${!arrOfTouchedProductions[@]}"
 185     echo
 186
 187     #################################################################
 188     #(re)do the merging/trending in the final destination
 189     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 190       echo
 191       echo "running period level stuff in ${tmpProductionDir}"
 192
 193       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 194
 195       mkdir -p ${productionDir}
 196       if [[ ! -d ${productionDir} ]]; then
 197         echo "cannot make productionDir $productionDir" && continue
 198       fi
 199       cd ${productionDir}
 200
 201       #move to final destination
 202       for dir in ${tmpProductionDir}/*; do
 203         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 204         if ! guessRunData "${dir}/dummyName"; then
 205           echo "could not guess run data from ${dir}"
 206           continue
 207         fi
 208
 209         #before moving - VALIDATE!!!
 210         if ! validate ${dir}; then continue; fi
 211
 212         if [[ -d ${oldRunDir} ]]; then
 213           echo "removing old ${oldRunDir}"
 214           rm -rf ${oldRunDir}
 215         fi
 216         echo "moving new ${runNumber} to ${productionDir}"
 217         mv -f ${dir} ${productionDir}
 218       done
 219
 220       rm -f trending.root
 221
 222       #merge trending files if any
 223       if /bin/ls 000*/trending.root &>/dev/null; then
 224         hadd trending.root 000*/trending.root &> periodLevelQA.log
 225       fi
 226
 227       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 228         echo running ${detector} periodLevelQA for production ${period}/${pass}
 229         periodLevelQA trending.root &>> periodLevelQA.log
 230       else
 231         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 232       fi
 233
 234
 235       if ! validate ${PWD}; then continue; fi
 236
 237       cd ${tmpDetectorRunDir}
 238
 239     done
 240
 241     cd ${workingDirectory}
 242
 243     if [[ -z ${planB} ]]; then
 244       echo
 245       echo removing ${tmpDetectorRunDir}
 246       rm -rf ${tmpDetectorRunDir}
 247     else
 248       executePlanB
 249     fi
 250   done
 251
 252   #remove lock
 253   rm -f ${lockFile}
 254 }
 255
 256 executePlanB()
 257 {
 258   #in case of emergency
 259   if [[ -n ${MAILTO} ]]; then
 260     echo
 261     echo "trouble detected, sending email to ${MAILTO}"
 262
 263     cat ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 264   fi
 265 }
 266
 267 validate()
 268 {
 269   summarizeLogs ${1} >> ${logSummary}
 270   logStatus=$?
 271   if [[ ${logStatus} -ne 0 ]]; then
 272     echo "WARNING not validated: ${1}"
 273     planB=1
 274     return 1
 275   fi
 276   return 0
 277 }
 278
 279 summarizeLogs()
 280 {
 281   local dir=$1
 282   [[ ! -d ${dir} ]] && dir=${PWD}
 283
 284   #print a summary of logs
 285   logFiles=(
 286       "*.log"
 287       "stdout"
 288       "stderr"
 289   )
 290
 291   #check logs
 292   local logstatus=0
 293   for log in ${dir}/${logFiles[*]}; do
 294     finallog=${PWD%/}/${log}
 295     [[ ! -f ${log} ]] && continue
 296     errorSummary=$(validateLog ${log})
 297     validationStatus=$?
 298     [[ validationStatus -ne 0 ]] && logstatus=1
 299     if [[ ${validationStatus} -eq 0 ]]; then
 300       #in pretend mode randomly report an error in rec.log some cases
 301       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 302         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 303       else
 304         echo "${finallog} OK"
 305       fi
 306     elif [[ ${validationStatus} -eq 1 ]]; then
 307       echo "${finallog} BAD ${errorSummary}"
 308     elif [[ ${validationStatus} -eq 2 ]]; then
 309       echo "${finallog} OK MWAH ${errorSummary}"
 310     fi
 311   done
 312
 313   #report core files
 314   while read x; do
 315     echo ${x}
 316     chmod 644 ${x}
 317     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 318   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 319
 320   return ${logstatus}
 321 }
 322
 323 validateLog()
 324 {
 325   log=${1}
 326   errorConditions=(
 327             'There was a crash'
 328             'floating'
 329             'error while loading shared libraries'
 330             'std::bad_alloc'
 331             's_err_syswatch_'
 332             'Thread [0-9]* (Thread'
 333             'AliFatal'
 334             'core dumped'
 335             '\.C.*error:.*\.h: No such file'
 336             'segmentation'
 337             'Interpreter error recovered'
 338   )
 339
 340   warningConditions=(
 341             'This is serious'
 342   )
 343
 344   local logstatus=0
 345   local errorSummary=""
 346   local warningSummary=""
 347
 348   for ((i=0; i<${#errorConditions[@]};i++)); do
 349     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 350     [[ -n ${tmp} ]] && tmp+=" : "
 351     errorSummary+=${tmp}
 352   done
 353
 354   for ((i=0; i<${#warningConditions[@]};i++)); do
 355     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 356     [[ -n ${tmp} ]] && tmp+=" : "
 357     warningSummary+=${tmp}
 358   done
 359
 360   if [[ -n ${errorSummary} ]]; then
 361     echo "${errorSummary}"
 362     return 1
 363   fi
 364
 365   if [[ -n ${warningSummary} ]]; then
 366     echo "${warningSummary}"
 367     return 2
 368   fi
 369
 370   return 0
 371 }
 372
 373 parseConfig()
 374 {
 375   #config file
 376   configFile=""
 377   #where to search for qa files
 378   inputList=file.list
 379   #working directory
 380   workingDirectory="${PWD}"
 381   #where to place the final qa plots
 382   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 383   outputDirectory="${workingDirectory}/%DET"
 384   #filter out detector option
 385   excludeDetectors="EXAMPLE"
 386   #logs
 387   logDirectory=${workingDirectory}/logs
 388   #set aliroot
 389   #alirootEnv="/home/mkrzewic/alisoft/balice_master.sh"
 390   #OCDB storage
 391   #ocdbStorage="raw://"
 392   #email to
 393   #MAILTO="fbellini@cern.ch"
 394
 395   #first, check if the config file is configured
 396   #is yes - source it so that other options can override it
 397   #if any
 398   for opt in $@; do
 399     if [[ ${opt} =~ configFile=.* ]]; then
 400       eval "${opt}"
 401       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 402       source "${configFile}"
 403       break
 404     fi
 405   done
 406
 407   #then, parse the options as they override the options from file
 408   while [[ -n ${1} ]]; do
 409     local var=${1#--}
 410     if [[ ${var} =~ .*=.* ]]; then
 411       eval "${var}"
 412     else
 413       echo "badly formatted option ${var}, should be: option=value, stopping..."
 414       return 1
 415     fi
 416     shift
 417   done
 418 }
 419
 420 guessRunData()
 421 {
 422   #guess the period from the path, pick the rightmost one
 423   period=""
 424   runNumber=""
 425   year=""
 426   pass=""
 427   legoTrainRunNumber=""
 428   dataType=""
 429
 430   local shortRunNumber=""
 431   local IFS="/"
 432   declare -a path=( $1 )
 433   local dirDepth=$(( ${#path[*]}-1 ))
 434   i=0
 435   for ((x=${dirDepth};x>=0;x--)); do
 436
 437     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 438     local field=${path[${x}]}
 439     local fieldNext=${path[$((x+1))]}
 440
 441     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 442     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 443     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
 444     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 445     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 446     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 447     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 448     (( i++ ))
 449   done
 450   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 451   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}
 452
 453   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 454   if [[ -z ${runNumber}} ]];
 455   then
 456     #error condition
 457     return 1
 458   else
 459     #ALL OK
 460     return 0
 461   fi
 462 }
 463
 464 substituteDetectorName()
 465 {
 466   local det=$1
 467   local dir=$2
 468   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 469   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 470 }
 471
 472 get_realpath()
 473 {
 474   if [[ -f "$1" ]]
 475   then
 476     # file *must* exist
 477     if cd "$(echo "${1%/*}")" &>/dev/null
 478     then
 479       # file *may* not be local
 480       # exception is ./file.ext
 481       # try 'cd .; cd -;' *works!*
 482       local tmppwd="$PWD"
 483       cd - &>/dev/null
 484     else
 485       # file *must* be local
 486       local tmppwd="$PWD"
 487     fi
 488   else
 489     # file *cannot* exist
 490     return 1 # failure
 491   fi
 492   # reassemble realpath
 493   echo "$tmppwd"/"${1##*/}"
 494   return 0 # success
 495 }
 496
 497 main $@