PWGPP/QA/scripts/runQA.sh

   1 #!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig $@; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init
  24   fi
  25
  26   updateQA $@
  27 }
  28
  29 updateQA()
  30 {
  31   umask 0002
  32   parseConfig $@
  33
  34   #be paranoid and make some full paths
  35   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  36   inputList=$(get_realpath ${inputList})
  37   mkdir -p ${workingDirectory}
  38   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  39   if [[ ! -d ${workingDirectory} ]]; then
  40     echo "working dir $workingDirectory does not exist and cannot be created"
  41     return 1
  42   fi
  43   cd ${workingDirectory}
  44
  45   echo JOB config:
  46   echo inputList=$inputList
  47   echo outputDirectory=$outputDirectory
  48   echo
  49
  50   dateString=$(date +%Y-%m-%d-%H-%M)
  51   echo "Start time QA process: $dateString"
  52
  53   #logging
  54   mkdir -p $logDirectory
  55   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  56   logFile="$logDirectory/${0##*/}.${dateString}.log"
  57   touch ${logFile}
  58   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  59   echo "logFile = $logFile"
  60
  61   #check lock
  62   lockFile=${logDirectory}/runQA.lock
  63   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee ${logFile} && return 1
  64   touch ${lockFile}
  65   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" | tee ${logFile} && return 1
  66
  67   exec &>${logFile}
  68
  69   ################################################################
  70   #ze detector loop
  71   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  72     unset planB
  73     [[ ! ${detectorScript} =~ .*\.sh$ ]] && continue
  74     detector=${detectorScript%.sh}
  75     detector=${detector##*/}
  76
  77     #skip if excluded
  78     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  79       echo "${detector} is excluded in config, skipping..."
  80       continue
  81     fi
  82
  83     #if includeDetectors set, only process thoe detectors specified there
  84     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  85       echo "${detector} not included in includeDetectors, skipping..."
  86       continue
  87     fi
  88
  89     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  90     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  91     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
  92     if ! mkdir -p ${tmpDetectorRunDir}; then
  93       echo "cannot create the temp dir $tmpDetectorRunDir"
  94       continue
  95     fi
  96     cd ${tmpDetectorRunDir}
  97
  98     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
  99     echo
 100     echo "##############################################"
 101     echo "running QA for ${detector}"
 102     echo "  outputDir=$outputDir"
 103     echo "  tmpPrefix=$tmpPrefix"
 104
 105     unset -f runLevelQA
 106     unset -f periodLevelQA
 107     unset -f runLevelHighPtTreeQA
 108     unset -f periodLevelHighPtTreeQA
 109     source ${detectorScript}
 110
 111     #################################################################
 112     #produce the QA and trending tree for each file (run)
 113     unset arrOfTouchedProductions
 114     declare -A arrOfTouchedProductions
 115     while read qaFile; do
 116       echo
 117
 118       if ! guessRunData ${qaFile}; then
 119         echo "could not guess run data from ${qaFile}"
 120         continue
 121       fi
 122
 123       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 124       tmpRunDir=${tmpProductionDir}/000${runNumber}
 125       mkdir -p ${tmpRunDir}
 126       cd ${tmpRunDir}
 127
 128       #by default we expect to have everything in the same archive
 129       highPtTree=${qaFile}
 130
 131       #maybe the input is not an archive, but a file
 132       [[ "${qaFile}" =~ "QAresults.root" ]] && highPtTree=""
 133       [[ "${qaFile}" =~ "FilterEvents_Trees.root" ]] && qaFile=""
 134
 135       #it is possible we get the highPt trees from somewhere else
 136       #search the list of high pt trees for the proper run number
 137       if [[ -n ${inputListHighPtTrees} ]]; then
 138         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 139         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 140       fi
 141
 142       echo qaFile=$qaFile
 143       echo highPtTree=$highPtTree
 144
 145       #what if we have a zip archive?
 146       if [[ "$qaFile" =~ .*.zip$ ]]; then
 147         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 148           qaFile="${qaFile}#QAresults.root"
 149         else
 150           qaFile=""
 151         fi
 152       fi
 153       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 154         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 155           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 156         else
 157           highPtTree=""
 158         fi
 159       fi
 160
 161       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 162         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 163         runLevelQA "${qaFile}" &> runLevelQA.log
 164         #perform some default actions:
 165         #if trending.root not created, create a default one
 166         if [[ ! -f trending.root ]]; then
 167           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" &>> runLevelQA.log
 168         fi
 169         arrOfTouchedProductions[${tmpProductionDir}]=1
 170       fi
 171       #expert QA based on high pt trees
 172       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 173         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 174         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 175         arrOfTouchedProductions[${tmpProductionDir}]=1
 176       fi
 177
 178       cd ${tmpDetectorRunDir}
 179
 180     done < ${inputList}
 181
 182     #################################################################
 183     #cache which productions were (re)done
 184     echo "list of processed productions:"
 185     echo "    ${!arrOfTouchedProductions[@]}"
 186     echo
 187
 188     #################################################################
 189     #(re)do the merging/trending
 190     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 191       cd ${tmpProductionDir}
 192       echo
 193       echo "running period level stuff in ${tmpProductionDir}"
 194
 195       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 196       echo productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 197
 198       mkdir -p ${productionDir}
 199       if [[ ! -d ${productionDir} ]]; then
 200         echo "cannot make productionDir $productionDir" && continue
 201       fi
 202
 203       #move runs to final destination
 204       for dir in ${tmpProductionDir}/000*; do
 205         echo
 206         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 207         if ! guessRunData "${dir}/dummyName"; then
 208           echo "could not guess run data from ${dir}"
 209           continue
 210         fi
 211
 212         #before moving - VALIDATE!!!
 213         if ! validate ${dir}; then
 214           continue
 215         fi
 216
 217         if [[ -d ${oldRunDir} ]]; then
 218           echo "removing old ${oldRunDir}"
 219           rm -rf ${oldRunDir}
 220         fi
 221         echo "moving new ${runNumber} to ${productionDir}"
 222         mv -f ${dir} ${productionDir}
 223       done
 224
 225       #go to a temp dir to do the period level stuff
 226       tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 227       echo
 228       echo tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 229       if ! mkdir -p ${tmpPeriodLevelQAdir}; then continue; fi
 230       cd ${tmpPeriodLevelQAdir}
 231
 232       #link the final list of per-run dirs here, just the dirs
 233       #to have a clean working directory
 234       unset linkedStuff
 235       declare -a linkedStuff
 236       for x in ${productionDir}/000*; do [[ -d $x ]] && ln -s $x && linkedStuff+=(${x##*/}); done
 237       ls
 238
 239       #merge trending files if any
 240       if /bin/ls 000*/trending.root &>/dev/null; then
 241         hadd trending.root 000*/trending.root &> periodLevelQA.log
 242       fi
 243
 244       #run the period level trending/QA
 245       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 246         echo running ${detector} periodLevelQA for production ${period}/${pass}
 247         periodLevelQA trending.root &>> periodLevelQA.log
 248       else
 249         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 250       fi
 251
 252       if ! validate ${PWD}; then continue; fi
 253
 254       #here we are validated so move the produced QA to the final place
 255       #clean up linked stuff first
 256       [[ -n ${linkedStuff[@]} ]] && rm ${linkedStuff[@]}
 257       #some of the output could be a directory, so handle that
 258       #TODO: maybe use rsync?
 259       for x in ${tmpPeriodLevelQAdir}/*; do
 260         if [[ -d ${x} ]]; then
 261           echo "removing ${productionDir}/${x##*/}"
 262           rm -rf ${productionDir}/${x##*/}
 263           echo "moving ${x} to ${productionDir}"
 264           mv ${x} ${productionDir}
 265         fi
 266         if [[ -f ${x} ]]; then
 267           echo "moving ${x} to ${productionDir}"
 268           mv -f ${x} ${productionDir}
 269         fi
 270       done
 271
 272       #remove the temp dir
 273       rm -rf ${tmpPeriodLevelQAdir}
 274
 275     done
 276
 277     cd ${workingDirectory}
 278
 279     if [[ -z ${planB} ]]; then
 280       echo
 281       echo removing ${tmpDetectorRunDir}
 282       rm -rf ${tmpDetectorRunDir}
 283     else
 284       executePlanB
 285     fi
 286   done
 287
 288   #remove lock
 289   rm -f ${lockFile}
 290 }
 291
 292 executePlanB()
 293 {
 294   #in case of emergency
 295   if [[ -n ${MAILTO} ]]; then
 296     echo
 297     echo "trouble detected, sending email to ${MAILTO}"
 298
 299     cat ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 300   fi
 301 }
 302
 303 validate()
 304 {
 305   summarizeLogs ${1} >> ${logSummary}
 306   logStatus=$?
 307   if [[ ${logStatus} -ne 0 ]]; then
 308     echo "WARNING not validated: ${1}"
 309     planB=1
 310     return 1
 311   fi
 312   return 0
 313 }
 314
 315 summarizeLogs()
 316 {
 317   local dir=$1
 318   [[ ! -d ${dir} ]] && dir=${PWD}
 319
 320   #print a summary of logs
 321   logFiles=(
 322       "*.log"
 323       "stdout"
 324       "stderr"
 325   )
 326
 327   #check logs
 328   local logstatus=0
 329   for log in ${dir}/${logFiles[*]}; do
 330     finallog=${PWD%/}/${log}
 331     [[ ! -f ${log} ]] && continue
 332     errorSummary=$(validateLog ${log})
 333     validationStatus=$?
 334     [[ validationStatus -ne 0 ]] && logstatus=1
 335     if [[ ${validationStatus} -eq 0 ]]; then
 336       #in pretend mode randomly report an error in rec.log some cases
 337       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 338         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 339       else
 340         echo "${finallog} OK"
 341       fi
 342     elif [[ ${validationStatus} -eq 1 ]]; then
 343       echo "${finallog} BAD ${errorSummary}"
 344     elif [[ ${validationStatus} -eq 2 ]]; then
 345       echo "${finallog} OK MWAH ${errorSummary}"
 346     fi
 347   done
 348
 349   #report core files
 350   while read x; do
 351     echo ${x}
 352     chmod 644 ${x}
 353     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 354   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 355
 356   return ${logstatus}
 357 }
 358
 359 validateLog()
 360 {
 361   log=${1}
 362   errorConditions=(
 363             'There was a crash'
 364             'floating'
 365             'error while loading shared libraries'
 366             'std::bad_alloc'
 367             's_err_syswatch_'
 368             'Thread [0-9]* (Thread'
 369             'AliFatal'
 370             'core dumped'
 371             '\.C.*error:.*\.h: No such file'
 372             'segmentation'
 373             'Interpreter error recovered'
 374   )
 375
 376   warningConditions=(
 377             'This is serious'
 378   )
 379
 380   local logstatus=0
 381   local errorSummary=""
 382   local warningSummary=""
 383
 384   for ((i=0; i<${#errorConditions[@]};i++)); do
 385     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 386     [[ -n ${tmp} ]] && tmp+=" : "
 387     errorSummary+=${tmp}
 388   done
 389
 390   for ((i=0; i<${#warningConditions[@]};i++)); do
 391     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 392     [[ -n ${tmp} ]] && tmp+=" : "
 393     warningSummary+=${tmp}
 394   done
 395
 396   if [[ -n ${errorSummary} ]]; then
 397     echo "${errorSummary}"
 398     return 1
 399   fi
 400
 401   if [[ -n ${warningSummary} ]]; then
 402     echo "${warningSummary}"
 403     return 2
 404   fi
 405
 406   return 0
 407 }
 408
 409 parseConfig()
 410 {
 411   #config file
 412   configFile=""
 413   #where to search for qa files
 414   inputList=file.list
 415   #working directory
 416   workingDirectory="${PWD}"
 417   #where to place the final qa plots
 418   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 419   outputDirectory="${workingDirectory}/%DET"
 420   #filter out detector option
 421   excludeDetectors="EXAMPLE"
 422   #logs
 423   logDirectory=${workingDirectory}/logs
 424   #set aliroot
 425   #alirootEnv="/home/mkrzewic/alisoft/balice_master.sh"
 426   #OCDB storage
 427   #ocdbStorage="raw://"
 428   #email to
 429   #MAILTO="fbellini@cern.ch"
 430
 431   #first, check if the config file is configured
 432   #is yes - source it so that other options can override it
 433   #if any
 434   for opt in $@; do
 435     if [[ ${opt} =~ configFile=.* ]]; then
 436       eval "${opt}"
 437       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 438       source "${configFile}"
 439       break
 440     fi
 441   done
 442
 443   #then, parse the options as they override the options from file
 444   while [[ -n ${1} ]]; do
 445     local var=${1#--}
 446     if [[ ${var} =~ .*=.* ]]; then
 447       eval "${var}"
 448     else
 449       echo "badly formatted option ${var}, should be: option=value, stopping..."
 450       return 1
 451     fi
 452     shift
 453   done
 454 }
 455
 456 guessRunData()
 457 {
 458   #guess the period from the path, pick the rightmost one
 459   period=""
 460   runNumber=""
 461   year=""
 462   pass=""
 463   legoTrainRunNumber=""
 464   dataType=""
 465
 466   local shortRunNumber=""
 467   local IFS="/"
 468   declare -a path=( $1 )
 469   local dirDepth=$(( ${#path[*]}-1 ))
 470   i=0
 471   for ((x=${dirDepth};x>=0;x--)); do
 472
 473     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 474     local field=${path[${x}]}
 475     local fieldNext=${path[$((x+1))]}
 476
 477     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 478     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 479     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
 480     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 481     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 482     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 483     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 484     (( i++ ))
 485   done
 486   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 487   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}
 488
 489   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 490   if [[ -z ${runNumber}} ]];
 491   then
 492     #error condition
 493     return 1
 494   else
 495     #ALL OK
 496     return 0
 497   fi
 498 }
 499
 500 substituteDetectorName()
 501 {
 502   local det=$1
 503   local dir=$2
 504   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 505   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 506 }
 507
 508 get_realpath()
 509 {
 510   if [[ -f "$1" ]]
 511   then
 512     # file *must* exist
 513     if cd "$(echo "${1%/*}")" &>/dev/null
 514     then
 515       # file *may* not be local
 516       # exception is ./file.ext
 517       # try 'cd .; cd -;' *works!*
 518       local tmppwd="$PWD"
 519       cd - &>/dev/null
 520     else
 521       # file *must* be local
 522       local tmppwd="$PWD"
 523     fi
 524   else
 525     # file *cannot* exist
 526     return 1 # failure
 527   fi
 528   # reassemble realpath
 529   echo "$tmppwd"/"${1##*/}"
 530   return 0 # success
 531 }
 532
 533 main $@