PWGPP/QA/scripts/runQA.sh

   1 #!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig $@; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init
  24   fi
  25
  26   updateQA $@
  27 }
  28
  29 updateQA()
  30 {
  31   umask 0002
  32   parseConfig $@
  33
  34   #be paranoid and make some full paths
  35   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  36   inputList=$(get_realpath ${inputList})
  37   mkdir -p ${workingDirectory}
  38   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  39   if [[ ! -d ${workingDirectory} ]]; then
  40     echo "working dir $workingDirectory does not exist and cannot be created"
  41     return 1
  42   fi
  43   cd ${workingDirectory}
  44
  45   echo JOB config:
  46   echo inputList=$inputList
  47   echo outputDirectory=$outputDirectory
  48   echo
  49
  50   dateString=$(date +%Y-%m-%d-%H-%M)
  51   echo "Start time QA process: $dateString"
  52
  53   #logging
  54   mkdir -p $logDirectory
  55   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  56   logFile="$logDirectory/${0##*/}.${dateString}.log"
  57   touch ${logFile}
  58   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  59   echo "logFile = $logFile"
  60
  61   #check lock
  62   lockFile=${logDirectory}/runQA.lock
  63   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee ${logFile} && return 1
  64   touch ${lockFile}
  65   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" | tee ${logFile} && return 1
  66
  67   exec &>${logFile}
  68
  69   ################################################################
  70   #ze detector loop
  71   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  72     unset planB
  73     [[ ! ${detectorScript} =~ .*\.sh$ ]] && continue
  74     detector=${detectorScript%.sh}
  75     detector=${detector##*/}
  76
  77     #skip if excluded
  78     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  79       echo "${detector} is excluded in config, skipping..."
  80       continue
  81     fi
  82
  83     #if includeDetectors set, only process thoe detectors specified there
  84     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  85       echo "${detector} not included in includeDetectors, skipping..."
  86       continue
  87     fi
  88
  89     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  90     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  91     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
  92     if ! mkdir -p ${tmpDetectorRunDir}; then
  93       echo "cannot create the temp dir $tmpDetectorRunDir"
  94       continue
  95     fi
  96     cd ${tmpDetectorRunDir}
  97
  98     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
  99     echo
 100     echo "##############################################"
 101     echo "running QA for ${detector}"
 102     echo "  outputDir=$outputDir"
 103     echo "  tmpPrefix=$tmpPrefix"
 104
 105     unset -f runLevelQA
 106     unset -f periodLevelQA
 107     unset -f runLevelHighPtTreeQA
 108     unset -f periodLevelHighPtTreeQA
 109     source ${detectorScript}
 110
 111     #################################################################
 112     #produce the QA and trending tree for each file (run)
 113     unset arrOfTouchedProductions
 114     declare -A arrOfTouchedProductions
 115     while read qaFile; do
 116       echo
 117
 118       if ! guessRunData ${qaFile}; then
 119         echo "could not guess run data from ${qaFile}"
 120         continue
 121       fi
 122
 123       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 124       tmpRunDir=${tmpProductionDir}/000${runNumber}
 125       mkdir -p ${tmpRunDir}
 126       cd ${tmpRunDir}
 127
 128       #by default we expect to have everything in the same archive
 129       highPtTree=${qaFile}
 130
 131       #maybe the input is not an archive, but a file
 132       [[ "${qaFile}" =~ "QAresults.root" ]] && highPtTree=""
 133       [[ "${qaFile}" =~ "FilterEvents_Trees.root" ]] && qaFile=""
 134
 135       #it is possible we get the highPt trees from somewhere else
 136       #search the list of high pt trees for the proper run number
 137       if [[ -n ${inputListHighPtTrees} ]]; then
 138         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 139         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 140       fi
 141
 142       echo qaFile=$qaFile
 143       echo highPtTree=$highPtTree
 144
 145       #what if we have a zip archive?
 146       if [[ "$qaFile" =~ .*.zip$ ]]; then
 147         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 148           qaFile="${qaFile}#QAresults.root"
 149         else
 150           qaFile=""
 151         fi
 152       fi
 153       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 154         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 155           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 156         else
 157           highPtTree=""
 158         fi
 159       fi
 160
 161       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 162         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 163         runLevelQA "${qaFile}" &> runLevelQA.log
 164         #perform some default actions:
 165         #if trending.root not created, create a default one
 166         if [[ ! -f trending.root ]]; then
 167           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" &>> runLevelQA.log
 168         fi
 169         arrOfTouchedProductions[${tmpProductionDir}]=1
 170       fi
 171       #expert QA based on high pt trees
 172       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 173         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 174         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 175         arrOfTouchedProductions[${tmpProductionDir}]=1
 176       fi
 177
 178       cd ${tmpDetectorRunDir}
 179
 180     done < ${inputList}
 181
 182     #################################################################
 183     #cache which productions were (re)done
 184     echo "list of processed productions:"
 185     echo "    ${!arrOfTouchedProductions[@]}"
 186     echo
 187
 188     #################################################################
 189     #(re)do the merging/trending
 190     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 191       cd ${tmpProductionDir}
 192       echo
 193       echo "running period level stuff in ${tmpProductionDir}"
 194
 195       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 196       echo productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 197
 198       mkdir -p ${productionDir}
 199       if [[ ! -d ${productionDir} ]]; then
 200         echo "cannot make productionDir $productionDir" && continue
 201       fi
 202
 203       #move runs to final destination
 204       for dir in ${tmpProductionDir}/000*; do
 205         echo
 206         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 207         if ! guessRunData "${dir}/dummyName"; then
 208           echo "could not guess run data from ${dir}"
 209           continue
 210         fi
 211
 212         #before moving - VALIDATE!!!
 213         if ! validate ${dir}; then
 214           continue
 215         fi
 216
 217         if [[ -d ${oldRunDir} ]]; then
 218           echo "removing old ${oldRunDir}"
 219           rm -rf ${oldRunDir}
 220         fi
 221         echo "moving new ${runNumber} to ${productionDir}"
 222         mv -f ${dir} ${productionDir}
 223       done
 224
 225       #go to a temp dir to do the period level stuff
 226       tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 227       echo
 228       echo tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 229       if ! mkdir -p ${tmpPeriodLevelQAdir}; then continue; fi
 230       cd ${tmpPeriodLevelQAdir}
 231
 232       #link the final list of per-run dirs here, just the dirs
 233       #to have a clean working directory
 234       unset linkedStuff
 235       declare -a linkedStuff
 236       for x in ${productionDir}/000*; do [[ -d $x ]] && ln -s $x && linkedStuff+=(${x##*/}); done
 237
 238       #merge trending files if any
 239       if /bin/ls 000*/trending.root &>/dev/null; then
 240         hadd trending.root 000*/trending.root &> periodLevelQA.log
 241       fi
 242
 243       #run the period level trending/QA
 244       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 245         echo running ${detector} periodLevelQA for production ${period}/${pass}
 246         periodLevelQA trending.root &>> periodLevelQA.log
 247       else
 248         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 249       fi
 250
 251       if ! validate ${PWD}; then continue; fi
 252
 253       #here we are validated so move the produced QA to the final place
 254       #clean up linked stuff first
 255       [[ -n ${linkedStuff[@]} ]] && rm ${linkedStuff[@]}
 256       #some of the output could be a directory, so handle that
 257       #TODO: maybe use rsync?
 258       for x in ${tmpPeriodLevelQAdir}/*; do
 259         if [[ -d ${x} ]]; then
 260           echo "removing ${productionDir}/${x##*/}"
 261           rm -rf ${productionDir}/${x##*/}
 262           echo "moving ${x} to ${productionDir}"
 263           mv ${x} ${productionDir}
 264         fi
 265         if [[ -f ${x} ]]; then
 266           echo "moving ${x} to ${productionDir}"
 267           mv -f ${x} ${productionDir}
 268         fi
 269       done
 270
 271       #remove the temp dir
 272       rm -rf ${tmpPeriodLevelQAdir}
 273
 274     done
 275
 276     cd ${workingDirectory}
 277
 278     if [[ -z ${planB} ]]; then
 279       echo
 280       echo removing ${tmpDetectorRunDir}
 281       rm -rf ${tmpDetectorRunDir}
 282     else
 283       executePlanB
 284     fi
 285   done
 286
 287   #remove lock
 288   rm -f ${lockFile}
 289 }
 290
 291 executePlanB()
 292 {
 293   #in case of emergency
 294   if [[ -n ${MAILTO} ]]; then
 295     echo
 296     echo "trouble detected, sending email to ${MAILTO}"
 297
 298     cat ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 299   fi
 300 }
 301
 302 validate()
 303 {
 304   summarizeLogs ${1} >> ${logSummary}
 305   logStatus=$?
 306   if [[ ${logStatus} -ne 0 ]]; then
 307     echo "WARNING not validated: ${1}"
 308     planB=1
 309     return 1
 310   fi
 311   return 0
 312 }
 313
 314 summarizeLogs()
 315 {
 316   local dir=$1
 317   [[ ! -d ${dir} ]] && dir=${PWD}
 318
 319   #print a summary of logs
 320   logFiles=(
 321       "*.log"
 322       "stdout"
 323       "stderr"
 324   )
 325
 326   #check logs
 327   local logstatus=0
 328   for log in ${dir}/${logFiles[*]}; do
 329     finallog=${PWD%/}/${log}
 330     [[ ! -f ${log} ]] && continue
 331     errorSummary=$(validateLog ${log})
 332     validationStatus=$?
 333     [[ validationStatus -ne 0 ]] && logstatus=1
 334     if [[ ${validationStatus} -eq 0 ]]; then
 335       #in pretend mode randomly report an error in rec.log some cases
 336       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 337         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 338       else
 339         echo "${finallog} OK"
 340       fi
 341     elif [[ ${validationStatus} -eq 1 ]]; then
 342       echo "${finallog} BAD ${errorSummary}"
 343     elif [[ ${validationStatus} -eq 2 ]]; then
 344       echo "${finallog} OK MWAH ${errorSummary}"
 345     fi
 346   done
 347
 348   #report core files
 349   while read x; do
 350     echo ${x}
 351     chmod 644 ${x}
 352     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 353   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 354
 355   return ${logstatus}
 356 }
 357
 358 validateLog()
 359 {
 360   log=${1}
 361   errorConditions=(
 362             'There was a crash'
 363             'floating'
 364             'error while loading shared libraries'
 365             'std::bad_alloc'
 366             's_err_syswatch_'
 367             'Thread [0-9]* (Thread'
 368             'AliFatal'
 369             'core dumped'
 370             '\.C.*error:.*\.h: No such file'
 371             'segmentation'
 372             'Interpreter error recovered'
 373   )
 374
 375   warningConditions=(
 376             'This is serious'
 377   )
 378
 379   local logstatus=0
 380   local errorSummary=""
 381   local warningSummary=""
 382
 383   for ((i=0; i<${#errorConditions[@]};i++)); do
 384     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 385     [[ -n ${tmp} ]] && tmp+=" : "
 386     errorSummary+=${tmp}
 387   done
 388
 389   for ((i=0; i<${#warningConditions[@]};i++)); do
 390     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 391     [[ -n ${tmp} ]] && tmp+=" : "
 392     warningSummary+=${tmp}
 393   done
 394
 395   if [[ -n ${errorSummary} ]]; then
 396     echo "${errorSummary}"
 397     return 1
 398   fi
 399
 400   if [[ -n ${warningSummary} ]]; then
 401     echo "${warningSummary}"
 402     return 2
 403   fi
 404
 405   return 0
 406 }
 407
 408 parseConfig()
 409 {
 410   #config file
 411   configFile=""
 412   #where to search for qa files
 413   inputList=file.list
 414   #working directory
 415   workingDirectory="${PWD}"
 416   #where to place the final qa plots
 417   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 418   outputDirectory="${workingDirectory}/%DET"
 419   #filter out detector option
 420   excludeDetectors="EXAMPLE"
 421   #logs
 422   logDirectory=${workingDirectory}/logs
 423   #OCDB storage
 424   #ocdbStorage="raw://"
 425   #email to
 426   #MAILTO="fbellini@cern.ch"
 427
 428   #first, check if the config file is configured
 429   #is yes - source it so that other options can override it
 430   #if any
 431   for opt in $@; do
 432     if [[ ${opt} =~ configFile=.* ]]; then
 433       eval "${opt}"
 434       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 435       source "${configFile}"
 436       break
 437     fi
 438   done
 439
 440   #then, parse the options as they override the options from file
 441   while [[ -n ${1} ]]; do
 442     local var=${1#--}
 443     if [[ ${var} =~ .*=.* ]]; then
 444       eval "${var}"
 445     else
 446       echo "badly formatted option ${var}, should be: option=value, stopping..."
 447       return 1
 448     fi
 449     shift
 450   done
 451 }
 452
 453 guessRunData()
 454 {
 455   #guess the period from the path, pick the rightmost one
 456   period=""
 457   runNumber=""
 458   year=""
 459   pass=""
 460   legoTrainRunNumber=""
 461   dataType=""
 462
 463   local shortRunNumber=""
 464   local IFS="/"
 465   declare -a path=( $1 )
 466   local dirDepth=$(( ${#path[*]}-1 ))
 467   i=0
 468   for ((x=${dirDepth};x>=0;x--)); do
 469
 470     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 471     local field=${path[${x}]}
 472     local fieldNext=${path[$((x+1))]}
 473
 474     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 475     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 476     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
 477     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 478     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 479     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 480     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 481     (( i++ ))
 482   done
 483   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 484   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}
 485
 486   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 487   if [[ -z ${runNumber}} ]];
 488   then
 489     #error condition
 490     return 1
 491   else
 492     #ALL OK
 493     return 0
 494   fi
 495 }
 496
 497 substituteDetectorName()
 498 {
 499   local det=$1
 500   local dir=$2
 501   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 502   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 503 }
 504
 505 get_realpath()
 506 {
 507   if [[ -f "$1" ]]
 508   then
 509     # file *must* exist
 510     if cd "$(echo "${1%/*}")" &>/dev/null
 511     then
 512       # file *may* not be local
 513       # exception is ./file.ext
 514       # try 'cd .; cd -;' *works!*
 515       local tmppwd="$PWD"
 516       cd - &>/dev/null
 517     else
 518       # file *must* be local
 519       local tmppwd="$PWD"
 520     fi
 521   else
 522     # file *cannot* exist
 523     return 1 # failure
 524   fi
 525   # reassemble realpath
 526   echo "$tmppwd"/"${1##*/}"
 527   return 0 # success
 528 }
 529
 530 main $@