PWGPP/QA/scripts/runQA.sh

   1 #!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig $@; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init ${alienUserName}
  24     #this is a hack! alien-token init seems not enough
  25     #but the gclient_env script messes up the LD_LIBRARY_PATH
  26     while read x; do
  27       eval ${x};
  28     done < <(grep -v "LD_LIBRARY_PATH" /tmp/gclient_env_${UID})
  29   fi
  30
  31   updateQA $@
  32 }
  33
  34 updateQA()
  35 {
  36   umask 0002
  37   parseConfig $@
  38
  39   #be paranoid and make some full paths
  40   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  41   inputList=$(get_realpath ${inputList})
  42   mkdir -p ${workingDirectory}
  43   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  44   if [[ ! -d ${workingDirectory} ]]; then
  45     echo "working dir $workingDirectory does not exist and cannot be created"
  46     return 1
  47   fi
  48   cd ${workingDirectory}
  49
  50   echo JOB config:
  51   echo inputList=$inputList
  52   echo outputDirectory=$outputDirectory
  53   echo
  54
  55   dateString=$(date +%Y-%m-%d-%H-%M)
  56   echo "Start time QA process: $dateString"
  57
  58   #logging
  59   mkdir -p $logDirectory
  60   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  61   logFile="$logDirectory/${0##*/}.${dateString}.log"
  62   touch ${logFile}
  63   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  64   echo "logFile = $logFile"
  65
  66   #check lock
  67   lockFile=${logDirectory}/runQA.lock
  68   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee ${logFile} && return 1
  69   touch ${lockFile}
  70   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" | tee ${logFile} && return 1
  71
  72   exec &>${logFile}
  73
  74   ################################################################
  75   #ze detector loop
  76   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  77     echo
  78     echo "##############################################"
  79     unset planB
  80     [[ ! ${detectorScript} =~ .*\.sh$ ]] && continue
  81     detector=${detectorScript%.sh}
  82     detector=${detector##*/}
  83
  84     #skip if excluded
  85     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  86       echo "${detector} is excluded in config, skipping..."
  87       continue
  88     fi
  89
  90     #if includeDetectors set, only process thoe detectors specified there
  91     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  92       echo "${detector} not included in includeDetectors, skipping..."
  93       continue
  94     fi
  95
  96     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  97     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  98     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
  99     if ! mkdir -p ${tmpDetectorRunDir}; then
 100       echo "cannot create the temp dir $tmpDetectorRunDir"
 101       continue
 102     fi
 103     cd ${tmpDetectorRunDir}
 104
 105     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
 106     echo "running QA for ${detector}"
 107     echo "  outputDir=$outputDir"
 108     echo "  tmpPrefix=$tmpPrefix"
 109
 110     unset -f runLevelQA
 111     unset -f periodLevelQA
 112     unset -f runLevelHighPtTreeQA
 113     unset -f periodLevelHighPtTreeQA
 114     source ${detectorScript}
 115
 116     #################################################################
 117     #produce the QA and trending tree for each file (run)
 118     unset arrOfTouchedProductions
 119     declare -A arrOfTouchedProductions
 120     while read qaFile; do
 121       echo
 122
 123       #first check if input file exists
 124       [[ ! -f ${qaFile%\#*} ]] && echo "file ${qaFile%\#*} not accessible" && continue
 125
 126       if ! guessRunData ${qaFile}; then
 127         echo "could not guess run data from ${qaFile}"
 128         continue
 129       fi
 130
 131       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 132       tmpRunDir=${tmpProductionDir}/000${runNumber}
 133       mkdir -p ${tmpRunDir}
 134       cd ${tmpRunDir}
 135
 136       #by default we expect to have everything in the same archive
 137       highPtTree=${qaFile}
 138
 139       #maybe the input is not an archive, but a file
 140       [[ "${qaFile}" =~ QAresults.root$ ]] && highPtTree=""
 141       [[ "${qaFile}" =~ FilterEvents_Trees.root$ ]] && qaFile=""
 142
 143       #it is possible we get the highPt trees from somewhere else
 144       #search the list of high pt trees for the proper run number
 145       if [[ -n ${inputListHighPtTrees} ]]; then
 146         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 147         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 148       fi
 149
 150       echo qaFile=$qaFile
 151       echo highPtTree=$highPtTree
 152
 153       #what if we have a zip archive?
 154       if [[ "$qaFile" =~ .*.zip$ ]]; then
 155         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 156           qaFile="${qaFile}#QAresults.root"
 157         else
 158           qaFile=""
 159         fi
 160       fi
 161       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 162         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 163           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 164         else
 165           highPtTree=""
 166         fi
 167       fi
 168
 169       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 170         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 171         runLevelQA "${qaFile}" &> runLevelQA.log
 172         #perform some default actions:
 173         #if trending.root not created, create a default one
 174         if [[ ! -f trending.root ]]; then
 175           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" 2>&1 | tee -a runLevelQA.log
 176         fi
 177         if [[ -f trending.root ]]; then
 178           arrOfTouchedProductions[${tmpProductionDir}]=1
 179         else
 180           echo "trending.root not created"
 181         fi
 182       fi
 183       #expert QA based on high pt trees
 184       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 185         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 186         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 187         arrOfTouchedProductions[${tmpProductionDir}]=1
 188       fi
 189
 190       cd ${tmpDetectorRunDir}
 191
 192     done < ${inputList}
 193
 194     #################################################################
 195     #cache which productions were (re)done
 196     echo "list of processed productions:"
 197     echo "    ${!arrOfTouchedProductions[@]}"
 198     echo
 199
 200     #################################################################
 201     #(re)do the merging/trending
 202     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 203       cd ${tmpProductionDir}
 204       echo
 205       echo "running period level stuff in ${tmpProductionDir}"
 206
 207       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 208       echo productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 209
 210       mkdir -p ${productionDir}
 211       if [[ ! -d ${productionDir} ]]; then
 212         echo "cannot make productionDir $productionDir" && continue
 213       fi
 214
 215       #move runs to final destination
 216       for dir in ${tmpProductionDir}/000*; do
 217         echo
 218         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 219         if ! guessRunData "${dir}/dummyName"; then
 220           echo "could not guess run data from ${dir}"
 221           continue
 222         fi
 223
 224         #before moving - VALIDATE!!!
 225         if ! validate ${dir}; then
 226           continue
 227         fi
 228
 229         if [[ -d ${oldRunDir} ]]; then
 230           echo "removing old ${oldRunDir}"
 231           rm -rf ${oldRunDir}
 232         fi
 233         echo "moving new ${runNumber} to ${productionDir}"
 234         mv -f ${dir} ${productionDir}
 235       done
 236
 237       #go to a temp dir to do the period level stuff in a completely clean dir
 238       tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 239       echo
 240       echo tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 241       if ! mkdir -p ${tmpPeriodLevelQAdir}; then continue; fi
 242       cd ${tmpPeriodLevelQAdir}
 243
 244       #link the final list of per-run dirs here, just the dirs
 245       #to have a clean working directory
 246       unset linkedStuff
 247       declare -a linkedStuff
 248       for x in ${productionDir}/000*; do [[ -d $x ]] && ln -s $x && linkedStuff+=(${x##*/}); done
 249
 250       #merge trending files if any
 251       if /bin/ls 000*/trending.root &>/dev/null; then
 252         hadd trending.root 000*/trending.root &> periodLevelQA.log
 253       fi
 254
 255       #run the period level trending/QA
 256       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 257         echo running ${detector} periodLevelQA for production ${period}/${pass}
 258         periodLevelQA trending.root &>> periodLevelQA.log
 259       else
 260         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 261       fi
 262
 263       if ! validate ${PWD}; then continue; fi
 264
 265       #here we are validated so move the produced QA to the final place
 266       #clean up linked stuff first
 267       [[ -n ${linkedStuff[@]} ]] && rm ${linkedStuff[@]}
 268       #some of the output could be a directory, so handle that
 269       #TODO: maybe use rsync?
 270       for x in ${tmpPeriodLevelQAdir}/*; do
 271         if [[ -d ${x} ]]; then
 272           echo "removing ${productionDir}/${x##*/}"
 273           rm -rf ${productionDir}/${x##*/}
 274           echo "moving ${x} to ${productionDir}"
 275           mv ${x} ${productionDir}
 276         fi
 277         if [[ -f ${x} ]]; then
 278           echo "moving ${x} to ${productionDir}"
 279           mv -f ${x} ${productionDir}
 280         fi
 281       done
 282
 283       #remove the temp dir
 284       rm -rf ${tmpPeriodLevelQAdir}
 285
 286     done
 287
 288     cd ${workingDirectory}
 289
 290     if [[ -z ${planB} ]]; then
 291       echo
 292       echo removing ${tmpDetectorRunDir}
 293       rm -rf ${tmpDetectorRunDir}
 294     else
 295       executePlanB
 296     fi
 297   done #end of detector loop
 298
 299   #remove lock
 300   rm -f ${lockFile}
 301 }
 302
 303 executePlanB()
 304 {
 305   #in case of emergency
 306   if [[ -n ${MAILTO} ]]; then
 307     echo
 308     echo "trouble detected, sending email to ${MAILTO}"
 309
 310     cat ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 311   fi
 312 }
 313
 314 validate()
 315 {
 316   summarizeLogs ${1} >> ${logSummary}
 317   logStatus=$?
 318   if [[ ${logStatus} -ne 0 ]]; then
 319     echo "WARNING not validated: ${1}"
 320     planB=1
 321     return 1
 322   fi
 323   return 0
 324 }
 325
 326 summarizeLogs()
 327 {
 328   local dir=$1
 329   [[ ! -d ${dir} ]] && dir=${PWD}
 330
 331   #print a summary of logs
 332   logFiles=(
 333       "*.log"
 334       "stdout"
 335       "stderr"
 336   )
 337
 338   #check logs
 339   local logstatus=0
 340   for log in ${dir}/${logFiles[*]}; do
 341     finallog=${PWD%/}/${log}
 342     [[ ! -f ${log} ]] && continue
 343     errorSummary=$(validateLog ${log})
 344     validationStatus=$?
 345     [[ validationStatus -ne 0 ]] && logstatus=1
 346     if [[ ${validationStatus} -eq 0 ]]; then
 347       #in pretend mode randomly report an error in rec.log some cases
 348       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 349         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 350       else
 351         echo "${finallog} OK"
 352       fi
 353     elif [[ ${validationStatus} -eq 1 ]]; then
 354       echo "${finallog} BAD ${errorSummary}"
 355     elif [[ ${validationStatus} -eq 2 ]]; then
 356       echo "${finallog} OK MWAH ${errorSummary}"
 357     fi
 358   done
 359
 360   #report core files
 361   while read x; do
 362     echo ${x}
 363     chmod 644 ${x}
 364     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 365   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 366
 367   return ${logstatus}
 368 }
 369
 370 validateLog()
 371 {
 372   log=${1}
 373   errorConditions=(
 374             'There was a crash'
 375             'floating'
 376             'error while loading shared libraries'
 377             'std::bad_alloc'
 378             's_err_syswatch_'
 379             'Thread [0-9]* (Thread'
 380             'AliFatal'
 381             'core dumped'
 382             '\.C.*error:.*\.h: No such file'
 383             'segmentation'
 384             'Interpreter error recovered'
 385   )
 386
 387   warningConditions=(
 388             'This is serious'
 389   )
 390
 391   local logstatus=0
 392   local errorSummary=""
 393   local warningSummary=""
 394
 395   for ((i=0; i<${#errorConditions[@]};i++)); do
 396     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 397     [[ -n ${tmp} ]] && tmp+=" : "
 398     errorSummary+=${tmp}
 399   done
 400
 401   for ((i=0; i<${#warningConditions[@]};i++)); do
 402     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 403     [[ -n ${tmp} ]] && tmp+=" : "
 404     warningSummary+=${tmp}
 405   done
 406
 407   if [[ -n ${errorSummary} ]]; then
 408     echo "${errorSummary}"
 409     return 1
 410   fi
 411
 412   if [[ -n ${warningSummary} ]]; then
 413     echo "${warningSummary}"
 414     return 2
 415   fi
 416
 417   return 0
 418 }
 419
 420 parseConfig()
 421 {
 422   #config file
 423   configFile=""
 424   #where to search for qa files
 425   inputList=file.list
 426   #working directory
 427   workingDirectory="${PWD}"
 428   #where to place the final qa plots
 429   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 430   outputDirectory="${workingDirectory}/%DET"
 431   #filter out detector option
 432   excludeDetectors="EXAMPLE"
 433   #logs
 434   logDirectory=${workingDirectory}/logs
 435   #OCDB storage
 436   #ocdbStorage="raw://"
 437   #email to
 438   #MAILTO="fbellini@cern.ch"
 439
 440   #first, check if the config file is configured
 441   #is yes - source it so that other options can override it
 442   #if any
 443   for opt in $@; do
 444     if [[ ${opt} =~ configFile=.* ]]; then
 445       eval "${opt}"
 446       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 447       source "${configFile}"
 448       break
 449     fi
 450   done
 451
 452   #then, parse the options as they override the options from file
 453   while [[ -n ${1} ]]; do
 454     local var=${1#--}
 455     if [[ ${var} =~ .*=.* ]]; then
 456       eval "${var}"
 457     else
 458       echo "badly formatted option ${var}, should be: option=value, stopping..."
 459       return 1
 460     fi
 461     shift
 462   done
 463 }
 464
 465 guessRunData()
 466 {
 467   #guess the period from the path, pick the rightmost one
 468   period=""
 469   runNumber=""
 470   year=""
 471   pass=""
 472   legoTrainRunNumber=""
 473   dataType=""
 474
 475   local shortRunNumber=""
 476   local IFS="/"
 477   declare -a path=( $1 )
 478   local dirDepth=$(( ${#path[*]}-1 ))
 479   i=0
 480   for ((x=${dirDepth};x>=0;x--)); do
 481
 482     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 483     local field=${path[${x}]}
 484     local fieldNext=${path[$((x+1))]}
 485
 486     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 487     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 488     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
 489     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 490     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 491     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 492     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 493     (( i++ ))
 494   done
 495   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 496   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}
 497
 498   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 499   if [[ -z ${runNumber}} ]];
 500   then
 501     #error condition
 502     return 1
 503   else
 504     #ALL OK
 505     return 0
 506   fi
 507 }
 508
 509 substituteDetectorName()
 510 {
 511   local det=$1
 512   local dir=$2
 513   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 514   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 515 }
 516
 517 get_realpath()
 518 {
 519   if [[ -f "$1" ]]
 520   then
 521     # file *must* exist
 522     if cd "$(echo "${1%/*}")" &>/dev/null
 523     then
 524       # file *may* not be local
 525       # exception is ./file.ext
 526       # try 'cd .; cd -;' *works!*
 527       local tmppwd="$PWD"
 528       cd - &>/dev/null
 529     else
 530       # file *must* be local
 531       local tmppwd="$PWD"
 532     fi
 533   else
 534     # file *cannot* exist
 535     return 1 # failure
 536   fi
 537   # reassemble realpath
 538   echo "$tmppwd"/"${1##*/}"
 539   return 0 # success
 540 }
 541
 542 main $@