PWGPP/QA/scripts/runQA.sh

#!/bin/bash
   2 main()
   3 {
   4   if [[ -z $1 ]]; then
   5     echo "Usage: "
   6     echo "  ${0##*/} option=value [option=value]"
   7     echo "  at least inputList should be specified, or configFile containing it:"
   8     echo "  ${0##*/} inputList=file.list"
   9     echo "  options override config file (if any), e.g.:"
  10     echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
  11     return 1
  12   fi
  13
  14   if ! parseConfig "$@"; then
  15     ${0}
  16     return 1
  17   fi
  18
  19   [[ -z $ALICE_ROOT ]] && echo "ALICE_ROOT not defined" && return 1
  20
  21   ocdbregex='raw://'
  22   if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
  23     alien-token-init ${alienUserName}
  24     #this is a hack! alien-token init seems not enough
  25     #but the gclient_env script messes up the LD_LIBRARY_PATH
  26     while read x; do
  27       eval ${x};
  28     done < <(grep -v "LD_LIBRARY_PATH" /tmp/gclient_env_${UID})
  29   fi
  30
  31   updateQA "$@"
  32 }
  33
  34 updateQA()
  35 {
  36   umask 0002
  37   parseConfig "$@"
  38
  39   #be paranoid and make some full paths
  40   [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  41   inputList=$(get_realpath ${inputList})
  42   mkdir -p ${workingDirectory}
  43   workingDirectory=$(workingDirectory=${workingDirectory%/}; cd ${workingDirectory%/*}; echo "${PWD}/${workingDirectory##*/}")
  44   if [[ ! -d ${workingDirectory} ]]; then
  45     echo "working dir $workingDirectory does not exist and cannot be created"
  46     return 1
  47   fi
  48   cd ${workingDirectory}
  49
  50   echo JOB config:
  51   echo inputList=$inputList
  52   echo outputDirectory=$outputDirectory
  53   echo
  54
  55   dateString=$(date +%Y-%m-%d-%H-%M-%S-%N)
  56   echo "Start time QA process: $dateString"
  57
  58   #logging
  59   mkdir -p $logDirectory
  60   [[ ! -d $logDirectory ]] && echo "no log dir $logDirectory" && return 1
  61   logFile="$logDirectory/${0##*/}.${dateString}.log"
  62   touch ${logFile}
  63   [[ ! -f ${logFile} ]] && echo "cannot write logfile $logfile" && return 1
  64   echo "logFile = $logFile"
  65
  66   #check lock
  67   lockFile=${workingDirectory}/runQA.lock
  68   [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee ${logFile} && return 1
  69   touch ${lockFile}
  70   [[ ! -f ${lockFile} ]] && echo "cannot lock $lockFile" | tee ${logFile} && return 1
  71
  72   exec &>${logFile}
  73
  74   ################################################################
  75   #ze detector loop
  76   for detectorScript in $ALICE_ROOT/PWGPP/QA/detectorQAscripts/*; do
  77     echo
  78     echo "##############################################"
  79     echo $(date)
  80     unset planB
  81     [[ ! ${detectorScript} =~ .*\.sh$ ]] && continue
  82     detector=${detectorScript%.sh}
  83     detector=${detector##*/}
  84
  85     #skip if excluded
  86     if [[ "${excludeDetectors}" =~ ${detector} ]]; then
  87       echo "${detector} is excluded in config, skipping..."
  88       continue
  89     fi
  90
  91     #if includeDetectors set, only process thoe detectors specified there
  92     if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
  93       echo "${detector} not included in includeDetectors, skipping..."
  94       continue
  95     fi
  96
  97     logSummary=${logDirectory}/summary-${detector}-${dateString}.log
  98     outputDir=$(substituteDetectorName ${detector} ${outputDirectory})
  99     tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
 100     if ! mkdir -p ${tmpDetectorRunDir}; then
 101       echo "cannot create the temp dir $tmpDetectorRunDir"
 102       continue
 103     fi
 104     cd ${tmpDetectorRunDir}
 105
 106     tmpPrefix=${tmpDetectorRunDir}/${outputDir}
 107     echo "running QA for ${detector}"
 108     echo "  outputDir=$outputDir"
 109     echo "  tmpPrefix=$tmpPrefix"
 110
 111     #unset the detector functions from previous iterations (detectors)
 112     unset -f runLevelQA
 113     unset -f periodLevelQA
 114     unset -f runLevelHighPtTreeQA
 115     unset -f periodLevelHighPtTreeQA
 116     source ${detectorScript}
 117
 118     #################################################################
 119     #produce the QA and trending tree for each file (run)
 120     unset arrOfTouchedProductions
 121     declare -A arrOfTouchedProductions
 122     while read qaFile; do
 123       echo
 124       echo $(date)
 125
 126       #first check if input file exists
 127       [[ ! -f ${qaFile%\#*} ]] && echo "file ${qaFile%\#*} not accessible" && continue
 128
 129       if ! guessRunData ${qaFile}; then
 130         echo "could not guess run data from ${qaFile}"
 131         continue
 132       fi
 133       echo "anchorYear for ${originalPeriod} is: ${anchorYear}"
 134
 135       tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
 136       tmpRunDir=${tmpProductionDir}/000${runNumber}
 137       mkdir -p ${tmpRunDir}
 138       cd ${tmpRunDir}
 139
 140       #by default we expect to have everything in the same archive
 141       highPtTree=${qaFile}
 142
 143       #maybe the input is not an archive, but a file
 144       [[ "${qaFile}" =~ QAresults.root$ ]] && highPtTree=""
 145       [[ "${qaFile}" =~ FilterEvents_Trees.root$ ]] && qaFile=""
 146
 147       #it is possible we get the highPt trees from somewhere else
 148       #search the list of high pt trees for the proper run number
 149       if [[ -n ${inputListHighPtTrees} ]]; then
 150         highPtTree=$(egrep -m1 ${runNumber} ${inputListHighPtTrees})
 151         echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
 152       fi
 153
 154       echo qaFile=$qaFile
 155       echo highPtTree=$highPtTree
 156       echo ocdbStorage=${ocdbStorage}
 157       echo
 158
 159       #what if we have a zip archive?
 160       if [[ "$qaFile" =~ .*.zip$ ]]; then
 161         if unzip -l ${qaFile} | egrep "QAresults.root" &>/dev/null; then
 162           qaFile="${qaFile}#QAresults.root"
 163         else
 164           qaFile=""
 165         fi
 166       fi
 167       if [[ "$highPtTree" =~ .*.zip$ ]]; then
 168         if unzip -l ${highPtTree} | egrep "FilterEvents_Trees.root" &>/dev/null; then
 169           highPtTree="${highPtTree}#FilterEvents_Trees.root"
 170         else
 171           highPtTree=""
 172         fi
 173       fi
 174
 175       if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
 176         echo running ${detector} runLevelQA for run ${runNumber} from ${qaFile}
 177         runLevelQA "${qaFile}" &> runLevelQA.log
 178         #perform some default actions:
 179         #if trending.root not created, create a default one
 180         if [[ ! -f trending.root ]]; then
 181           aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" 2>&1 | tee -a runLevelQA.log
 182         fi
 183         if [[ -f trending.root ]]; then
 184           arrOfTouchedProductions[${tmpProductionDir}]=1
 185         else
 186           echo "trending.root not created"
 187         fi
 188       fi
 189       #expert QA based on high pt trees
 190       if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
 191         echo running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}
 192         runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
 193         arrOfTouchedProductions[${tmpProductionDir}]=1
 194       fi
 195
 196       cd ${tmpDetectorRunDir}
 197
 198     done < ${inputList}
 199
 200     #################################################################
 201     #cache which productions were (re)done
 202     echo "list of processed productions:"
 203     echo "    ${!arrOfTouchedProductions[@]}"
 204     echo
 205
 206     #################################################################
 207     #(re)do the merging/trending
 208     for tmpProductionDir in ${!arrOfTouchedProductions[@]}; do
 209       cd ${tmpProductionDir}
 210       echo
 211       echo "running period level stuff in ${tmpProductionDir}"
 212       echo $(date)
 213
 214       productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 215       echo productionDir=${outputDir}/${tmpProductionDir#${tmpPrefix}}
 216
 217       mkdir -p ${productionDir}
 218       if [[ ! -d ${productionDir} ]]; then
 219         echo "cannot make productionDir $productionDir" && continue
 220       fi
 221
 222       #move runs to final destination
 223       for dir in ${tmpProductionDir}/000*; do
 224         echo
 225         oldRunDir=${outputDir}/${dir#${tmpPrefix}}
 226         if ! guessRunData "${dir}/dummyName"; then
 227           echo "could not guess run data from ${dir}"
 228           continue
 229         fi
 230
 231         #before moving - VALIDATE!!!
 232         if ! validate ${dir}; then
 233           continue
 234         fi
 235
 236         #moving a dir is an atomic operation, no locking necessary
 237         if [[ -d ${oldRunDir} ]]; then
 238           echo "removing old ${oldRunDir}"
 239           rm -rf ${oldRunDir}
 240         fi
 241         echo "moving new ${runNumber} to ${productionDir}"
 242         mv -f ${dir} ${productionDir}
 243       done
 244
 245       #go to a temp dir to do the period level stuff in a completely clean dir
 246       tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 247       echo
 248       echo tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
 249       if ! mkdir -p ${tmpPeriodLevelQAdir}; then continue; fi
 250       cd ${tmpPeriodLevelQAdir}
 251
 252       #link the final list of per-run dirs here, just the dirs
 253       #to have a clean working directory
 254       unset linkedStuff
 255       declare -a linkedStuff
 256       for x in ${productionDir}/000*; do [[ -d $x ]] && ln -s $x && linkedStuff+=(${x##*/}); done
 257
 258       #merge trending files if any
 259       if /bin/ls 000*/trending.root &>/dev/null; then
 260         hadd trending.root 000*/trending.root &> periodLevelQA.log
 261       fi
 262
 263       #run the period level trending/QA
 264       if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
 265         echo running ${detector} periodLevelQA for production ${period}/${pass}
 266         periodLevelQA trending.root &>> periodLevelQA.log
 267       else
 268         echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
 269       fi
 270
 271       if ! validate ${PWD}; then continue; fi
 272
 273       #here we are validated so move the produced QA to the final place
 274       #clean up linked stuff first
 275       [[ -n ${linkedStuff[@]} ]] && rm ${linkedStuff[@]}
 276       periodLevelLock=${productionDir}/runQA.lock
 277       if [[ ! -f ${periodLevelLock} ]]; then
 278         #some of the output could be a directory, so handle that
 279         #TODO: maybe use rsync?
 280         #lock to avoid conflicts:
 281         echo "${HOSTNAME} ${dateString}" > ${periodLevelLock}
 282         for x in ${tmpPeriodLevelQAdir}/*; do
 283           if [[ -d ${x} ]]; then
 284             echo "removing ${productionDir}/${x##*/}"
 285             rm -rf ${productionDir}/${x##*/}
 286             echo "moving ${x} to ${productionDir}"
 287             mv ${x} ${productionDir}
 288           fi
 289           if [[ -f ${x} ]]; then
 290             echo "moving ${x} to ${productionDir}"
 291             mv -f ${x} ${productionDir}
 292           fi
 293         done
 294         rm -f ${periodLevelLock}
 295         #remove the temp dir
 296         rm -rf ${tmpPeriodLevelQAdir}
 297       else
 298         echo "ERROR: cannot move to destination"                     >> ${logSummary}
 299         echo "production dir ${productionDir} locked!"               >> ${logSummary}
 300         echo "check and maybe manually do:"                          >> ${logSummary}
 301         echo " rm ${periodLevelLock}"                                >> ${logSummary}
 302         echo " rsync -av ${tmpPeriodLevelQAdir}/ ${productionDir}/"  >> ${logSummary}
 303         planB=1
 304       fi
 305
 306     done
 307
 308     cd ${workingDirectory}
 309
 310     if [[ -z ${planB} ]]; then
 311       echo
 312       echo removing ${tmpDetectorRunDir}
 313       rm -rf ${tmpDetectorRunDir}
 314     else
 315       executePlanB
 316     fi
 317   done #end of detector loop
 318
 319   #remove lock
 320   rm -f ${lockFile}
 321 }
 322
 323 executePlanB()
 324 {
 325   #in case of emergency
 326   if [[ -n ${MAILTO} ]]; then
 327     echo
 328     echo "trouble detected, sending email to ${MAILTO}"
 329
 330     grep BAD ${logSummary} | mail -s "qa in need of assistance" ${MAILTO}
 331   fi
 332 }
 333
 334 validate()
 335 {
 336   summarizeLogs ${1} >> ${logSummary}
 337   logStatus=$?
 338   if [[ ${logStatus} -ne 0 ]]; then
 339     echo "WARNING not validated: ${1}"
 340     planB=1
 341     return 1
 342   fi
 343   return 0
 344 }
 345
 346 summarizeLogs()
 347 {
 348   local dir=$1
 349   [[ ! -d ${dir} ]] && dir=${PWD}
 350
 351   #print a summary of logs
 352   logFiles=(
 353       "*.log"
 354       "stdout"
 355       "stderr"
 356   )
 357
 358   #check logs
 359   local logstatus=0
 360   for log in ${dir}/${logFiles[*]}; do
 361     finallog=${PWD%/}/${log}
 362     [[ ! -f ${log} ]] && continue
 363     errorSummary=$(validateLog ${log})
 364     validationStatus=$?
 365     [[ validationStatus -ne 0 ]] && logstatus=1
 366     if [[ ${validationStatus} -eq 0 ]]; then
 367       #in pretend mode randomly report an error in rec.log some cases
 368       if [[ -n ${pretend} && "${log}" == "rec.log" ]]; then
 369         [[ $(( ${RANDOM}%2 )) -ge 1 ]] && echo "${finallog} BAD random error" || echo "${finallog} OK"
 370       else
 371         echo "${finallog} OK"
 372       fi
 373     elif [[ ${validationStatus} -eq 1 ]]; then
 374       echo "${finallog} BAD ${errorSummary}"
 375     elif [[ ${validationStatus} -eq 2 ]]; then
 376       echo "${finallog} OK MWAH ${errorSummary}"
 377     fi
 378   done
 379
 380   #report core files
 381   while read x; do
 382     echo ${x}
 383     chmod 644 ${x}
 384     gdb --batch --quiet -ex "bt" -ex "quit" aliroot ${x} > stacktrace_${x//\//_}.log
 385   done < <(/bin/ls ${PWD}/*/core 2>/dev/null; /bin/ls ${PWD}/core 2>/dev/null)
 386
 387   return ${logstatus}
 388 }
 389
 390 validateLog()
 391 {
 392   log=${1}
 393   errorConditions=(
 394             'There was a crash'
 395             'floating'
 396             'error while loading shared libraries'
 397             'std::bad_alloc'
 398             's_err_syswatch_'
 399             'Thread [0-9]* (Thread'
 400             'AliFatal'
 401             'core dumped'
 402             '\.C.*error:.*\.h: No such file'
 403             'segmentation'
 404             'Interpreter error recovered'
 405   )
 406
 407   warningConditions=(
 408             'This is serious'
 409   )
 410
 411   local logstatus=0
 412   local errorSummary=""
 413   local warningSummary=""
 414
 415   for ((i=0; i<${#errorConditions[@]};i++)); do
 416     local tmp=$(grep -m1 -e "${errorConditions[${i}]}" ${log})
 417     [[ -n ${tmp} ]] && tmp+=" : "
 418     errorSummary+=${tmp}
 419   done
 420
 421   for ((i=0; i<${#warningConditions[@]};i++)); do
 422     local tmp=$(grep -m1 -e "${warningConditions[${i}]}" ${log})
 423     [[ -n ${tmp} ]] && tmp+=" : "
 424     warningSummary+=${tmp}
 425   done
 426
 427   if [[ -n ${errorSummary} ]]; then
 428     echo "${errorSummary}"
 429     return 1
 430   fi
 431
 432   if [[ -n ${warningSummary} ]]; then
 433     echo "${warningSummary}"
 434     return 2
 435   fi
 436
 437   return 0
 438 }
 439
 440 parseConfig()
 441 {
 442   args=("$@")
 443
 444   #config file
 445   configFile=""
 446   #where to search for qa files
 447   inputList=file.list
 448   #working directory
 449   workingDirectory="${PWD}"
 450   #where to place the final qa plots
 451   #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
 452   outputDirectory="${workingDirectory}/%DET"
 453   #filter out detector option
 454   excludeDetectors="EXAMPLE"
 455   #logs
 456   logDirectory=${workingDirectory}/logs
 457   #OCDB storage
 458   ocdbStorage="raw://"
 459   #email to
 460   #MAILTO="fbellini@cern.ch"
 461
 462   #first, check if the config file is configured
 463   #is yes - source it so that other options can override it
 464   #if any
 465   for opt in "${args[@]}"; do
 466     if [[ ${opt} =~ configFile=.* ]]; then
 467       eval "${opt}"
 468       [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
 469       echo "using config file: ${configFile}"
 470       source "${configFile}"
 471       break
 472     fi
 473   done
 474
 475   #then, parse the options as they override the options from file
 476   for opt in "${args[@]}"; do
 477     if [[ ! "${opt}" =~ .*=.* ]]; then
 478       echo "badly formatted option ${var}, should be: option=value, stopping..."
 479       return 1
 480     fi
 481     local var="${opt%%=*}"
 482     local value="${opt#*=}"
 483     echo "${var}=${value}"
 484     export ${var}="${value}"
 485   done
 486 }
 487
 488 guessRunData()
 489 {
 490   #guess the period from the path, pick the rightmost one
 491   period=""
 492   runNumber=""
 493   year=""
 494   pass=""
 495   legoTrainRunNumber=""
 496   dataType=""
 497   originalPass=""
 498   originalPeriod=""
 499   anchorYear=""
 500
 501   shortRunNumber=""
 502   oldIFS=${IFS}
 503   local IFS="/"
 504   declare -a path=( $1 )
 505   IFS="${oldIFS}"
 506   local dirDepth=$(( ${#path[*]}-1 ))
 507   i=0
 508   for ((x=${dirDepth};x>=0;x--)); do
 509
 510     [[ $((x-1)) -ge 0 ]] && local fieldPrev=${path[$((x-1))]}
 511     local field=${path[${x}]}
 512     local fieldNext=${path[$((x+1))]}
 513
 514     [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
 515     [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
 516     [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*} && originalPeriod=${field}
 517     [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
 518     [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
 519     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 520     [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
 521     (( i++ ))
 522   done
 523   originalPass=${pass}
 524   [[ -n ${shortRunNumber} && "${legoTrainRunNumber}" =~ ${shortRunNumber} ]] && legoTrainRunNumber=""
 525   [[ -z ${legoTrainRunNumber} ]] && pass=${path[$((dirDepth-1))]}
 526   [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber} && originalPass="" #for MC not from lego, the runnumber is identified as lego train number, thus needs to be nulled
 527   [[ -n ${legoTrainRunNumber} ]] && pass+="_lego${legoTrainRunNumber}"
 528
 529   #modify the OCDB: set the year
 530   if [[ ${dataType} =~ sim ]]; then
 531     anchorYear=$(for x in $mcProductionMap ; do [[ "${x}" =~ ${originalPeriod} ]] && echo ${x} && break; done)
 532     anchorYear=${anchorYear#*=}
 533     ocdbStorage=$(setYear ${anchorYear} ${ocdbStorage})
 534   else
 535     ocdbStorage=$(setYear ${year} ${ocdbStorage})
 536   fi
 537
 538   #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber}} || -z ${pass} ]];
 539   if [[ -z ${runNumber}} ]]
 540   then
 541     #error condition
 542     return 1
 543   else
 544     #ALL OK
 545     return 0
 546   fi
 547 }
 548
 549 substituteDetectorName()
 550 {
 551   local det=$1
 552   local dir=$2
 553   [[ ${dir} =~ \%det ]] && det=${det,,} && echo ${dir/\%det/${det}}
 554   [[ ${dir} =~ \%DET ]] && det=${det} && echo ${dir/\%DET/${det}}
 555 }
 556
 557 get_realpath()
 558 {
 559   if [[ -f "$1" ]]
 560   then
 561     # file *must* exist
 562     if cd "$(echo "${1%/*}")" &>/dev/null
 563     then
 564       # file *may* not be local
 565       # exception is ./file.ext
 566       # try 'cd .; cd -;' *works!*
 567       local tmppwd="$PWD"
 568       cd - &>/dev/null
 569     else
 570       # file *must* be local
 571       local tmppwd="$PWD"
 572     fi
 573   else
 574     # file *cannot* exist
 575     return 1 # failure
 576   fi
 577   # reassemble realpath
 578   echo "$tmppwd"/"${1##*/}"
 579   return 0 # success
 580 }
 581
 582 setYear()
 583 {
 584   #set the year
 585   #  ${1} - year to be set
 586   #  ${2} - where to set the year
 587   local year1=$(guessYear ${1})
 588   local year2=$(guessYear ${2})
 589   local path=${2}
 590   [[ ${year1} -ne ${year2} && -n ${year2} && -n ${year1} ]] && path=${2/\/${year2}\//\/${year1}\/}
 591   echo ${path}
 592   return 0
 593 }
 594
 595 guessYear()
 596 {
 597   #guess the year from the path, pick the rightmost one
 598   local IFS="/"
 599   declare -a pathArray=( ${1} )
 600   local field
 601   local year
 602   for field in ${pathArray[@]}; do
 603     [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
 604   done
 605   echo ${year}
 606   return 0
 607 }
 608
 609 main "$@"