]> git.uio.no Git - u/mrichter/AliRoot.git/blob - PWGPP/QA/scripts/runQA.sh
additional safety mechanism: locking to protect against parallel trouble
[u/mrichter/AliRoot.git] / PWGPP / QA / scripts / runQA.sh
1 #!/bin/bash
#######################################
# Script entry point: print usage when called without arguments, otherwise
# parse the configuration, make sure the environment is usable and start
# the QA update.
# Globals read:  ALICE_ROOT, ocdbStorage, alienUserName, UID
# Arguments:     option=value pairs (see usage text)
# Outputs:       usage / error text on stdout
# Returns:       1 on usage or configuration errors, otherwise the status
#                of updateQA
#######################################
main()
{
  if [[ -z $1 ]]; then
    echo "Usage: "
    echo "  ${0##*/} option=value [option=value]"
    echo "  at least inputList should be specified, or configFile containing it:"
    echo "  ${0##*/} inputList=file.list"
    echo "  options override config file (if any), e.g.:"
    echo "  ${0##*/} configFile=runQA.config inputList=file.list outputDirectory=%det"
    return 1
  fi

  # "$@" must stay quoted so that option values containing spaces survive
  if ! parseConfig "$@"; then
    # print the usage text (same output as running the script without
    # arguments, but without spawning a second process)
    main
    return 1
  fi

  [[ -z ${ALICE_ROOT} ]] && echo "ALICE_ROOT not defined" && return 1

  local ocdbregex='raw://'
  if [[ ${ocdbStorage} =~ ${ocdbregex} ]]; then
    # deliberately unquoted: an empty alienUserName must yield no argument
    alien-token-init ${alienUserName}
    #this is a hack! alien-token init seems not enough
    #but the gclient_env script messes up the LD_LIBRARY_PATH
    local envLine
    while read -r envLine; do
      # NOTE(review): evaluates the token environment file line by line;
      # the file is assumed to be trusted
      eval "${envLine}"
    done < <(grep -v "LD_LIBRARY_PATH" "/tmp/gclient_env_${UID}")
  fi

  updateQA "$@"
}
33
#######################################
# Run the run-level and period-level QA of every detector script found in
# $ALICE_ROOT/PWGPP/QA/detectorQAscripts/ for all files of the input list,
# validate the result and move it to the output directory.
# Globals read:  ALICE_ROOT, HOSTNAME and the options set by parseConfig
#                (inputList, workingDirectory, outputDirectory, logDirectory,
#                excludeDetectors, includeDetectors, inputListHighPtTrees)
# Globals set:   inputList, workingDirectory, dateString, logFile, lockFile,
#                detector, outputDir, logSummary, tmp* dirs, planB (unset per
#                detector; set by validate), plus whatever guessRunData sets
# Arguments:     option=value pairs, forwarded to parseConfig
# Returns:       1 when the input list, working dir, log dir, log file or
#                lock cannot be used
# Note: once the lock is taken, stdout and stderr of the current shell are
# redirected to the log file for the rest of the run.
#######################################
updateQA()
{
  umask 0002
  parseConfig "$@"

  #be paranoid and make some full paths
  [[ ! -f ${inputList} ]] && echo "no input list: ${inputList}" && return 1
  inputList=$(get_realpath "${inputList}")
  mkdir -p "${workingDirectory}"
  # resolve to an absolute path; yields an empty string if the dir is unusable
  workingDirectory=$(cd "${workingDirectory}" 2>/dev/null && pwd)
  if [[ ! -d ${workingDirectory} ]]; then
    echo "working dir $workingDirectory does not exist and cannot be created"
    return 1
  fi
  cd "${workingDirectory}" || return 1

  echo JOB config:
  echo "inputList=$inputList"
  echo "outputDirectory=$outputDirectory"
  echo

  dateString=$(date +%Y-%m-%d-%H-%M)
  echo "Start time QA process: $dateString"

  #logging
  mkdir -p "${logDirectory}"
  [[ ! -d ${logDirectory} ]] && echo "no log dir $logDirectory" && return 1
  logFile="${logDirectory}/${0##*/}.${dateString}.log"
  touch "${logFile}"
  [[ ! -f ${logFile} ]] && echo "cannot write logfile $logFile" && return 1
  echo "logFile = $logFile"

  #check lock
  lockFile=${logDirectory}/runQA.lock
  [[ -f ${lockFile} ]] && echo "lock ${lockFile} exists!" | tee "${logFile}" && return 1
  # noclobber makes the creation fail if somebody else created the lock
  # between the check above and now
  if ! ( set -o noclobber; : > "${lockFile}" ) 2>/dev/null; then
    echo "cannot lock $lockFile" | tee "${logFile}"
    return 1
  fi

  # from here on everything goes to the log file
  exec &> "${logFile}"

  ################################################################
  #ze detector loop
  for detectorScript in "${ALICE_ROOT}"/PWGPP/QA/detectorQAscripts/*; do
    echo
    echo "##############################################"
    date
    unset planB
    [[ ${detectorScript} == *.sh ]] || continue
    detector=${detectorScript%.sh}
    detector=${detector##*/}

    #skip if excluded (regex/substring match against the config value)
    if [[ "${excludeDetectors}" =~ ${detector} ]]; then
      echo "${detector} is excluded in config, skipping..."
      continue
    fi

    #if includeDetectors set, only process those detectors specified there
    if [[ -n ${includeDetectors} && ! "${includeDetectors}" =~ ${detector} ]]; then
      echo "${detector} not included in includeDetectors, skipping..."
      continue
    fi

    logSummary=${logDirectory}/summary-${detector}-${dateString}.log
    outputDir=$(substituteDetectorName "${detector}" "${outputDirectory}")
    # never continue with an empty output dir: the paths derived from it are
    # later passed to rm -rf
    [[ -z ${outputDir} ]] && outputDir=${outputDirectory}
    tmpDetectorRunDir=${workingDirectory}/tmpQAtmpRunDir${detector}-${dateString}
    if ! mkdir -p "${tmpDetectorRunDir}"; then
      echo "cannot create the temp dir $tmpDetectorRunDir"
      continue
    fi
    cd "${tmpDetectorRunDir}" || continue

    tmpPrefix=${tmpDetectorRunDir}/${outputDir}
    echo "running QA for ${detector}"
    echo "  outputDir=$outputDir"
    echo "  tmpPrefix=$tmpPrefix"

    #unset the detector functions from previous iterations (detectors)
    unset -f runLevelQA
    unset -f periodLevelQA
    unset -f runLevelHighPtTreeQA
    unset -f periodLevelHighPtTreeQA
    source "${detectorScript}"

    #################################################################
    #produce the QA and trending tree for each file (run)
    unset arrOfTouchedProductions
    declare -A arrOfTouchedProductions
    while read -r qaFile; do
      echo
      date

      #first check if input file exists (strip an optional "#member" suffix)
      [[ ! -f ${qaFile%\#*} ]] && echo "file ${qaFile%\#*} not accessible" && continue

      if ! guessRunData "${qaFile}"; then
        echo "could not guess run data from ${qaFile}"
        continue
      fi

      tmpProductionDir=${tmpPrefix}/${dataType}/${year}/${period}/${pass}
      tmpRunDir=${tmpProductionDir}/000${runNumber}
      if ! mkdir -p "${tmpRunDir}" || ! cd "${tmpRunDir}"; then
        echo "cannot create the temp run dir ${tmpRunDir}"
        continue
      fi

      #by default we expect to have everything in the same archive
      highPtTree=${qaFile}

      #maybe the input is not an archive, but a file
      [[ "${qaFile}" =~ QAresults.root$ ]] && highPtTree=""
      [[ "${qaFile}" =~ FilterEvents_Trees.root$ ]] && qaFile=""

      #it is possible we get the highPt trees from somewhere else
      #search the list of high pt trees for the proper run number
      if [[ -n ${inputListHighPtTrees} ]]; then
        highPtTree=$(grep -E -m1 -- "${runNumber}" "${inputListHighPtTrees}")
        echo "loaded the highPtTree ${highPtTree} from external file ${inputListHighPtTrees}"
      fi

      echo "qaFile=$qaFile"
      echo "highPtTree=$highPtTree"

      #what if we have a zip archive?
      if [[ "${qaFile}" =~ \.zip$ ]]; then
        if unzip -l "${qaFile}" | grep -E "QAresults.root" &>/dev/null; then
          qaFile="${qaFile}#QAresults.root"
        else
          qaFile=""
        fi
      fi
      if [[ "${highPtTree}" =~ \.zip$ ]]; then
        if unzip -l "${highPtTree}" | grep -E "FilterEvents_Trees.root" &>/dev/null; then
          highPtTree="${highPtTree}#FilterEvents_Trees.root"
        else
          highPtTree=""
        fi
      fi

      if [[ -n ${qaFile} && $(type -t runLevelQA) =~ "function" ]]; then
        echo "running ${detector} runLevelQA for run ${runNumber} from ${qaFile}"
        runLevelQA "${qaFile}" &> runLevelQA.log
        #perform some default actions:
        #if trending.root not created, create a default one
        if [[ ! -f trending.root ]]; then
          aliroot -b -q -l "$ALICE_ROOT/PWGPP/macros/simpleTrending.C(\"${qaFile}\",${runNumber},\"${detector}\",\"trending.root\",\"trending\",\"recreate\")" 2>&1 | tee -a runLevelQA.log
        fi
        if [[ -f trending.root ]]; then
          arrOfTouchedProductions[${tmpProductionDir}]=1
        else
          echo "trending.root not created"
        fi
      fi
      #expert QA based on high pt trees
      if [[ -n ${highPtTree} && $(type -t runLevelHighPtTreeQA) =~ "function" ]]; then
        echo "running ${detector} runLevelHighPtTreeQA for run ${runNumber} from ${highPtTree}"
        runLevelHighPtTreeQA "${highPtTree}" &> runLevelHighPtTreeQA.log
        arrOfTouchedProductions[${tmpProductionDir}]=1
      fi

      cd "${tmpDetectorRunDir}"

    done < "${inputList}"

    #################################################################
    #cache which productions were (re)done
    echo "list of processed productions:"
    echo "    ${!arrOfTouchedProductions[*]}"
    echo

    #################################################################
    #(re)do the merging/trending
    for tmpProductionDir in "${!arrOfTouchedProductions[@]}"; do
      cd "${tmpProductionDir}" || continue
      echo
      echo "running period level stuff in ${tmpProductionDir}"
      date

      # the prefix is quoted so it is stripped literally, not as a glob
      productionDir=${outputDir}/${tmpProductionDir#"${tmpPrefix}"}
      echo "productionDir=${productionDir}"

      mkdir -p "${productionDir}"
      if [[ ! -d ${productionDir} ]]; then
        echo "cannot make productionDir $productionDir" && continue
      fi

      #move runs to final destination
      for dir in "${tmpProductionDir}"/000*; do
        # an unmatched glob stays literal; skip it
        [[ -d ${dir} ]] || continue
        echo
        oldRunDir=${outputDir}/${dir#"${tmpPrefix}"}
        if ! guessRunData "${dir}/dummyName"; then
          echo "could not guess run data from ${dir}"
          continue
        fi

        #before moving - VALIDATE!!!
        if ! validate "${dir}"; then
          continue
        fi

        #moving a dir is an atomic operation, no locking necessary
        if [[ -d ${oldRunDir} ]]; then
          echo "removing old ${oldRunDir}"
          rm -rf -- "${oldRunDir}"
        fi
        echo "moving new ${runNumber} to ${productionDir}"
        mv -f -- "${dir}" "${productionDir}"
      done

      #go to a temp dir to do the period level stuff in a completely clean dir
      tmpPeriodLevelQAdir="${tmpProductionDir}/periodLevelQA"
      echo
      echo "tmpPeriodLevelQAdir=${tmpPeriodLevelQAdir}"
      if ! mkdir -p "${tmpPeriodLevelQAdir}"; then continue; fi
      cd "${tmpPeriodLevelQAdir}" || continue

      #link the final list of per-run dirs here, just the dirs
      #to have a clean working directory
      unset linkedStuff
      declare -a linkedStuff
      for x in "${productionDir}"/000*; do
        [[ -d ${x} ]] && ln -s "${x}" && linkedStuff+=("${x##*/}")
      done

      #merge trending files if any
      if /bin/ls 000*/trending.root &>/dev/null; then
        hadd trending.root 000*/trending.root &> periodLevelQA.log
      fi

      #run the period level trending/QA
      if [[ -f "trending.root" && $(type -t periodLevelQA) =~ "function" ]]; then
        echo "running ${detector} periodLevelQA for production ${period}/${pass}"
        periodLevelQA trending.root &>> periodLevelQA.log
      else
        echo "WARNING: not running ${detector} periodLevelQA for production ${period}/${pass}, no trending.root"
      fi

      if ! validate "${PWD}"; then continue; fi

      #here we are validated so move the produced QA to the final place
      #clean up linked stuff first
      (( ${#linkedStuff[@]} > 0 )) && rm -- "${linkedStuff[@]}"
      periodLevelLock=${productionDir}/runQA.lock
      if [[ ! -f ${periodLevelLock} ]]; then
        #some of the output could be a directory, so handle that
        #TODO: maybe use rsync?
        #lock to avoid conflicts:
        echo "${HOSTNAME} ${dateString}" > "${periodLevelLock}"
        for x in "${tmpPeriodLevelQAdir}"/*; do
          if [[ -d ${x} ]]; then
            echo "removing ${productionDir}/${x##*/}"
            rm -rf -- "${productionDir}/${x##*/}"
            echo "moving ${x} to ${productionDir}"
            mv -- "${x}" "${productionDir}"
          fi
          if [[ -f ${x} ]]; then
            echo "moving ${x} to ${productionDir}"
            mv -f -- "${x}" "${productionDir}"
          fi
        done
        rm -f -- "${periodLevelLock}"
        #remove the temp dir
        rm -rf -- "${tmpPeriodLevelQAdir}"
      else
        echo "locked! cannot move to destination"
        echo "check and maybe manually do:"
        echo " rm ${periodLevelLock}"
        echo " rsync -av ${tmpPeriodLevelQAdir}/ ${productionDir}/"
      fi

    done

    cd "${workingDirectory}"

    # planB is set by validate when something did not validate; in that case
    # the temp dir is kept for inspection
    if [[ -z ${planB} ]]; then
      echo
      echo "removing ${tmpDetectorRunDir}"
      rm -rf -- "${tmpDetectorRunDir}"
    else
      executePlanB
    fi
  done #end of detector loop

  #remove lock
  rm -f -- "${lockFile}"
}
317
#######################################
# Emergency notification: if MAILTO is set, mail all lines containing "BAD"
# from the current log summary to that address.
# Globals read:  MAILTO, logSummary
# Outputs:       a short notice on stdout when a mail is sent
#######################################
executePlanB()
{
  #in case of emergency
  if [[ -n ${MAILTO} ]]; then
    echo
    echo "trouble detected, sending email to ${MAILTO}"

    # quoted so that a summary path containing spaces is passed as one word
    grep BAD "${logSummary}" | mail -s "qa in need of assistance" "${MAILTO}"
  fi
}
328
#######################################
# Validate a directory: append the log summary of the directory to the
# current summary file and flag trouble if the summary reports a problem.
# Globals read:  logSummary
# Globals set:   logStatus (status of summarizeLogs), planB (set to 1 on
#                failure)
# Arguments:     $1 - directory to validate
# Outputs:       a warning on stdout when the directory is not validated
# Returns:       0 when validated, 1 otherwise
#######################################
validate()
{
  # the redirect target is quoted: an unquoted path with spaces would be an
  # "ambiguous redirect"
  summarizeLogs "${1}" >> "${logSummary}"
  logStatus=$?
  if [[ ${logStatus} -ne 0 ]]; then
    echo "WARNING not validated: ${1}"
    planB=1
    return 1
  fi
  return 0
}
340
#######################################
# Print one status line per log file found in a directory and dump a stack
# trace for every core file found under the current directory.
# Checked files: <dir>/*.log, <dir>/stdout, <dir>/stderr.
# Globals read:  pretend, PWD, RANDOM
# Arguments:     $1 - directory to inspect (falls back to $PWD when it is
#                     not a directory)
# Outputs:       "<log> OK", "<log> BAD <summary>" or "<log> OK MWAH <summary>"
#                per log file, followed by the paths of any core files
# Returns:       0 if validateLog returned 0 for every log, 1 otherwise
#                (a warning status also counts as not validated)
#######################################
summarizeLogs()
{
  local dir=$1
  [[ ! -d ${dir} ]] && dir=${PWD}
  # report absolute paths, whatever the caller passed in
  [[ ${dir} != /* ]] && dir="${PWD%/}/${dir}"

  #print a summary of logs
  local -a logFiles=(
      "*.log"
      "stdout"
      "stderr"
  )

  #check logs
  local logstatus=0
  local pattern log errorSummary validationStatus
  for pattern in "${logFiles[@]}"; do
    # every pattern is looked up inside ${dir}; the pattern itself is left
    # unquoted on purpose so that "*.log" is expanded
    for log in "${dir}"/${pattern}; do
      [[ ! -f ${log} ]] && continue
      errorSummary=$(validateLog "${log}")
      validationStatus=$?
      [[ ${validationStatus} -ne 0 ]] && logstatus=1
      case ${validationStatus} in
        0)
          #in pretend mode randomly report an error in rec.log some cases
          if [[ -n ${pretend} && "${log##*/}" == "rec.log" ]] && (( RANDOM % 2 )); then
            echo "${log} BAD random error"
          else
            echo "${log} OK"
          fi
          ;;
        1)
          echo "${log} BAD ${errorSummary}"
          ;;
        2)
          echo "${log} OK MWAH ${errorSummary}"
          ;;
      esac
    done
  done

  #report core files (searched in the current dir and one level below)
  local core
  for core in "${PWD}"/*/core "${PWD}"/core; do
    # unmatched globs stay literal; skip anything that is not a file
    [[ -f ${core} ]] || continue
    echo "${core}"
    chmod 644 "${core}"
    gdb --batch --quiet -ex "bt" -ex "quit" aliroot "${core}" > "stacktrace_${core//\//_}.log"
  done

  return ${logstatus}
}
384
#######################################
# Scan a log file for known error and warning patterns.
# Arguments:     $1 - log file to scan
# Outputs:       the first matching line of every pattern that matched, each
#                followed by " : " (errors take precedence over warnings)
# Returns:       0 - nothing found, 1 - error pattern found,
#                2 - only warning pattern(s) found
#######################################
validateLog()
{
  local log=${1}
  # grep basic regular expressions; any match marks the log as bad
  local -a errorConditions=(
            'There was a crash'
            'floating'
            'error while loading shared libraries'
            'std::bad_alloc'
            's_err_syswatch_'
            'Thread [0-9]* (Thread'
            'AliFatal'
            'core dumped'
            '\.C.*error:.*\.h: No such file'
            'segmentation'
            'Interpreter error recovered'
  )

  local -a warningConditions=(
            'This is serious'
  )

  local errorSummary=""
  local warningSummary=""
  local pattern match

  for pattern in "${errorConditions[@]}"; do
    # only the first matching line per pattern is reported
    match=$(grep -m1 -e "${pattern}" -- "${log}")
    [[ -n ${match} ]] && errorSummary+="${match} : "
  done

  for pattern in "${warningConditions[@]}"; do
    match=$(grep -m1 -e "${pattern}" -- "${log}")
    [[ -n ${match} ]] && warningSummary+="${match} : "
  done

  if [[ -n ${errorSummary} ]]; then
    echo "${errorSummary}"
    return 1
  fi

  if [[ -n ${warningSummary} ]]; then
    echo "${warningSummary}"
    return 2
  fi

  return 0
}
434
#######################################
# Set the default configuration, source the config file if one is given
# (configFile=...) and finally apply all option=value arguments, which
# override both the defaults and the config file.
# Globals set:   configFile, inputList, workingDirectory, outputDirectory,
#                excludeDetectors, logDirectory, ocdbStorage, plus every
#                option given on the command line (exported)
# Arguments:     option=value pairs
# Outputs:       the options as they are applied, error text on failure
# Returns:       0 on success, 1 on a missing config file or a badly
#                formatted option
#######################################
parseConfig()
{
  local -a args=("$@")
  local opt var value

  #config file
  configFile=""
  #where to search for qa files
  inputList=file.list
  #working directory
  workingDirectory="${PWD}"
  #where to place the final qa plots
  #outputDirectory="/afs/cern.ch/work/a/aliqa%det/www/"
  outputDirectory="${workingDirectory}/%DET"
  #filter out detector option
  excludeDetectors="EXAMPLE"
  #logs
  logDirectory=${workingDirectory}/logs
  #OCDB storage
  ocdbStorage="raw://"
  #email to
  #MAILTO="fbellini@cern.ch"

  #first, check if the config file is configured
  #if yes - source it so that other options can override it
  #if any
  for opt in "${args[@]}"; do
    if [[ ${opt} == configFile=* ]]; then
      # plain assignment instead of eval: the value comes from the command
      # line and may contain spaces or shell metacharacters
      configFile=${opt#configFile=}
      if [[ ! -f ${configFile} ]]; then
        echo "configFile ${configFile} not found, exiting..."
        return 1
      fi
      echo "using config file: ${configFile}"
      source "${configFile}"
      break
    fi
  done

  #then, parse the options as they override the options from file
  for opt in "${args[@]}"; do
    var="${opt%%=*}"
    # the name must also be a valid identifier, otherwise export would fail
    if [[ "${opt}" != *=* || ! ${var} =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      echo "badly formatted option ${opt}, should be: option=value, stopping..."
      return 1
    fi
    value="${opt#*=}"
    echo "${var} = ${value}"
    export "${var}=${value}"
  done
  return 0
}
482
#######################################
# Guess run number, period, year, pass and data type from the components
# of a file path.
# The path is scanned from the last component to the first and every match
# overwrites the previous one, so for period/year/runNumber/dataType the
# match closest to the start of the path is the one that is kept.
# Globals set:   period, runNumber, year, pass, legoTrainRunNumber, dataType
# Arguments:     $1 - path of the input file
# Returns:       0 if a run number was found, 1 otherwise
#######################################
guessRunData()
{
  period=""
  runNumber=""
  year=""
  pass=""
  legoTrainRunNumber=""
  dataType=""

  local shortRunNumber=""
  local -a path
  # split on "/" without exposing the path to glob expansion
  IFS="/" read -r -a path <<< "$1"
  local dirDepth=$(( ${#path[@]} - 1 ))

  local x field fieldNext fieldPrev=""
  for (( x = dirDepth; x >= 0; x-- )); do

    # at x=0 there is no previous component; the last value is kept
    (( x > 0 )) && fieldPrev=${path[x-1]}
    field=${path[x]}
    fieldNext=${path[x+1]}

    # an all-digit directory directly above the data file
    [[ ${field} =~ ^[0-9]*$ && ${fieldNext} =~ (.*\.zip$|.*\.root$) ]] && legoTrainRunNumber=${field}
    [[ -n ${legoTrainRunNumber} && -z ${pass} ]] && pass=${fieldPrev}
    [[ ${field} =~ ^LHC[0-9][0-9][a-z].*$ ]] && period=${field%_*}
    [[ ${field} =~ ^000[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && runNumber=${field#000}
    [[ ${field} =~ ^[0-9][0-9][0-9][0-9][0-9][0-9]$ ]] && shortRunNumber=${field}
    [[ ${field} =~ ^20[0-9][0-9]$ ]] && year=${field}
    [[ ${field} =~ ^(^sim$|^data$) ]] && dataType=${field}
  done
  # without a lego train number the pass is the directory holding the file
  [[ -z ${legoTrainRunNumber} ]] && (( dirDepth >= 1 )) && pass=${path[dirDepth-1]}
  [[ "${dataType}" =~ ^sim$ ]] && pass="passMC" && runNumber=${shortRunNumber}

  #if [[ -z ${dataType} || -z ${year} || -z ${period} || -z ${runNumber} || -z ${pass} ]];
  if [[ -z ${runNumber} ]]; then
    #error condition
    return 1
  fi
  #ALL OK
  return 0
}
526
#######################################
# Replace the detector placeholder in a directory name:
#   %det is replaced by the lower-case detector name,
#   %DET is replaced by the detector name as given.
# A directory without placeholder is printed unchanged, so callers never
# receive an empty string.
# Arguments:     $1 - detector name, $2 - directory (may contain %det/%DET)
# Outputs:       the resulting directory on stdout (exactly one line)
# Returns:       0
#######################################
substituteDetectorName()
{
  local det=$1
  local dir=$2
  # the backslash keeps % from being read as the "match at end" anchor
  dir=${dir/\%det/${det,,}}
  dir=${dir/\%DET/${det}}
  printf '%s\n' "${dir}"
}
534
#######################################
# Print the absolute path of an existing regular file.
# The directory part is resolved with cd in a subshell, so the working
# directory of the caller is never touched.
# Arguments:     $1 - path of the file (absolute or relative)
# Outputs:       absolute path on stdout
# Returns:       0 on success, 1 if $1 is not an existing regular file
#######################################
get_realpath()
{
  # file *must* exist
  [[ -f "$1" ]] || return 1 # failure

  local fileDir resolvedDir
  case "$1" in
    */*)
      fileDir=${1%/*}
      # a file directly below / leaves an empty directory part
      [[ -z ${fileDir} ]] && fileDir=/
      ;;
    *)
      # plain file name: the file is local
      fileDir=.
      ;;
  esac

  # CDPATH is cleared for this cd so that a relative directory can only be
  # resolved against the current directory
  if ! resolvedDir=$(CDPATH= cd -- "${fileDir}" 2>/dev/null && pwd); then
    # directory not accessible: fall back to the current directory
    resolvedDir=${PWD}
  fi

  # reassemble realpath
  printf '%s\n' "${resolvedDir%/}/${1##*/}"
  return 0 # success
}
559
# run the script; "$@" is quoted so that arguments containing spaces are
# passed on unchanged
main "$@"