3 # - script to sync a group of files on alien with a local cache
4 # downloads only new and updated files
5 # - by default it mirrors the directory structure in a specified local location
6 # (the local cache location and paths can be manipulated.)
7 # - needs a configured config file (by default alienSync.config)
8 # and a working alien environment (token and at least $ALIEN_DIR or $ALIEN_ROOT set)
9 # - can be also used without a config file
11 # run the script without argument to see the examples
13 # origin: Mikolaj Krzewicki, mikolaj.krzewicki@cern.ch
# Guard against old shells: ${BASH_VERSINFO} without a subscript expands to
# element 0 of the array, which is the major version of the running bash.
# The message reports the full version string from ${BASH_VERSION}.
# NOTE(review): the statement that actually exits and the closing "fi" are
# not visible in this excerpt.
15 if [ ${BASH_VERSINFO} -lt 4 ]; then
16 echo "bash version >= 4 needed, you have ${BASH_VERSION}, exiting..."
# When called without any argument, print the usage examples.
# ${0##*/} strips everything up to the last "/" so only the script name is
# shown. \${PWD} and \${alienSync_localPathPrefix} are escaped so they are
# printed literally instead of being expanded.
# NOTE(review): what happens after the text is printed (return/exit) is not
# visible in this excerpt.
22 if [[ $# -lt 1 ]]; then
23 echo "Usage: ${0##*/} configFile=/path/to/config"
24 echo "expert: ${0##*/} alienFindCommand=\"alien_find /some/path/ file\" [opt=value]"
25 echo " ${0##*/} alienFindCommand=\"alien_find /some/path/ file\" localPathPrefix=\${PWD}"
27 echo "by default files are downloaded to current dir, or \${alienSync_localPathPrefix}, if set."
28 echo "At least specify alienFindCommand, either on command line or in the configFile."
29 echo "the logs go by default to localPathPrefix/alienSyncLogs"
# Setup phase: parse the options, make sure the log directory exists, start
# the per-run log file and take the lock.
# NOTE(review): the command the next comment refers to (presumably a umask
# call) is not visible in this excerpt.
33 #be nice and allow group members access as well (002 will create dirs with 775 and files with 664)
36 # try to load the config file
37 #[[ ! -f $1 ]] && echo "config file $1 not found, exiting..." | tee -a $logFile && exit 1
# All command line arguments are handed to parseConfig; a failure there or a
# missing alienFindCommand aborts with return code 1.
38 if ! parseConfig "$@"; then return 1; fi
40 [[ -z ${alienFindCommand} ]] && echo "alienFindCommand not defined!" && return 1
42 #if not set, use the default group
# "id -gn" prints the name of the effective group of the calling user.
43 [[ -z ${alienSyncFilesGroupOwnership} ]] && alienSyncFilesGroupOwnership=$(id -gn)
# Create the log directory on demand and hand it to the configured group;
# the second test catches the case where creation did not work.
46 [[ ! -d $logOutputPath ]] && echo "logOutputPath not available, creating..." && mkdir -p $logOutputPath && chgrp ${alienSyncFilesGroupOwnership} ${logOutputPath}
47 [[ ! -d $logOutputPath ]] && echo "could not create log dir, exiting..." && exit 1
# One log file per run, named after the start time with minute resolution.
48 dateString=$(date +%Y-%m-%d-%H-%M)
49 logFile=$logOutputPath/alienSync-$dateString.log
# Record the invocation (script name and arguments) at the top of the log.
50 echo "$0 $@"|tee -a $logFile
51 echo ""|tee -a $logFile
# An existing lock file stops the run unless allowConcurrent is 1.
55 lockFile=$logOutputPath/runningNow.lock
56 [[ -f $lockFile && ${allowConcurrent} -ne 1 ]] && echo "locked. Another process running? ($lockFile)" | tee -a $logFile && exit 1
# NOTE(review): the command that creates the lock file is not visible in
# this excerpt; the check below only verifies that the file exists afterwards.
58 [[ ! -f $lockFile ]] && echo "unable to create lock. exiting..." | tee -a $logFile && exit 1
# NOTE(review): the body of the allOutputToLog branch (the actual
# redirection) is not visible in this excerpt.
60 #redirect all output to a log
61 if [[ $allOutputToLog -eq 1 ]]; then
# Bookkeeping files, all kept in the log directory.
66 newFilesList=$logOutputPath/"newFiles.list"
# The list of re-downloaded files is recreated empty on every run ...
69 redoneFilesList=$logOutputPath/"redoneFiles.list"
70 rm -f $redoneFilesList
71 touch $redoneFilesList
72 updatedFilesList="${logOutputPath}/updatedFiles.list"
# ... while the list of failed downloads is only touched, so entries from
# earlier runs are kept.
73 failedDownloadList="${logOutputPath}/failedDownload.list"
74 touch ${failedDownloadList}
# Mandatory settings: bail out through exitScript (defined outside this
# excerpt) when one of them is empty.
78 [[ -z $alienFindCommand ]] && echo "alienFindCommand not defined, exiting..." && exitScript 1
79 [[ -z ${localPathPrefix} ]] && echo "localPathPrefix not defined, exiting..." && exitScript 1
80 [[ -z $logOutputPath ]] && echo "logOutputPath not defined, exiting..." && exitScript 1
# Run-time budget in seconds; falls back to 10 hours when not configured.
81 [[ -z $secondsToSuicide ]] && echo "setting default secondsToSuicide of 10 hrs..." && secondsToSuicide=$(( 10*3600 ))
# Grid environment: fall back to ALIEN_DIR when ALIEN_ROOT is not set.
84 [[ -z $ALIEN_ROOT && -n $ALIEN_DIR ]] && ALIEN_ROOT=$ALIEN_DIR
# NOTE(review): the haveAlienToken checks around the token initialisation
# are commented out, so in the visible code alien-token-init is called
# unconditionally on every run.
85 #if ! haveAlienToken; then
86 # $ALIEN_ROOT/api/bin/alien-token-destroy
87 $ALIEN_ROOT/api/bin/alien-token-init $alienUserName
89 #if ! haveAlienToken; then
90 # if [[ $allOutputToLog -eq 1 ]]; then
93 # echo "problems getting token! exiting..." | tee -a $logFile
96 #ls -ltr /tmp/gclient_env_$UID
97 #cat /tmp/gclient_env_$UID
# Load the per-user client environment file into the current shell
# (presumably written by alien-token-init - TODO confirm).
98 source /tmp/gclient_env_$UID
100 #set a default timeout for grid access
# copyTimeout defaults to 600 and is exported as GCLIENT_COMMAND_MAXWAIT.
101 [[ -z $copyTimeout ]] && copyTimeout=600
102 export GCLIENT_COMMAND_MAXWAIT=$copyTimeout
# Files used to track state between runs (all inside the log directory):
#   localAlienDatabase         - alien file list kept from the previous run
#   localFileList              - local destinations already known to be good
#   alienFileListCurrent       - alien file list produced by this run
#   candidateLocalFileDatabase - current list rewritten to local destinations
104 localAlienDatabase=$logOutputPath/localAlienDatabase.list
105 localFileList=$logOutputPath/localFile.list
107 alienFileListCurrent=$logOutputPath/alienFileDatabase.list
108 [[ ! -f $localFileList ]] && touch $localFileList
109 candidateLocalFileDatabase=$logOutputPath/candidateLocalFileDatabase.list
111 #here we produce the current alien file list
# Either reuse the stored database (useExistingAlienFileDatabase set and the
# file present) or run the configured find command; the find command is
# echoed first and then executed through eval with stdout going to the
# current list.
# NOTE(review): the "else" between the two variants is not visible here.
112 if [[ -n ${useExistingAlienFileDatabase} && -f ${localAlienDatabase} ]]; then
114 echo "using ${localAlienDatabase} instead of full alien search"
115 echo cp -f ${localAlienDatabase} ${alienFileListCurrent}
116 cp -f ${localAlienDatabase} ${alienFileListCurrent}
119 echo "eval $alienFindCommand > $alienFileListCurrent"
120 eval "$alienFindCommand" > $alienFileListCurrent
# "wc -l FILE" prints the line count followed by the file name.
123 echo "number of files in the collection: $(wc -l $alienFileListCurrent)"
124 #create a list of candidate destination locations
125 #this is in case there are more files on alien trying to get to the same local destination
126 #in which case we take the one with the youngest ctime (later in code)
# The current list is piped through destinationModifyCommand and every
# resulting line gets localPathPrefix prepended by sed. The first line only
# prints the command, the second one runs it.
127 if [[ -n ${destinationModifyCommand} ]]; then
128 echo eval "cat $alienFileListCurrent | ${destinationModifyCommand} | sed \"s,^,${localPathPrefix},\" > ${candidateLocalFileDatabase}"
129 eval "cat $alienFileListCurrent | ${destinationModifyCommand} | sed \"s,^,${localPathPrefix},\" > ${candidateLocalFileDatabase}"
132 #logic is: if file list is missing we force the md5 recalculation
# On a first run the stored database is seeded from the current list.
133 [[ ! -f $localAlienDatabase ]] && forceLocalMD5recalculation=1 && echo "forcing local MD5 sum recalculation" && cp -f $alienFileListCurrent $localAlienDatabase
135 #since we grep through the files frequently, copy some stuff to tmpfs for fast access
# mktemp -d creates a private directory; errors are silenced, and the copies
# are only made when the directory really exists.
# NOTE(review): whether the mktemp location is a tmpfs depends on the host.
136 tmp=$(mktemp -d 2>/dev/null)
137 if [[ -d $tmp ]]; then
138 cp $localAlienDatabase $tmp
139 cp $localFileList $tmp
140 cp $alienFileListCurrent $tmp
141 [[ -f ${candidateLocalFileDatabase} ]] && cp ${candidateLocalFileDatabase} ${tmp}
# Main loop: one iteration per line of the current alien file list. Each
# line is split on whitespace into alien path, md5, timestamp and size.
# NOTE(review): several structural lines of this loop (do, else, fi, and the
# assignments of lineNumber, downloadOK and redownloading) are not visible
# in this excerpt; the comments below describe the visible statements only.
146 echo "starting downloading:"
150 downloadedFileCounter=0
151 while read -r alienFile md5alien timestamp size
155 #sometimes the md5 turns out empty and is then stored as a "." to avoid problems parsing
# The right-hand side of =~ is quoted, so it is matched as a literal dot:
# any md5 field containing "." is treated as missing.
156 [[ "$md5alien" =~ "." ]] && md5alien=""
158 [[ -n $timeStampInLog ]] && date
# SECONDS is bash's counter of seconds since the shell started; stop
# processing once the configured run-time budget is used up.
159 [[ $SECONDS -ge $secondsToSuicide ]] && echo "$SECONDS seconds passed, exiting by suicide..." && break
# Skip anything that does not look like an absolute path with at least one
# further "/" followed by a non-empty name.
160 [[ "$alienFile" != "/"*"/"?* ]] && echo "WARNING: read line not path-like: $alienFile" && continue
161 ((alienFileCounter++))
# Default destination mirrors the alien path below localPathPrefix;
# destinationModifyCommand, when set, filters the destination string.
162 destination=${localPathPrefix}/${alienFile}
163 destination=${destination//\/\///} #remove double slashes
164 [[ -n ${destinationModifyCommand} ]] && destination=$( eval "echo ${destination} | ${destinationModifyCommand}" )
# ${destination%/*} drops the last path component, i.e. the file name.
165 destinationdir=${destination%/*}
166 [[ -n $softLinkName ]] && softlinktodestination=${destinationdir}/${softLinkName}
# Downloads go to a temporary name first and are renamed after validation.
167 tmpdestination="${destination}.aliensyncTMP"
169 #if we allow concurrent running (DANGEROUS) check if somebody is already trying to process this file
170 if [[ -f ${tmpdestination} && ${allowConcurrent} -eq 1 ]]; then
171 echo "$tmpdestination exists - concurrent donwload? skipping..."
175 if [[ -n ${destinationModifyCommand} ]]; then
176 #find the candidate in the database, in case there are more files trying to go to the same
177 #place due to $destinationModifyCommand which alters the final path, find the one
178 #with the largest ctime (3rd field in the database list) and check if that is the current one
180 #echo grep -n ${destination} $candidateLocalFileDatabase | sed "s/:/ /" | sort -rk4
181 #grep -n ${destination} $candidateLocalFileDatabase| sed "s/:/ /" | sort -rk4
182 #this guy contains: index of the original entry, local file name, md5, ctime
# grep -n prefixes each hit with "N:"; sed turns the first colon into a
# space so the line number becomes field 1; the hits are sorted in reverse
# order starting at field 4 and only the first one is kept.
# The lookup runs against the tmp copy of the candidate database.
183 candidateDBrecord=($(grep -n ${destination} $tmp/${candidateLocalFileDatabase##*/}| sed "s/:/ /" | sort -rk4|head -n1 ))
184 originalEntryIndex=${candidateDBrecord[0]}
# Only the entry whose line number matches the selected record is processed.
185 [[ $lineNumber -ne $originalEntryIndex ]] && continue
# --- a local copy already exists: decide whether it is still valid ---
189 if [[ -f ${destination} ]]; then
190 #soft link the downloaded file (maybe to provide a consistent link to the latest version)
191 if [[ -n $softlinktodestination ]]; then
192 echo ln -sf ${destination} ${softlinktodestination}
193 ln -sf ${destination} ${softlinktodestination}
195 ((localFileCounter++))
# Look the alien path up in the tmp copy of the stored database; the second
# field of the record is used as the md5 of the local copy.
197 localDBrecord=($(grep $alienFile $tmp/${localAlienDatabase##*/}))
198 md5local=${localDBrecord[1]}
200 #sometimes the md5 turns out empty and is then stored as a "." to avoid problems parsing
201 [[ "$md5local" =~ "." ]] && md5local=""
# Recompute the checksum from the file when forced or when none is stored;
# a difference to the stored value is reported as a local change.
203 if [[ $forceLocalMD5recalculation -eq 1 || -z $md5local ]]; then
204 md5recalculated=$(checkMD5sum ${destination})
205 [[ "$md5local" != "$md5recalculated" ]] && echo "WARNING: local copy change ${destination}"
206 md5local=${md5recalculated}
# Checksums agree and the alien one is not empty: the local file is good.
# Note that the "already listed?" test greps the tmp copy of the list while
# the append goes to the real list file.
208 if [[ "$md5local" == "$md5alien" && -n $md5alien ]]; then
209 echo "OK ${destination} $md5alien"
210 if ! grep -q ${destination} $tmp/${localFileList##*/}; then
211 echo ${destination} >> $localFileList
# No checksum available from alien: keep the local file and list it.
215 if [[ -z $md5alien ]]; then
216 if ! grep -q ${destination} $tmp/${localFileList##*/}; then
217 echo ${destination} >> $localFileList
219 echo "WARNING: missing alien md5, leaving the local file as it is"
222 echo "WARNING: md5 mismatch ${destination}"
223 echo " $md5local $md5alien"
# --- download ---
# A leftover temporary file is removed before a new attempt.
227 [[ -f $tmpdestination ]] && echo "WARNING: stale $tmpdestination, removing" && rm $tmpdestination
229 mkdir -p ${destinationdir} && chgrp ${alienSyncFilesGroupOwnership} ${destinationdir}
230 [[ ! -d $destinationdir ]] && echo cannot access $destinationdir && continue
233 #if ! haveAlienToken; then
234 # $ALIEN_ROOT/api/bin/alien-token-init $alienUserName
235 # #source /tmp/gclient_env_$UID
240 export copyTimeoutHard
# The copy command is always echoed; with pretend=1 nothing is transferred.
241 echo copyFromAlien "$alienFile" "$tmpdestination"
242 [[ $pretend -eq 1 ]] && continue
243 copyFromAlien $alienFile $tmpdestination
244 chgrp ${alienSyncFilesGroupOwnership} $tmpdestination
246 # if we didn't download remove the destination in case we tried to redownload
248 [[ ! -f $tmpdestination ]] && echo "file not downloaded" && rm -f ${destination} && continue
# --- validation of the temporary file ---
251 #verify the downloaded md5 if available, validate otherwise...
252 if [[ -n $md5alien ]]; then
253 md5recalculated=$(checkMD5sum ${tmpdestination})
254 if [[ ${md5alien} == ${md5recalculated} ]]; then
255 echo "OK md5 after download"
258 echo "failed verifying md5 $md5alien of $tmpdestination"
264 #handle zip files - check the checksums
# The quoted '.zip' is matched literally anywhere in the alien path;
# "unzip -t" tests the archive without extracting it.
265 if [[ $alienFile =~ '.zip' && $downloadOK -eq 1 ]]; then
266 echo "checking integrity of zip archive $tmpdestination"
267 if unzip -t $tmpdestination; then
# --- finalisation ---
# A validated download is moved into place, given the configured group,
# counted, optionally soft-linked and recorded in the new or redone list.
274 if [[ $downloadOK -eq 1 ]]; then
275 echo mv $tmpdestination ${destination}
276 mv $tmpdestination ${destination}
277 chgrp ${alienSyncFilesGroupOwnership} ${destination}
278 ((downloadedFileCounter++))
279 if [[ -n $softlinktodestination ]]; then
280 echo ln -s ${destination} $softlinktodestination
281 ln -s ${destination} $softlinktodestination
283 [[ -z $redownloading ]] && echo ${destination} >> $newFilesList
284 [[ -n $redownloading ]] && echo ${destination} >> $redoneFilesList
285 if ! grep -q ${destination} $tmp/${localFileList##*/}; then
286 echo ${destination} >> $localFileList
# postCommand is evaluated in a subshell inside the destination directory,
# so the cd does not affect the rest of the loop.
288 [[ -n ${postCommand} ]] && ( cd ${destinationdir}; eval "${postCommand}" )
# A file that finally succeeded is dropped from the failed list.
# NOTE(review): tmpUpdatedFailed is a relative name, i.e. it is written to
# whatever the current working directory is.
289 if grep -q ${alienFile} ${failedDownloadList}; then
290 echo "removing ${alienFile} from ${failedDownloadList}"
291 grep -v ${alienFile} ${failedDownloadList} >tmpUpdatedFailed
292 mv tmpUpdatedFailed ${failedDownloadList}
# Not validated: discard the temporary file and record the failure.
295 echo "download not validated, NOT moving to ${destination}..."
296 echo "removing $tmpdestination"
297 rm -f $tmpdestination
298 echo ${alienFile} >> ${failedDownloadList}
302 [[ -f $tmpdestination ]] && echo "WARNING: tmpdestination should not still be here! removing..." && rm -r ${tmpdestination}
# Optional extraction next to the downloaded archive, overwriting existing
# files (-o).
304 if [[ $unzipFiles -eq 1 ]]; then
305 echo unzip -o ${destination} -d ${destinationdir}
306 unzip -o ${destination} -d ${destinationdir}
310 done < ${alienFileListCurrent}
# After the loop: the current list becomes the stored database, but only if
# at least one path-like line was counted in this run.
312 [[ $alienFileCounter -gt 0 ]] && mv -f $alienFileListCurrent $localAlienDatabase
# NOTE(review): the body of this allOutputToLog branch is not visible here.
316 if [[ $allOutputToLog -eq 1 ]]; then
# updatedFiles.list is the concatenation of the new and the redone lists.
320 cat ${newFilesList} ${redoneFilesList} > ${updatedFilesList}
# Run summary.
322 echo alienFindCommand:
323 echo " $alienFindCommand"
325 echo "files on alien: $alienFileCounter"
326 echo "local files before: $localFileCounter"
327 echo "files downloaded: $downloadedFileCounter"
339 #output the list of failed files to stdout, so the cronjob can mail it
340 echo '###############################'
341 echo "failed to download from alien:"
# "sort | uniq -c" collapses repeated entries of the failed list into a
# count per file; awk prints a header plus count and file name. tee shows
# the table and also stores it in a temporary file.
343 local tmpfailed=$(mktemp)
344 [[ "$(cat ${failedDownloadList} | wc -l)" -gt 0 ]] && sort ${failedDownloadList} | uniq -c | awk 'BEGIN{print "#tries\t file" }{print $1 "\t " $2}' | tee ${tmpfailed}
# The mail body is the path of the log file (echo of $logFile), not its
# contents.
346 [[ -n ${MAILTO} ]] && echo $logFile | mail -s "alienSync ${alienFindCommand} done" ${MAILTO}
# NOTE(review): only the announcement of executeEnd is visible; the eval
# itself and the removal of the lock file are outside this excerpt.
348 if [[ -n ${executeEnd} ]]; then
351 echo '###############################'
352 echo "eval ${executeEnd}"
362 echo removing $lockFile
# NOTE(review): the function header is not visible in this excerpt;
# presumably this is the body of alien_find (it is called under that name
# elsewhere in the file) - confirm against the full file.
# Runs the grid "find" in pages and prints one line per file:
#   <path> <md5> <ctime as epoch seconds> <size>
# All arguments are passed through to the find command.
371 # like a regular alien_find command
372 # output is a list with md5 sums and ctimes
# The variable holds "binary subcommand"; ${executable% *} strips the last
# word so that only the binary is tested for being executable.
373 executable="$ALIEN_ROOT/api/bin/gbbox find"
374 [[ ! -x ${executable% *} ]] && echo "### error, no $executable..." && return 1
375 [[ -z $logOutputPath ]] && logOutputPath="./"
# Page size passed to the find command with -l.
377 maxCollectionLength=10000
# Client settings exported for the grid commands started below.
379 export GCLIENT_COMMAND_MAXWAIT=600
380 export GCLIENT_COMMAND_RETRY=20
381 export GCLIENT_SERVER_RESELECT=4
382 export GCLIENT_SERVER_RECONNECT=2
383 export GCLIENT_RETRY_DAMPING=1.2
384 export GCLIENT_RETRY_SLEEPTIME=2
# Start with a "full page" so the paging loop is entered; the loop goes on
# while the last page looked full, for at most 100 iterations.
# NOTE(review): the initialisation of iterationNumber and any per-iteration
# reset of numberOfFiles are not visible in this excerpt.
387 numberOfFiles=$maxCollectionLength
388 rm -f $logOutputPath/alien_find.err
389 while [[ $numberOfFiles -ge $maxCollectionLength && $iterationNumber -lt 100 ]]; do
# Offset of the page: page size times iteration minus one, clamped at 0.
391 offset=$((maxCollectionLength*iterationNumber-1));
392 [[ $offset -lt 0 ]] && offset=0;
# The find output (requested with "-x coll") is read line by line, each line
# split into whitespace separated fields; stderr is appended to
# alien_find.err in the log directory.
393 $executable -x coll -l ${maxCollectionLength} -o ${offset} "$@" 2>>$logOutputPath/alien_find.err \
394 | while read -a fields;
396 nfields=${#fields[*]}
# Scan the fields for attribute assignments. The index starts at 1 (field 0
# is skipped) and runs up to nfields inclusive, one past the last element.
# NOTE(review): the bodies of the md5= and turl= branches are not visible.
401 for ((x=1;x<=${nfields};x++)); do
402 field=${fields[${x}]}
403 if [[ "${field}" == "md5="* ]]; then
406 if [[ "${field}" == "turl="* ]]; then
# For ctime= and size= the field is evaluated together with the following
# field, presumably because the quoted value contains a space and was split
# by read - TODO confirm against real find output.
409 if [[ "${field}" == "ctime="* ]]; then
410 eval ${field}" "${fields[((x+1))]}
412 if [[ "${field}" == "size="* ]]; then
413 eval ${field}" "${fields[((x+1))]}
# Convert ctime to seconds since the epoch; conversion errors are silenced
# and leave ctime empty.
416 ctime=$( date -d "${ctime}" +%s 2>/dev/null)
# An empty md5 is written as "." so the output keeps its column layout.
417 [[ -z $md5 ]] && md5="."
# Only entries with a turl are printed; every "alien://" is stripped from it.
# NOTE(review): if this increment runs inside the piped while loop it
# happens in a subshell and is not seen by the outer loop condition -
# confirm against the full file.
418 [[ -n "$turl" ]] && echo "${turl//"alien://"/} ${md5} ${ctime} ${size}" && ((numberOfFiles++))
420 ((iterationNumber++))
# NOTE(review): the function header and the assignments of basePath and
# searchTerm are not visible in this excerpt (presumably $1 and $2).
# The third argument is an optional regular expression selecting the
# sub-directories to search; it defaults to ".*" (everything).
427 #split the search in sub searches in the subdirectories of the base path
430 subPathSelection=${3}
431 [[ -z ${subPathSelection} ]] && subPathSelection=".*"
# List the base path (errors silenced) and run one alien_find per listed
# entry that matches the selection.
432 gbbox ls ${basePath} 2>/dev/null | \
433 while read subPath; do
434 [[ ! ${subPath} =~ ${subPathSelection} ]] && continue
435 alien_find ${basePath}/${subPath} ${searchTerm}
# Prints "<path> <md5> <ctime as epoch seconds>" for every entry found in
# the collection(s) delivered by catCollections, which receives $1 and $2
# unchanged (its stderr is discarded).
# NOTE(review): the braces of the function and the bodies of the md5= and
# turl= branches are not visible in this excerpt.
439 listCollectionContents()
441 #find the xml collections and print the list of filenames and hashes
# Every input line is split into whitespace separated fields.
442 while read -a fields; do
443 nfields=${#fields[*]}
# The index starts at 1 (field 0 is skipped) and runs up to nfields
# inclusive, one past the last element.
447 for ((x=1;x<=${nfields};x++)); do
448 field=${fields[${x}]}
449 if [[ "${field}" == "md5="* ]]; then
452 if [[ "${field}" == "turl="* ]]; then
# ctime= is evaluated together with the following field, presumably because
# the quoted value contains a space - TODO confirm.
455 if [[ "${field}" == "ctime="* ]]; then
456 eval "${field} ${fields[((x+1))]}"
# Convert ctime to seconds since the epoch; errors are silenced.
459 ctime=$( date -d "${ctime}" +%s 2>/dev/null)
# Only entries with a turl are printed; every "alien://" is stripped from it.
460 [[ -n "$turl" ]] && echo "${turl//"alien://"/} ${md5} ${ctime}"
# Process substitution keeps the loop in the current shell.
461 done < <(catCollections $1 $2 2>/dev/null)
# NOTE(review): the function header is not visible in this excerpt;
# presumably this is the body of catCollections - confirm.
466 #print the contents of collection(s)
# Two arguments: they are handed to alien_find, and every path-like result
# line (absolute path with a further "/" and a non-empty name after it) is
# printed with "gbbox cat".
467 if [[ $# -eq 2 ]]; then
468 while read collection; do
469 [[ $collection != "/"*"/"?* ]] && continue
470 gbbox cat $collection
471 done < <(alien_find $1 $2)
# NOTE(review): the body of the one-argument branch is not visible here.
472 elif [[ $# -eq 1 ]]; then
# NOTE(review): the function header, the assignment of "now" and the body
# of the expiry branch are not visible in this excerpt; presumably this is
# haveAlienToken (referenced in commented-out code elsewhere) - confirm.
479 #only get a new token if the old one expires soon
# Threshold in seconds; defaults to 4000 when not set by the caller.
481 [[ -z $maxExpireTime ]] && maxExpireTime=4000
482 [[ -z $ALIEN_ROOT ]] && echo "no ALIEN_ROOT!" && return 1
# Take the "Expires" line of alien-token-info, cut everything up to the
# first colon and convert the remainder to seconds since the epoch.
484 tokenExpirationTime=$($ALIEN_ROOT/api/bin/alien-token-info|grep Expires)
485 tokenExpirationTime=$(date -d "${tokenExpirationTime#*:}" "+%s")
486 secondsToExpire=$(( tokenExpirationTime-now ))
487 if [[ $secondsToExpire -lt $maxExpireTime ]]; then
490 echo "token valid for another $secondsToExpire seconds"
# NOTE(review): the function header, the assignment of dst (presumably $2),
# the here-document terminators and the else/fi lines are not visible in
# this excerpt; presumably this is the body of copyFromAlien - confirm.
# Two copy methods are visible: with copyMethod set to "tfilecp" the
# transfer is done by feeding TGrid::Connect and TFile::Cp to "root -b" via
# a here-document; the other visible variant calls alien_cp from
# $ALIEN_ROOT/api/bin. In both cases the command is wrapped in "timeout"
# with copyTimeout when the timeout utility is found by "which", and is run
# unprotected otherwise. Each command is echoed before it is run.
497 #copy the file $1 to $2 using a specified method
498 #uses the "timeout" command to make sure the
499 #download processes will not hang forever.
# Defaults: 600 for copyTimeout and 1200 for copyTimeoutHard.
# NOTE(review): copyTimeoutHard is not used in the visible lines.
501 [[ -z $copyTimeout ]] && copyTimeout=600
502 [[ -z $copyTimeoutHard ]] && copyTimeoutHard=1200
# Every "alien://" is stripped from the source path.
503 src=${1//"alien://"/}
506 if [[ "$copyMethod" == "tfilecp" ]]; then
507 if which timeout &>/dev/null; then
508 echo timeout $copyTimeout "TFile::Cp(\"$src\",\"$dst\")"
509 timeout $copyTimeout root -b <<EOF
510 TGrid::Connect("alien://");
511 TFile::Cp("${src}","${dst}");
515 echo "TFile::Cp(\"$src\",\"$dst\")"
517 TGrid::Connect("alien://");
518 TFile::Cp("${src}","${dst}");
522 if which timeout &>/dev/null; then
523 echo timeout $copyTimeout $ALIEN_ROOT/api/bin/alien_cp $src $dst
524 timeout $copyTimeout $ALIEN_ROOT/api/bin/alien_cp $src $dst
526 echo $ALIEN_ROOT/api/bin/alien_cp $src $dst
527 $ALIEN_ROOT/api/bin/alien_cp $src $dst
# NOTE(review): the function header and the assignment of the args array
# are not visible in this excerpt; presumably this is the body of
# parseConfig, called with all command line arguments - confirm.
# Order of precedence visible here: built-in defaults first, then the
# config file, then option=value arguments.
# Built-in defaults: 10 hours of run time, downloads below the current
# directory.
537 secondsToSuicide=$(( 10*3600 ))
538 localPathPrefix="${PWD}"
539 #define alienSync_localPathPrefix in your env to have a default central location
540 [[ -n ${alienSync_localPathPrefix} ]] && localPathPrefix=${alienSync_localPathPrefix}
546 #first, check if the config file is configured
547 #if yes - source it so that other options can override it
# First pass: look for a configFile= argument and source that file.
# The regular expression is not anchored, so it matches "configFile="
# anywhere inside the argument.
# NOTE(review): the line that extracts configFile from the argument is not
# visible in this excerpt.
549 for opt in "${args[@]}"; do
550 if [[ ${opt} =~ configFile=.* ]]; then
552 [[ ! -f ${configFile} ]] && echo "configFile ${configFile} not found, exiting..." && return 1
553 echo "using config file: ${configFile}"
554 source "${configFile}"
559 #then, parse the options as they override the options from file
# Second pass: every argument has to contain "="; the part before the first
# "=" is the variable name, everything after it is the value.
560 for opt in "${args[@]}"; do
561 if [[ ! "${opt}" =~ .*=.* ]]; then
# NOTE(review): this message prints ${var}, which is only assigned further
# down; at this point it holds the name from a previous iteration or is
# empty. ${opt} looks like the intended value.
562 echo "badly formatted option ${var}, should be: option=value, stopping..."
565 local var="${opt%%=*}"
566 local value="${opt#*=}"
567 echo "${var} = ${value}"
# The option is exported under its own name.
568 export ${var}="${value}"
571 #things that by default depend on other variables should be set here, after the dependencies
572 [[ -z ${logOutputPath} ]] && logOutputPath="${localPathPrefix}/alienSyncLogs"
# NOTE(review): the function header, the assignment of "file" and the lines
# that print the checksum are not visible in this excerpt; presumably this
# is the body of checkMD5sum - confirm.
# Returns 1 when the file does not exist. Otherwise the output of md5sum,
# or of md5 when md5sum is not found by "which", is split into an array of
# words.
580 [[ ! -f ${file} ]] && return 1
581 if which md5sum &>/dev/null; then
582 local tmp=($(md5sum ${file}))
584 elif which md5 &>/dev/null; then
585 local tmp=($(md5 ${file}))