]>
Commit | Line | Data |
---|---|---|
d477ad88 | 1 | /************************************************************************** |
2 | * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. * | |
3 | * * | |
4 | * Author: The ALICE Off-line Project. * | |
5 | * Contributors are mentioned in the code where appropriate. * | |
6 | * * | |
7 | * Permission to use, copy, modify and distribute this software and its * | |
8 | * documentation strictly for non-commercial purposes is hereby granted * | |
9 | * without fee, provided that the above copyright notice appears in all * | |
10 | * copies and that both the copyright notice and this permission notice * | |
11 | * appear in the supporting documentation. The authors make no claims * | |
12 | * about the suitability of this software for any purpose. It is * | |
13 | * provided "as is" without express or implied warranty. * | |
14 | **************************************************************************/ | |
15 | ||
16 | /* | |
17 | $Log$ | |
1abfbb60 | 18 | Revision 1.15 2007/12/10 18:29:23 acolla |
19 | Some log added to the listen mode | |
20 | ||
6a926ad4 | 21 | Revision 1.14 2007/12/07 19:14:36 acolla |
22 | in AliShuttleTrigger: | |
23 | ||
24 | Added automatic collection of new runs on a regular time basis (settable from the configuration) | |
25 | ||
26 | in AliShuttleConfig: new members | |
27 | ||
28 | - triggerWait: time to wait for DIM trigger (s) before starting automatic collection of new runs | |
29 | - mode: run mode (test, prod) -> used to build log folder (logs or logs_PROD) | |
30 | ||
31 | in AliShuttle: | |
32 | ||
33 | - logs now stored in logs/#RUN/DET_#RUN.log | |
34 | ||
7d4cf768 | 35 | Revision 1.13 2006/11/16 16:16:48 jgrosseo |
36 | introducing strict run ordering flag | |
37 | removed giving preprocessor name to preprocessor, they have to know their name themselves ;-) | |
38 | ||
be48e3ea | 39 | Revision 1.12 2006/10/20 15:22:59 jgrosseo |
40 | o) Adding time out to the execution of the preprocessors: The Shuttle forks and the parent process monitors the child | |
41 | o) Merging Collect, CollectAll, CollectNew function | |
42 | o) Removing implementation of empty copy constructors (declaration still there!) | |
43 | ||
cb343cfd | 44 | Revision 1.11 2006/10/02 16:38:39 jgrosseo |
45 | update (alberto): | |
46 | fixed memory leaks | |
47 | storing of objects that failed to be stored to the grid before | |
48 | interfacing of shuttle status table in daq system | |
49 | ||
2bb7b766 | 50 | Revision 1.10 2006/08/15 10:50:00 jgrosseo |
51 | effc++ corrections (alberto) | |
52 | ||
4f0ab988 | 53 | Revision 1.9 2006/08/08 14:19:29 jgrosseo |
54 | Update to shuttle classes (Alberto) | |
55 | ||
56 | - Possibility to set the full object's path in the Preprocessor's and | |
57 | Shuttle's Store functions | |
58 | - Possibility to extend the object's run validity in the same classes | |
59 | ("startValidity" and "validityInfinite" parameters) | |
60 | - Implementation of the StoreReferenceData function to store reference | |
61 | data in a dedicated CDB storage. | |
62 | ||
84090f85 | 63 | Revision 1.8 2006/07/21 07:37:20 jgrosseo |
64 | last run is stored after each run | |
65 | ||
7bfb2090 | 66 | Revision 1.7 2006/07/20 09:54:40 jgrosseo |
67 | introducing status management: The processing per subdetector is divided into several steps, | |
68 | after each step the status is stored on disk. If the system crashes in any of the steps the Shuttle | |
69 | can keep track of the number of failures and skips further processing after a certain threshold is | |
70 | exceeded. These thresholds can be configured in LDAP. | |
71 | ||
5164a766 | 72 | Revision 1.6 2006/07/19 10:09:55 jgrosseo |
73 | new configuration, accesst to DAQ FES (Alberto) | |
74 | ||
57f50b3c | 75 | Revision 1.5 2006/07/10 13:01:41 jgrosseo |
76 | enhanced storing of last sucessfully processed run (alberto) | |
77 | ||
a7160fe9 | 78 | Revision 1.4 2006/07/04 14:59:57 jgrosseo |
79 | revision of AliDCSValue: Removed wrapper classes, reduced storage size per value by factor 2 | |
80 | ||
45a493ce | 81 | Revision 1.3 2006/06/12 09:11:16 jgrosseo |
82 | coding conventions (Alberto) | |
83 | ||
58bc3020 | 84 | Revision 1.2 2006/06/06 14:26:40 jgrosseo |
85 | o) removed files that were moved to STEER | |
86 | o) shuttle updated to follow the new interface (Alberto) | |
87 | ||
b948db8d | 88 | Revision 1.1 2006/03/07 07:52:34 hristov |
89 | New version (B.Yordanov) | |
90 | ||
d477ad88 | 91 | Revision 1.5 2005/11/21 09:03:48 byordano |
92 | one more print added | |
93 | ||
94 | Revision 1.4 2005/11/20 10:12:37 byordano | |
95 | comments added to AliShuttleTrigger | |
96 | ||
97 | */ | |
98 | ||
99 | ||
100 | // | |
101 | // This class is to deal with DAQ LogBook and DAQ "end of run" notification. | |
102 | // It has two modes: | |
cb343cfd | 103 | // 1) synchronized - Collect() |
b948db8d | 104 | // 2) asynchronized - Run() - starts listening for DAQ "end of run" |
d477ad88 | 105 | // notification by DIM service. |
106 | // | |
107 | ||
108 | #include "AliShuttleTrigger.h" | |
109 | ||
d477ad88 | 110 | #include <TSystem.h> |
51657f6d | 111 | #include <TGrid.h> |
fb2975a2 | 112 | #include <TObjString.h> |
cb343cfd | 113 | |
d477ad88 | 114 | #include "AliLog.h" |
d477ad88 | 115 | #include "AliShuttleConfig.h" |
116 | #include "AliShuttle.h" | |
117 | #include "DATENotifier.h" | |
118 | ||
fb2975a2 | 119 | #include <fstream> |
120 | ||
d477ad88 | 121 | ClassImp(TerminateSignalHandler) |
cb343cfd | 122 | ClassImp(AliShuttleTrigger) |
58bc3020 | 123 | |
b948db8d | 124 | //______________________________________________________________________________________________ |
cb343cfd | 125 | Bool_t TerminateSignalHandler::Notify() |
58bc3020 | 126 | { |
127 | // Sentd terminate command to the Shuttle trigger | |
d477ad88 | 128 | |
129 | AliInfo("Terminate signal received ..."); | |
130 | fTrigger->Terminate(); | |
131 | ||
132 | return kTRUE; | |
133 | } | |
134 | ||
b948db8d | 135 | //______________________________________________________________________________________________ |
ff3781ad | 136 | AliShuttleTrigger::AliShuttleTrigger(const AliShuttleConfig* config): |
b948db8d | 137 | fConfig(config), fShuttle(NULL), |
2bb7b766 | 138 | fNotified(kFALSE), fTerminate(kFALSE), |
4f0ab988 | 139 | fMutex(), fCondition(&fMutex), |
cb343cfd | 140 | fQuitSignalHandler(0), |
fb2975a2 | 141 | fInterruptSignalHandler(0), |
142 | fLastMailDiskSpace(0) | |
d477ad88 | 143 | { |
144 | // | |
145 | // config - pointer to the AliShuttleConfig object which represents | |
146 | // the configuration | |
b948db8d | 147 | // mainStorage - pointer to AliCDBStorage for the undelying CDBStorage |
148 | // localStorage (local) CDB storage to be used if mainStorage is unavailable | |
d477ad88 | 149 | // |
150 | ||
7d4cf768 | 151 | if (!fConfig->IsValid()) AliFatal("********** !!!!! Invalid configuration !!!!! **********"); |
ff3781ad | 152 | UInt_t timeout = fConfig->GetDCSTimeOut(); |
153 | Int_t retries = fConfig->GetDCSRetries(); | |
b948db8d | 154 | fShuttle = new AliShuttle(config, timeout, retries); |
d477ad88 | 155 | |
28a94b8e | 156 | fQuitSignalHandler = new TerminateSignalHandler(this, kSigQuit); |
157 | fInterruptSignalHandler = new TerminateSignalHandler(this, kSigInterrupt); | |
58bc3020 | 158 | |
cb343cfd | 159 | gSystem->AddSignalHandler(fQuitSignalHandler); |
160 | gSystem->AddSignalHandler(fInterruptSignalHandler); | |
58bc3020 | 161 | |
162 | } | |
163 | ||
b948db8d | 164 | //______________________________________________________________________________________________ |
58bc3020 | 165 | AliShuttleTrigger::~AliShuttleTrigger() |
166 | { | |
cb343cfd | 167 | // destructor |
d477ad88 | 168 | |
cb343cfd | 169 | gSystem->RemoveSignalHandler(fQuitSignalHandler); |
170 | gSystem->RemoveSignalHandler(fInterruptSignalHandler); | |
d477ad88 | 171 | |
172 | delete fShuttle; | |
cb343cfd | 173 | |
174 | delete fQuitSignalHandler; | |
175 | fQuitSignalHandler = 0; | |
176 | ||
177 | delete fInterruptSignalHandler; | |
178 | fInterruptSignalHandler = 0; | |
d477ad88 | 179 | } |
180 | ||
b948db8d | 181 | //______________________________________________________________________________________________ |
d477ad88 | 182 | Bool_t AliShuttleTrigger::Notify() { |
183 | // | |
cb343cfd | 184 | // Trigger Collect() methods in asynchronized (listen) mode. |
d477ad88 | 185 | // Usually called automaticly by DATENotifier on "end of run" |
186 | // notification event. | |
187 | // | |
188 | ||
189 | fMutex.Lock(); | |
190 | ||
191 | fNotified = kTRUE; | |
192 | fCondition.Signal(); | |
193 | ||
194 | fMutex.UnLock(); | |
195 | ||
196 | return kTRUE; | |
197 | } | |
198 | ||
b948db8d | 199 | //______________________________________________________________________________________________ |
d477ad88 | 200 | void AliShuttleTrigger::Terminate() { |
201 | // | |
202 | // Stop triggers listen mode and exist from Run() | |
203 | // Usually called automaticly by TerminateSignalHandler. | |
204 | // | |
205 | ||
206 | fTerminate = kTRUE; | |
207 | fCondition.Signal(); | |
208 | } | |
209 | ||
a5ecdb19 | 210 | //______________________________________________________________________________________________ |
211 | void AliShuttleTrigger::CheckTerminate() | |
212 | { | |
213 | // | |
214 | // Checks if the Shuttle got an external terminate request by a created file | |
215 | // This is an alternative to the signal which causes problems with the API libraries | |
216 | // | |
217 | ||
218 | if (strlen(fConfig->GetTerminateFilePath()) == 0) | |
219 | return; | |
220 | ||
94bf758b | 221 | if (gSystem->AccessPathName(fConfig->GetTerminateFilePath()) == kFALSE) |
a5ecdb19 | 222 | { |
223 | AliInfo("Terminate file exists. Terminating Shuttle..."); | |
224 | fTerminate = kTRUE; | |
225 | } | |
226 | } | |
227 | ||
b948db8d | 228 | //______________________________________________________________________________________________ |
d477ad88 | 229 | void AliShuttleTrigger::Run() { |
230 | // | |
231 | // AliShuttleTrigger main loop for asynchronized (listen) mode. | |
232 | // It spawns DIM service listener and waits for DAQ "end of run" | |
cb343cfd | 233 | // notification. Calls Collect() on notification. |
d477ad88 | 234 | // |
235 | ||
236 | fTerminate = kFALSE; | |
237 | ||
4a5d9e0d | 238 | DATENotifier* notifier = new DATENotifier(this, "/LOGBOOK/SUBSCRIBE/ECS_EOR"); |
d477ad88 | 239 | |
6a926ad4 | 240 | Int_t nTry=0; |
241 | Int_t nMaxTry = fConfig->GetMaxRetries()+1; | |
242 | Int_t received=0; | |
243 | ||
244 | AliInfo("Listening for ECS trigger"); | |
7d4cf768 | 245 | |
d477ad88 | 246 | while (1) { |
247 | ||
248 | fMutex.Lock(); | |
249 | ||
250 | while (!(fNotified || fTerminate)) { | |
51657f6d | 251 | for (Int_t iwait = 0; iwait < 10; iwait++){ |
252 | received = fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait()/10); // to keep the connection to the server alive every minute while waiting for new runs | |
253 | if (received == 1) { | |
254 | if (gGrid) { | |
255 | AliInfo(Form("Keeping the connection to the server alive while waiting for new runs - %d waited from last one", fConfig->GetTriggerWait()/10*iwait)); | |
256 | gGrid->Pwd(); | |
257 | } | |
258 | else { | |
259 | AliInfo("No gGrid initialized so far, we cannot keep the connection to the server alive while waiting for new runs"); | |
260 | } | |
261 | } | |
262 | } | |
263 | //received=fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait()); | |
264 | CheckTerminate(); | |
265 | if (received==1) break; // 1 = timeout | |
d477ad88 | 266 | } |
267 | ||
268 | fNotified = kFALSE; | |
269 | ||
270 | fMutex.UnLock(); | |
271 | ||
272 | if (fTerminate) { | |
273 | AliInfo("Terminated."); | |
274 | break; | |
275 | } | |
7d4cf768 | 276 | |
6a926ad4 | 277 | if (received == 0) |
278 | { | |
279 | AliInfo("Trigger from ECS received!"); | |
280 | } else if (received == 1) { | |
281 | AliInfo(Form("Timeout (%d s) waiting for trigger. " | |
282 | "Starting collection of new runs!", | |
283 | fConfig->GetTriggerWait())); | |
284 | } else { | |
285 | AliInfo("Error receiving trigger from ECS!"); | |
286 | break; | |
287 | } | |
288 | ||
1abfbb60 | 289 | nTry++; |
290 | AliInfo(Form("Received %d triggers so far", nTry)); | |
291 | ||
6a926ad4 | 292 | if (fConfig->GetRunMode() == AliShuttleConfig::kTest) |
293 | { | |
6a926ad4 | 294 | if(nTry>=nMaxTry) |
295 | { | |
296 | AliInfo(Form("Collect() ran more than %d times -> Exiting!", | |
297 | nMaxTry)); | |
298 | break; | |
299 | } | |
300 | } | |
d477ad88 | 301 | |
cb343cfd | 302 | Collect(); |
94bf758b | 303 | CheckTerminate(); |
d477ad88 | 304 | } |
305 | ||
306 | delete notifier; | |
307 | } | |
308 | ||
b948db8d | 309 | //______________________________________________________________________________________________ |
a7160fe9 | 310 | Bool_t AliShuttleTrigger::Collect(Int_t run) |
58bc3020 | 311 | { |
d477ad88 | 312 | // |
cb343cfd | 313 | // this function creates a thread that runs the shuttle |
314 | // then it checks if the shuttle is still running by checking the monitoring functions of the shuttle | |
d477ad88 | 315 | // |
316 | ||
fb2975a2 | 317 | // first checking disk space |
318 | Long_t id = 0; | |
319 | Long_t bsize = 0; | |
320 | Long_t blocks = 0; | |
321 | Long_t bfree = 0; | |
322 | ||
323 | gSystem->GetFsInfo(fConfig->GetShuttleFileSystem(), &id, &bsize, &blocks, &bfree); | |
324 | ||
0a0b087a | 325 | AliInfo(Form("n. of free blocks = %ld, total n. of blocks = %ld",bfree,blocks)); |
fb2975a2 | 326 | Int_t spaceFree = (Int_t)(((Float_t)bfree/(Float_t)blocks)*100); |
327 | ||
328 | if (spaceFree < fConfig->GetFreeDiskWarningThreshold()) { | |
329 | AliWarning(Form("************** Free space left = %d%%, below the Warning Threshold (%d%%)",spaceFree,fConfig->GetFreeDiskWarningThreshold())); | |
330 | if (TMath::Abs(time(0) - fLastMailDiskSpace) >= 86400){ // 86400 = n. of seconds in 1 d | |
331 | SendMailDiskSpace(fConfig->GetFreeDiskWarningThreshold()); | |
332 | fLastMailDiskSpace = time(0); // resetting fLastMailDiskSpace to time(0) = now | |
333 | } | |
334 | if (spaceFree < fConfig->GetFreeDiskFatalThreshold()){ | |
335 | AliError(Form("*************** Free space left = %d%%, below the Fatal Threshold (%d%%), terminating....",spaceFree,fConfig->GetFreeDiskFatalThreshold())); | |
336 | SendMailDiskSpace(fConfig->GetFreeDiskFatalThreshold()); | |
337 | fTerminate = kTRUE; // terminating.... | |
338 | } | |
339 | } | |
340 | ||
341 | if (fTerminate) { | |
342 | return kFALSE; | |
343 | } | |
344 | ||
345 | return fShuttle->Collect(run); | |
346 | } | |
347 | //______________________________________________________________________________________________ | |
348 | Bool_t AliShuttleTrigger::SendMailDiskSpace(Short_t percentage) | |
349 | { | |
350 | // | |
351 | // sends a mail to the shuttle experts in case of free disk space < theshold | |
352 | // | |
353 | ||
354 | ||
355 | AliInfo("******************* Sending the Mail!! *********************"); | |
356 | if (!fConfig->SendMail()) | |
357 | return kTRUE; | |
358 | ||
359 | Int_t runMode = (Int_t)fConfig->GetRunMode(); | |
360 | TString tmpStr; | |
361 | if (runMode == 0) tmpStr = " Nightly Test:"; | |
362 | else tmpStr = " Data Taking:"; | |
363 | void* dir = gSystem->OpenDirectory(fShuttle->GetShuttleLogDir()); | |
364 | if (dir == NULL) | |
365 | { | |
366 | if (gSystem->mkdir(fShuttle->GetShuttleLogDir(), kTRUE)) | |
367 | { | |
368 | AliWarning(Form("SendMail - Can't open directory <%s>", fShuttle->GetShuttleLogDir())); | |
369 | return kFALSE; | |
370 | } | |
371 | ||
372 | } else { | |
373 | gSystem->FreeDirectory(dir); | |
374 | } | |
375 | ||
376 | // SHUTTLE responsibles in to | |
377 | TString to=""; | |
378 | TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); | |
379 | TObjString *anAdmin=0; | |
380 | while ((anAdmin = (TObjString*) iterAdmins.Next())) | |
381 | { | |
382 | to += Form("%s,", anAdmin->GetName()); | |
383 | } | |
384 | if (to.Length() > 0) | |
385 | to.Remove(to.Length()-1); | |
386 | AliDebug(2, Form("to: %s",to.Data())); | |
387 | ||
51657f6d | 388 | // mail body |
fb2975a2 | 389 | TString bodyFileName; |
390 | bodyFileName.Form("%s/mail.body", fShuttle->GetShuttleLogDir()); | |
391 | gSystem->ExpandPathName(bodyFileName); | |
392 | ||
393 | ofstream mailBody; | |
394 | mailBody.open(bodyFileName, ofstream::out); | |
395 | ||
396 | if (!mailBody.is_open()) | |
397 | { | |
398 | AliWarning(Form("Could not open mail body file %s", bodyFileName.Data())); | |
399 | return kFALSE; | |
400 | } | |
401 | ||
402 | TString subject; | |
403 | TString body; | |
404 | ||
eee6253d | 405 | Int_t percentage_used = 100 - percentage; |
fb2975a2 | 406 | subject = Form("%s CRITICAL Disk Space usage exceeds %d%c!", |
eee6253d | 407 | tmpStr.Data(),percentage_used,'%'); |
fb2975a2 | 408 | AliDebug(2, Form("subject: %s", subject.Data())); |
fb2975a2 | 409 | |
410 | body = "Dear SHUTTLE experts, \n\n"; | |
411 | body += "The usage of the disk space on the shuttle machine has overcome \n"; | |
412 | body += Form("the threshold of %d%%. \n \n",percentage_used); | |
413 | body += "Please check! \n \n"; | |
414 | body += "Please do not answer this message directly, it is automatically generated.\n\n"; | |
415 | body += "Greetings,\n\n \t\t\tthe SHUTTLE\n"; | |
416 | ||
417 | AliDebug(2, Form("Body : %s", body.Data())); | |
418 | ||
419 | mailBody << body.Data(); | |
420 | mailBody.close(); | |
421 | ||
422 | // send mail! | |
423 | TString mailCommand = Form("mail -s \"%s\" %s < %s", | |
424 | subject.Data(), | |
425 | to.Data(), | |
426 | bodyFileName.Data()); | |
427 | AliDebug(2, Form("mail command: %s", mailCommand.Data())); | |
428 | ||
429 | Bool_t result = gSystem->Exec(mailCommand.Data()); | |
430 | ||
431 | return result == 0; | |
d477ad88 | 432 | } |