]>
Commit | Line | Data |
---|---|---|
1 | /************************************************************************** | |
2 | * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. * | |
3 | * * | |
4 | * Author: The ALICE Off-line Project. * | |
5 | * Contributors are mentioned in the code where appropriate. * | |
6 | * * | |
7 | * Permission to use, copy, modify and distribute this software and its * | |
8 | * documentation strictly for non-commercial purposes is hereby granted * | |
9 | * without fee, provided that the above copyright notice appears in all * | |
10 | * copies and that both the copyright notice and this permission notice * | |
11 | * appear in the supporting documentation. The authors make no claims * | |
12 | * about the suitability of this software for any purpose. It is * | |
13 | * provided "as is" without express or implied warranty. * | |
14 | **************************************************************************/ | |
15 | ||
16 | /* | |
17 | $Log$ | |
18 | Revision 1.15 2007/12/10 18:29:23 acolla | |
19 | Some log added to the listen mode | |
20 | ||
21 | Revision 1.14 2007/12/07 19:14:36 acolla | |
22 | in AliShuttleTrigger: | |
23 | ||
24 | Added automatic collection of new runs on a regular time basis (settable from the configuration) | |
25 | ||
26 | in AliShuttleConfig: new members | |
27 | ||
28 | - triggerWait: time to wait for DIM trigger (s) before starting automatic collection of new runs | |
29 | - mode: run mode (test, prod) -> used to build log folder (logs or logs_PROD) | |
30 | ||
31 | in AliShuttle: | |
32 | ||
33 | - logs now stored in logs/#RUN/DET_#RUN.log | |
34 | ||
35 | Revision 1.13 2006/11/16 16:16:48 jgrosseo | |
36 | introducing strict run ordering flag | |
37 | removed giving preprocessor name to preprocessor, they have to know their name themselves ;-) | |
38 | ||
39 | Revision 1.12 2006/10/20 15:22:59 jgrosseo | |
40 | o) Adding time out to the execution of the preprocessors: The Shuttle forks and the parent process monitors the child | |
41 | o) Merging Collect, CollectAll, CollectNew function | |
42 | o) Removing implementation of empty copy constructors (declaration still there!) | |
43 | ||
44 | Revision 1.11 2006/10/02 16:38:39 jgrosseo | |
45 | update (alberto): | |
46 | fixed memory leaks | |
47 | storing of objects that failed to be stored to the grid before | |
48 | interfacing of shuttle status table in daq system | |
49 | ||
50 | Revision 1.10 2006/08/15 10:50:00 jgrosseo | |
51 | effc++ corrections (alberto) | |
52 | ||
53 | Revision 1.9 2006/08/08 14:19:29 jgrosseo | |
54 | Update to shuttle classes (Alberto) | |
55 | ||
56 | - Possibility to set the full object's path in the Preprocessor's and | |
57 | Shuttle's Store functions | |
58 | - Possibility to extend the object's run validity in the same classes | |
59 | ("startValidity" and "validityInfinite" parameters) | |
60 | - Implementation of the StoreReferenceData function to store reference | |
61 | data in a dedicated CDB storage. | |
62 | ||
63 | Revision 1.8 2006/07/21 07:37:20 jgrosseo | |
64 | last run is stored after each run | |
65 | ||
66 | Revision 1.7 2006/07/20 09:54:40 jgrosseo | |
67 | introducing status management: The processing per subdetector is divided into several steps, | |
68 | after each step the status is stored on disk. If the system crashes in any of the steps the Shuttle | |
69 | can keep track of the number of failures and skips further processing after a certain threshold is | |
70 | exceeded. These thresholds can be configured in LDAP. | |
71 | ||
72 | Revision 1.6 2006/07/19 10:09:55 jgrosseo | |
73 | new configuration, accesst to DAQ FES (Alberto) | |
74 | ||
75 | Revision 1.5 2006/07/10 13:01:41 jgrosseo | |
76 | enhanced storing of last sucessfully processed run (alberto) | |
77 | ||
78 | Revision 1.4 2006/07/04 14:59:57 jgrosseo | |
79 | revision of AliDCSValue: Removed wrapper classes, reduced storage size per value by factor 2 | |
80 | ||
81 | Revision 1.3 2006/06/12 09:11:16 jgrosseo | |
82 | coding conventions (Alberto) | |
83 | ||
84 | Revision 1.2 2006/06/06 14:26:40 jgrosseo | |
85 | o) removed files that were moved to STEER | |
86 | o) shuttle updated to follow the new interface (Alberto) | |
87 | ||
88 | Revision 1.1 2006/03/07 07:52:34 hristov | |
89 | New version (B.Yordanov) | |
90 | ||
91 | Revision 1.5 2005/11/21 09:03:48 byordano | |
92 | one more print added | |
93 | ||
94 | Revision 1.4 2005/11/20 10:12:37 byordano | |
95 | comments added to AliShuttleTrigger | |
96 | ||
97 | */ | |
98 | ||
99 | ||
100 | // | |
101 | // This class is to deal with DAQ LogBook and DAQ "end of run" notification. | |
102 | // It has two modes: | |
103 | // 1) synchronized - Collect() | |
104 | // 2) asynchronized - Run() - starts listening for DAQ "end of run" | |
105 | // notification by DIM service. | |
106 | // | |
107 | ||
108 | #include "AliShuttleTrigger.h" | |
109 | ||
110 | #include <TSystem.h> | |
111 | #include <TGrid.h> | |
112 | #include <TObjString.h> | |
113 | ||
114 | #include "AliLog.h" | |
115 | #include "AliShuttleConfig.h" | |
116 | #include "AliShuttle.h" | |
117 | #include "DATENotifier.h" | |
118 | ||
119 | #include <fstream> | |
120 | ||
121 | ClassImp(TerminateSignalHandler) | |
122 | ClassImp(AliShuttleTrigger) | |
123 | ||
124 | //______________________________________________________________________________________________ | |
125 | Bool_t TerminateSignalHandler::Notify() | |
126 | { | |
127 | // Sentd terminate command to the Shuttle trigger | |
128 | ||
129 | AliInfo("Terminate signal received ..."); | |
130 | fTrigger->Terminate(); | |
131 | ||
132 | return kTRUE; | |
133 | } | |
134 | ||
135 | //______________________________________________________________________________________________ | |
136 | AliShuttleTrigger::AliShuttleTrigger(const AliShuttleConfig* config): | |
137 | fConfig(config), fShuttle(NULL), | |
138 | fNotified(kFALSE), fTerminate(kFALSE), | |
139 | fMutex(), fCondition(&fMutex), | |
140 | fQuitSignalHandler(0), | |
141 | fInterruptSignalHandler(0), | |
142 | fLastMailDiskSpace(0) | |
143 | { | |
144 | // | |
145 | // config - pointer to the AliShuttleConfig object which represents | |
146 | // the configuration | |
147 | // mainStorage - pointer to AliCDBStorage for the undelying CDBStorage | |
148 | // localStorage (local) CDB storage to be used if mainStorage is unavailable | |
149 | // | |
150 | ||
151 | if (!fConfig->IsValid()) AliFatal("********** !!!!! Invalid configuration !!!!! **********"); | |
152 | UInt_t timeout = fConfig->GetDCSTimeOut(); | |
153 | Int_t retries = fConfig->GetDCSRetries(); | |
154 | fShuttle = new AliShuttle(config, timeout, retries); | |
155 | ||
156 | fQuitSignalHandler = new TerminateSignalHandler(this, kSigQuit); | |
157 | fInterruptSignalHandler = new TerminateSignalHandler(this, kSigInterrupt); | |
158 | ||
159 | gSystem->AddSignalHandler(fQuitSignalHandler); | |
160 | gSystem->AddSignalHandler(fInterruptSignalHandler); | |
161 | ||
162 | } | |
163 | ||
164 | //______________________________________________________________________________________________ | |
165 | AliShuttleTrigger::~AliShuttleTrigger() | |
166 | { | |
167 | // destructor | |
168 | ||
169 | gSystem->RemoveSignalHandler(fQuitSignalHandler); | |
170 | gSystem->RemoveSignalHandler(fInterruptSignalHandler); | |
171 | ||
172 | delete fShuttle; | |
173 | ||
174 | delete fQuitSignalHandler; | |
175 | fQuitSignalHandler = 0; | |
176 | ||
177 | delete fInterruptSignalHandler; | |
178 | fInterruptSignalHandler = 0; | |
179 | } | |
180 | ||
181 | //______________________________________________________________________________________________ | |
182 | Bool_t AliShuttleTrigger::Notify() { | |
183 | // | |
184 | // Trigger Collect() methods in asynchronized (listen) mode. | |
185 | // Usually called automaticly by DATENotifier on "end of run" | |
186 | // notification event. | |
187 | // | |
188 | ||
189 | fMutex.Lock(); | |
190 | ||
191 | fNotified = kTRUE; | |
192 | fCondition.Signal(); | |
193 | ||
194 | fMutex.UnLock(); | |
195 | ||
196 | return kTRUE; | |
197 | } | |
198 | ||
199 | //______________________________________________________________________________________________ | |
200 | void AliShuttleTrigger::Terminate() { | |
201 | // | |
202 | // Stop triggers listen mode and exist from Run() | |
203 | // Usually called automaticly by TerminateSignalHandler. | |
204 | // | |
205 | ||
206 | fTerminate = kTRUE; | |
207 | fCondition.Signal(); | |
208 | } | |
209 | ||
210 | //______________________________________________________________________________________________ | |
211 | void AliShuttleTrigger::CheckTerminate() | |
212 | { | |
213 | // | |
214 | // Checks if the Shuttle got an external terminate request by a created file | |
215 | // This is an alternative to the signal which causes problems with the API libraries | |
216 | // | |
217 | ||
218 | if (strlen(fConfig->GetTerminateFilePath()) == 0) | |
219 | return; | |
220 | ||
221 | if (gSystem->AccessPathName(fConfig->GetTerminateFilePath()) == kFALSE) | |
222 | { | |
223 | AliInfo("Terminate file exists. Terminating Shuttle..."); | |
224 | fTerminate = kTRUE; | |
225 | } | |
226 | } | |
227 | ||
228 | //______________________________________________________________________________________________ | |
229 | void AliShuttleTrigger::Run() { | |
230 | // | |
231 | // AliShuttleTrigger main loop for asynchronized (listen) mode. | |
232 | // It spawns DIM service listener and waits for DAQ "end of run" | |
233 | // notification. Calls Collect() on notification. | |
234 | // | |
235 | ||
236 | fTerminate = kFALSE; | |
237 | ||
238 | DATENotifier* notifier = new DATENotifier(this, "/LOGBOOK/SUBSCRIBE/ECS_EOR"); | |
239 | ||
240 | Int_t nTry=0; | |
241 | Int_t nMaxTry = fConfig->GetMaxRetries()+1; | |
242 | Int_t received=0; | |
243 | ||
244 | AliInfo("Listening for ECS trigger"); | |
245 | ||
246 | while (1) { | |
247 | ||
248 | fMutex.Lock(); | |
249 | ||
250 | while (!(fNotified || fTerminate)) { | |
251 | for (Int_t iwait = 0; iwait < 10; iwait++){ | |
252 | received = fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait()/10); // to keep the connection to the server alive every minute while waiting for new runs | |
253 | if (received == 1) { | |
254 | if (gGrid) { | |
255 | AliInfo(Form("Keeping the connection to the server alive while waiting for new runs - %d waited from last one", fConfig->GetTriggerWait()/10*iwait)); | |
256 | gGrid->Pwd(); | |
257 | } | |
258 | else { | |
259 | AliInfo("No gGrid initialized so far, we cannot keep the connection to the server alive while waiting for new runs"); | |
260 | } | |
261 | } | |
262 | } | |
263 | //received=fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait()); | |
264 | CheckTerminate(); | |
265 | if (received==1) break; // 1 = timeout | |
266 | } | |
267 | ||
268 | fNotified = kFALSE; | |
269 | ||
270 | fMutex.UnLock(); | |
271 | ||
272 | if (fTerminate) { | |
273 | AliInfo("Terminated."); | |
274 | break; | |
275 | } | |
276 | ||
277 | if (received == 0) | |
278 | { | |
279 | AliInfo("Trigger from ECS received!"); | |
280 | } else if (received == 1) { | |
281 | AliInfo(Form("Timeout (%d s) waiting for trigger. " | |
282 | "Starting collection of new runs!", | |
283 | fConfig->GetTriggerWait())); | |
284 | } else { | |
285 | AliInfo("Error receiving trigger from ECS!"); | |
286 | break; | |
287 | } | |
288 | ||
289 | nTry++; | |
290 | AliInfo(Form("Received %d triggers so far", nTry)); | |
291 | ||
292 | if (fConfig->GetRunMode() == AliShuttleConfig::kTest) | |
293 | { | |
294 | if(nTry>=nMaxTry) | |
295 | { | |
296 | AliInfo(Form("Collect() ran more than %d times -> Exiting!", | |
297 | nMaxTry)); | |
298 | break; | |
299 | } | |
300 | } | |
301 | ||
302 | Collect(); | |
303 | CheckTerminate(); | |
304 | } | |
305 | ||
306 | delete notifier; | |
307 | } | |
308 | ||
309 | //______________________________________________________________________________________________ | |
310 | Bool_t AliShuttleTrigger::Collect(Int_t run) | |
311 | { | |
312 | // | |
313 | // this function creates a thread that runs the shuttle | |
314 | // then it checks if the shuttle is still running by checking the monitoring functions of the shuttle | |
315 | // | |
316 | ||
317 | // first checking disk space | |
318 | Long_t id = 0; | |
319 | Long_t bsize = 0; | |
320 | Long_t blocks = 0; | |
321 | Long_t bfree = 0; | |
322 | ||
323 | gSystem->GetFsInfo(fConfig->GetShuttleFileSystem(), &id, &bsize, &blocks, &bfree); | |
324 | ||
325 | AliInfo(Form("n. of free blocks = %ld, total n. of blocks = %ld",bfree,blocks)); | |
326 | Int_t spaceFree = (Int_t)(((Float_t)bfree/(Float_t)blocks)*100); | |
327 | ||
328 | if (spaceFree < fConfig->GetFreeDiskWarningThreshold()) { | |
329 | AliWarning(Form("************** Free space left = %d%%, below the Warning Threshold (%d%%)",spaceFree,fConfig->GetFreeDiskWarningThreshold())); | |
330 | if (TMath::Abs(time(0) - fLastMailDiskSpace) >= 86400){ // 86400 = n. of seconds in 1 d | |
331 | SendMailDiskSpace(fConfig->GetFreeDiskWarningThreshold()); | |
332 | fLastMailDiskSpace = time(0); // resetting fLastMailDiskSpace to time(0) = now | |
333 | } | |
334 | if (spaceFree < fConfig->GetFreeDiskFatalThreshold()){ | |
335 | AliError(Form("*************** Free space left = %d%%, below the Fatal Threshold (%d%%), terminating....",spaceFree,fConfig->GetFreeDiskFatalThreshold())); | |
336 | SendMailDiskSpace(fConfig->GetFreeDiskFatalThreshold()); | |
337 | fTerminate = kTRUE; // terminating.... | |
338 | } | |
339 | } | |
340 | ||
341 | if (fTerminate) { | |
342 | return kFALSE; | |
343 | } | |
344 | ||
345 | return fShuttle->Collect(run); | |
346 | } | |
347 | //______________________________________________________________________________________________ | |
348 | Bool_t AliShuttleTrigger::SendMailDiskSpace(Short_t percentage) | |
349 | { | |
350 | // | |
351 | // sends a mail to the shuttle experts in case of free disk space < theshold | |
352 | // | |
353 | ||
354 | ||
355 | AliInfo("******************* Sending the Mail!! *********************"); | |
356 | if (!fConfig->SendMail()) | |
357 | return kTRUE; | |
358 | ||
359 | Int_t runMode = (Int_t)fConfig->GetRunMode(); | |
360 | TString tmpStr; | |
361 | if (runMode == 0) tmpStr = " Nightly Test:"; | |
362 | else tmpStr = " Data Taking:"; | |
363 | void* dir = gSystem->OpenDirectory(fShuttle->GetShuttleLogDir()); | |
364 | if (dir == NULL) | |
365 | { | |
366 | if (gSystem->mkdir(fShuttle->GetShuttleLogDir(), kTRUE)) | |
367 | { | |
368 | AliWarning(Form("SendMail - Can't open directory <%s>", fShuttle->GetShuttleLogDir())); | |
369 | return kFALSE; | |
370 | } | |
371 | ||
372 | } else { | |
373 | gSystem->FreeDirectory(dir); | |
374 | } | |
375 | ||
376 | // SHUTTLE responsibles in to | |
377 | TString to=""; | |
378 | TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); | |
379 | TObjString *anAdmin=0; | |
380 | while ((anAdmin = (TObjString*) iterAdmins.Next())) | |
381 | { | |
382 | to += Form("%s,", anAdmin->GetName()); | |
383 | } | |
384 | if (to.Length() > 0) | |
385 | to.Remove(to.Length()-1); | |
386 | AliDebug(2, Form("to: %s",to.Data())); | |
387 | ||
388 | // mail body | |
389 | TString bodyFileName; | |
390 | bodyFileName.Form("%s/mail.body", fShuttle->GetShuttleLogDir()); | |
391 | gSystem->ExpandPathName(bodyFileName); | |
392 | ||
393 | ofstream mailBody; | |
394 | mailBody.open(bodyFileName, ofstream::out); | |
395 | ||
396 | if (!mailBody.is_open()) | |
397 | { | |
398 | AliWarning(Form("Could not open mail body file %s", bodyFileName.Data())); | |
399 | return kFALSE; | |
400 | } | |
401 | ||
402 | TString subject; | |
403 | TString body; | |
404 | ||
405 | Int_t percentage_used = 100 - percentage; | |
406 | subject = Form("%s CRITICAL Disk Space usage exceeds %d%c!", | |
407 | tmpStr.Data(),percentage_used,'%'); | |
408 | AliDebug(2, Form("subject: %s", subject.Data())); | |
409 | ||
410 | body = "Dear SHUTTLE experts, \n\n"; | |
411 | body += "The usage of the disk space on the shuttle machine has overcome \n"; | |
412 | body += Form("the threshold of %d%%. \n \n",percentage_used); | |
413 | body += "Please check! \n \n"; | |
414 | body += "Please do not answer this message directly, it is automatically generated.\n\n"; | |
415 | body += "Greetings,\n\n \t\t\tthe SHUTTLE\n"; | |
416 | ||
417 | AliDebug(2, Form("Body : %s", body.Data())); | |
418 | ||
419 | mailBody << body.Data(); | |
420 | mailBody.close(); | |
421 | ||
422 | // send mail! | |
423 | TString mailCommand = Form("mail -s \"%s\" %s < %s", | |
424 | subject.Data(), | |
425 | to.Data(), | |
426 | bodyFileName.Data()); | |
427 | AliDebug(2, Form("mail command: %s", mailCommand.Data())); | |
428 | ||
429 | Bool_t result = gSystem->Exec(mailCommand.Data()); | |
430 | ||
431 | return result == 0; | |
432 | } |