]>
Commit | Line | Data |
---|---|---|
1 | /************************************************************************** | |
2 | * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. * | |
3 | * * | |
4 | * Author: The ALICE Off-line Project. * | |
5 | * Contributors are mentioned in the code where appropriate. * | |
6 | * * | |
7 | * Permission to use, copy, modify and distribute this software and its * | |
8 | * documentation strictly for non-commercial purposes is hereby granted * | |
9 | * without fee, provided that the above copyright notice appears in all * | |
10 | * copies and that both the copyright notice and this permission notice * | |
11 | * appear in the supporting documentation. The authors make no claims * | |
12 | * about the suitability of this software for any purpose. It is * | |
13 | * provided "as is" without express or implied warranty. * | |
14 | **************************************************************************/ | |
15 | ||
16 | /* | |
17 | $Log$ | |
18 | Revision 1.15 2007/12/10 18:29:23 acolla | |
19 | Some log added to the listen mode | |
20 | ||
21 | Revision 1.14 2007/12/07 19:14:36 acolla | |
22 | in AliShuttleTrigger: | |
23 | ||
24 | Added automatic collection of new runs on a regular time basis (settable from the configuration) | |
25 | ||
26 | in AliShuttleConfig: new members | |
27 | ||
28 | - triggerWait: time to wait for DIM trigger (s) before starting automatic collection of new runs | |
29 | - mode: run mode (test, prod) -> used to build log folder (logs or logs_PROD) | |
30 | ||
31 | in AliShuttle: | |
32 | ||
33 | - logs now stored in logs/#RUN/DET_#RUN.log | |
34 | ||
35 | Revision 1.13 2006/11/16 16:16:48 jgrosseo | |
36 | introducing strict run ordering flag | |
37 | removed giving preprocessor name to preprocessor, they have to know their name themselves ;-) | |
38 | ||
39 | Revision 1.12 2006/10/20 15:22:59 jgrosseo | |
40 | o) Adding time out to the execution of the preprocessors: The Shuttle forks and the parent process monitors the child | |
41 | o) Merging Collect, CollectAll, CollectNew function | |
42 | o) Removing implementation of empty copy constructors (declaration still there!) | |
43 | ||
44 | Revision 1.11 2006/10/02 16:38:39 jgrosseo | |
45 | update (alberto): | |
46 | fixed memory leaks | |
47 | storing of objects that failed to be stored to the grid before | |
48 | interfacing of shuttle status table in daq system | |
49 | ||
50 | Revision 1.10 2006/08/15 10:50:00 jgrosseo | |
51 | effc++ corrections (alberto) | |
52 | ||
53 | Revision 1.9 2006/08/08 14:19:29 jgrosseo | |
54 | Update to shuttle classes (Alberto) | |
55 | ||
56 | - Possibility to set the full object's path in the Preprocessor's and | |
57 | Shuttle's Store functions | |
58 | - Possibility to extend the object's run validity in the same classes | |
59 | ("startValidity" and "validityInfinite" parameters) | |
60 | - Implementation of the StoreReferenceData function to store reference | |
61 | data in a dedicated CDB storage. | |
62 | ||
63 | Revision 1.8 2006/07/21 07:37:20 jgrosseo | |
64 | last run is stored after each run | |
65 | ||
66 | Revision 1.7 2006/07/20 09:54:40 jgrosseo | |
67 | introducing status management: The processing per subdetector is divided into several steps, | |
68 | after each step the status is stored on disk. If the system crashes in any of the steps the Shuttle | |
69 | can keep track of the number of failures and skips further processing after a certain threshold is | |
70 | exceeded. These thresholds can be configured in LDAP. | |
71 | ||
72 | Revision 1.6 2006/07/19 10:09:55 jgrosseo | |
73 | new configuration, accesst to DAQ FES (Alberto) | |
74 | ||
75 | Revision 1.5 2006/07/10 13:01:41 jgrosseo | |
76 | enhanced storing of last sucessfully processed run (alberto) | |
77 | ||
78 | Revision 1.4 2006/07/04 14:59:57 jgrosseo | |
79 | revision of AliDCSValue: Removed wrapper classes, reduced storage size per value by factor 2 | |
80 | ||
81 | Revision 1.3 2006/06/12 09:11:16 jgrosseo | |
82 | coding conventions (Alberto) | |
83 | ||
84 | Revision 1.2 2006/06/06 14:26:40 jgrosseo | |
85 | o) removed files that were moved to STEER | |
86 | o) shuttle updated to follow the new interface (Alberto) | |
87 | ||
88 | Revision 1.1 2006/03/07 07:52:34 hristov | |
89 | New version (B.Yordanov) | |
90 | ||
91 | Revision 1.5 2005/11/21 09:03:48 byordano | |
92 | one more print added | |
93 | ||
94 | Revision 1.4 2005/11/20 10:12:37 byordano | |
95 | comments added to AliShuttleTrigger | |
96 | ||
97 | */ | |
98 | ||
99 | ||
100 | // | |
101 | // This class is to deal with DAQ LogBook and DAQ "end of run" notification. | |
102 | // It has two modes: | |
103 | // 1) synchronized - Collect() | |
104 | // 2) asynchronized - Run() - starts listening for DAQ "end of run" | |
105 | // notification by DIM service. | |
106 | // | |
107 | ||
108 | #include "AliShuttleTrigger.h" | |
109 | ||
110 | #include <TSystem.h> | |
111 | #include <TObjString.h> | |
112 | ||
113 | #include "AliLog.h" | |
114 | #include "AliShuttleConfig.h" | |
115 | #include "AliShuttle.h" | |
116 | #include "DATENotifier.h" | |
117 | ||
118 | #include <fstream> | |
119 | ||
120 | ClassImp(TerminateSignalHandler) | |
121 | ClassImp(AliShuttleTrigger) | |
122 | ||
123 | //______________________________________________________________________________________________ | |
124 | Bool_t TerminateSignalHandler::Notify() | |
125 | { | |
126 | // Sentd terminate command to the Shuttle trigger | |
127 | ||
128 | AliInfo("Terminate signal received ..."); | |
129 | fTrigger->Terminate(); | |
130 | ||
131 | return kTRUE; | |
132 | } | |
133 | ||
134 | //______________________________________________________________________________________________ | |
135 | AliShuttleTrigger::AliShuttleTrigger(const AliShuttleConfig* config): | |
136 | fConfig(config), fShuttle(NULL), | |
137 | fNotified(kFALSE), fTerminate(kFALSE), | |
138 | fMutex(), fCondition(&fMutex), | |
139 | fQuitSignalHandler(0), | |
140 | fInterruptSignalHandler(0), | |
141 | fLastMailDiskSpace(0) | |
142 | { | |
143 | // | |
144 | // config - pointer to the AliShuttleConfig object which represents | |
145 | // the configuration | |
146 | // mainStorage - pointer to AliCDBStorage for the undelying CDBStorage | |
147 | // localStorage (local) CDB storage to be used if mainStorage is unavailable | |
148 | // | |
149 | ||
150 | if (!fConfig->IsValid()) AliFatal("********** !!!!! Invalid configuration !!!!! **********"); | |
151 | UInt_t timeout = fConfig->GetDCSTimeOut(); | |
152 | Int_t retries = fConfig->GetDCSRetries(); | |
153 | fShuttle = new AliShuttle(config, timeout, retries); | |
154 | ||
155 | fQuitSignalHandler = new TerminateSignalHandler(this, kSigQuit); | |
156 | fInterruptSignalHandler = new TerminateSignalHandler(this, kSigInterrupt); | |
157 | ||
158 | gSystem->AddSignalHandler(fQuitSignalHandler); | |
159 | gSystem->AddSignalHandler(fInterruptSignalHandler); | |
160 | ||
161 | } | |
162 | ||
163 | //______________________________________________________________________________________________ | |
164 | AliShuttleTrigger::~AliShuttleTrigger() | |
165 | { | |
166 | // destructor | |
167 | ||
168 | gSystem->RemoveSignalHandler(fQuitSignalHandler); | |
169 | gSystem->RemoveSignalHandler(fInterruptSignalHandler); | |
170 | ||
171 | delete fShuttle; | |
172 | ||
173 | delete fQuitSignalHandler; | |
174 | fQuitSignalHandler = 0; | |
175 | ||
176 | delete fInterruptSignalHandler; | |
177 | fInterruptSignalHandler = 0; | |
178 | } | |
179 | ||
180 | //______________________________________________________________________________________________ | |
181 | Bool_t AliShuttleTrigger::Notify() { | |
182 | // | |
183 | // Trigger Collect() methods in asynchronized (listen) mode. | |
184 | // Usually called automaticly by DATENotifier on "end of run" | |
185 | // notification event. | |
186 | // | |
187 | ||
188 | fMutex.Lock(); | |
189 | ||
190 | fNotified = kTRUE; | |
191 | fCondition.Signal(); | |
192 | ||
193 | fMutex.UnLock(); | |
194 | ||
195 | return kTRUE; | |
196 | } | |
197 | ||
198 | //______________________________________________________________________________________________ | |
199 | void AliShuttleTrigger::Terminate() { | |
200 | // | |
201 | // Stop triggers listen mode and exist from Run() | |
202 | // Usually called automaticly by TerminateSignalHandler. | |
203 | // | |
204 | ||
205 | fTerminate = kTRUE; | |
206 | fCondition.Signal(); | |
207 | } | |
208 | ||
209 | //______________________________________________________________________________________________ | |
210 | void AliShuttleTrigger::CheckTerminate() | |
211 | { | |
212 | // | |
213 | // Checks if the Shuttle got an external terminate request by a created file | |
214 | // This is an alternative to the signal which causes problems with the API libraries | |
215 | // | |
216 | ||
217 | if (strlen(fConfig->GetTerminateFilePath()) == 0) | |
218 | return; | |
219 | ||
220 | if (gSystem->AccessPathName(fConfig->GetTerminateFilePath()) == kFALSE) | |
221 | { | |
222 | AliInfo("Terminate file exists. Terminating Shuttle..."); | |
223 | fTerminate = kTRUE; | |
224 | } | |
225 | } | |
226 | ||
227 | //______________________________________________________________________________________________ | |
228 | void AliShuttleTrigger::Run() { | |
229 | // | |
230 | // AliShuttleTrigger main loop for asynchronized (listen) mode. | |
231 | // It spawns DIM service listener and waits for DAQ "end of run" | |
232 | // notification. Calls Collect() on notification. | |
233 | // | |
234 | ||
235 | fTerminate = kFALSE; | |
236 | ||
237 | DATENotifier* notifier = new DATENotifier(this, "/LOGBOOK/SUBSCRIBE/ECS_EOR"); | |
238 | ||
239 | Int_t nTry=0; | |
240 | Int_t nMaxTry = fConfig->GetMaxRetries()+1; | |
241 | Int_t received=0; | |
242 | ||
243 | AliInfo("Listening for ECS trigger"); | |
244 | ||
245 | while (1) { | |
246 | ||
247 | fMutex.Lock(); | |
248 | ||
249 | while (!(fNotified || fTerminate)) { | |
250 | received=fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait()); | |
251 | CheckTerminate(); | |
252 | if (received==1) break; // 1 = timeout | |
253 | } | |
254 | ||
255 | fNotified = kFALSE; | |
256 | ||
257 | fMutex.UnLock(); | |
258 | ||
259 | if (fTerminate) { | |
260 | AliInfo("Terminated."); | |
261 | break; | |
262 | } | |
263 | ||
264 | if (received == 0) | |
265 | { | |
266 | AliInfo("Trigger from ECS received!"); | |
267 | } else if (received == 1) { | |
268 | AliInfo(Form("Timeout (%d s) waiting for trigger. " | |
269 | "Starting collection of new runs!", | |
270 | fConfig->GetTriggerWait())); | |
271 | } else { | |
272 | AliInfo("Error receiving trigger from ECS!"); | |
273 | break; | |
274 | } | |
275 | ||
276 | nTry++; | |
277 | AliInfo(Form("Received %d triggers so far", nTry)); | |
278 | ||
279 | if (fConfig->GetRunMode() == AliShuttleConfig::kTest) | |
280 | { | |
281 | if(nTry>=nMaxTry) | |
282 | { | |
283 | AliInfo(Form("Collect() ran more than %d times -> Exiting!", | |
284 | nMaxTry)); | |
285 | break; | |
286 | } | |
287 | } | |
288 | ||
289 | Collect(); | |
290 | CheckTerminate(); | |
291 | } | |
292 | ||
293 | delete notifier; | |
294 | } | |
295 | ||
296 | //______________________________________________________________________________________________ | |
297 | Bool_t AliShuttleTrigger::Collect(Int_t run) | |
298 | { | |
299 | // | |
300 | // this function creates a thread that runs the shuttle | |
301 | // then it checks if the shuttle is still running by checking the monitoring functions of the shuttle | |
302 | // | |
303 | ||
304 | // first checking disk space | |
305 | Long_t id = 0; | |
306 | Long_t bsize = 0; | |
307 | Long_t blocks = 0; | |
308 | Long_t bfree = 0; | |
309 | ||
310 | gSystem->GetFsInfo(fConfig->GetShuttleFileSystem(), &id, &bsize, &blocks, &bfree); | |
311 | ||
312 | AliInfo(Form("n. of free blocks = %ld, total n. of blocks = %ld",bfree,blocks)); | |
313 | Int_t spaceFree = (Int_t)(((Float_t)bfree/(Float_t)blocks)*100); | |
314 | ||
315 | if (spaceFree < fConfig->GetFreeDiskWarningThreshold()) { | |
316 | AliWarning(Form("************** Free space left = %d%%, below the Warning Threshold (%d%%)",spaceFree,fConfig->GetFreeDiskWarningThreshold())); | |
317 | if (TMath::Abs(time(0) - fLastMailDiskSpace) >= 86400){ // 86400 = n. of seconds in 1 d | |
318 | SendMailDiskSpace(fConfig->GetFreeDiskWarningThreshold()); | |
319 | fLastMailDiskSpace = time(0); // resetting fLastMailDiskSpace to time(0) = now | |
320 | } | |
321 | if (spaceFree < fConfig->GetFreeDiskFatalThreshold()){ | |
322 | AliError(Form("*************** Free space left = %d%%, below the Fatal Threshold (%d%%), terminating....",spaceFree,fConfig->GetFreeDiskFatalThreshold())); | |
323 | SendMailDiskSpace(fConfig->GetFreeDiskFatalThreshold()); | |
324 | fTerminate = kTRUE; // terminating.... | |
325 | } | |
326 | } | |
327 | ||
328 | if (fTerminate) { | |
329 | return kFALSE; | |
330 | } | |
331 | ||
332 | return fShuttle->Collect(run); | |
333 | } | |
334 | //______________________________________________________________________________________________ | |
335 | Bool_t AliShuttleTrigger::SendMailDiskSpace(Short_t percentage) | |
336 | { | |
337 | // | |
338 | // sends a mail to the shuttle experts in case of free disk space < theshold | |
339 | // | |
340 | ||
341 | ||
342 | AliInfo("******************* Sending the Mail!! *********************"); | |
343 | if (!fConfig->SendMail()) | |
344 | return kTRUE; | |
345 | ||
346 | Int_t runMode = (Int_t)fConfig->GetRunMode(); | |
347 | TString tmpStr; | |
348 | if (runMode == 0) tmpStr = " Nightly Test:"; | |
349 | else tmpStr = " Data Taking:"; | |
350 | void* dir = gSystem->OpenDirectory(fShuttle->GetShuttleLogDir()); | |
351 | if (dir == NULL) | |
352 | { | |
353 | if (gSystem->mkdir(fShuttle->GetShuttleLogDir(), kTRUE)) | |
354 | { | |
355 | AliWarning(Form("SendMail - Can't open directory <%s>", fShuttle->GetShuttleLogDir())); | |
356 | return kFALSE; | |
357 | } | |
358 | ||
359 | } else { | |
360 | gSystem->FreeDirectory(dir); | |
361 | } | |
362 | ||
363 | // SHUTTLE responsibles in to | |
364 | TString to=""; | |
365 | TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); | |
366 | TObjString *anAdmin=0; | |
367 | while ((anAdmin = (TObjString*) iterAdmins.Next())) | |
368 | { | |
369 | to += Form("%s,", anAdmin->GetName()); | |
370 | } | |
371 | if (to.Length() > 0) | |
372 | to.Remove(to.Length()-1); | |
373 | AliDebug(2, Form("to: %s",to.Data())); | |
374 | ||
375 | // mail body | |
376 | TString bodyFileName; | |
377 | bodyFileName.Form("%s/mail.body", fShuttle->GetShuttleLogDir()); | |
378 | gSystem->ExpandPathName(bodyFileName); | |
379 | ||
380 | ofstream mailBody; | |
381 | mailBody.open(bodyFileName, ofstream::out); | |
382 | ||
383 | if (!mailBody.is_open()) | |
384 | { | |
385 | AliWarning(Form("Could not open mail body file %s", bodyFileName.Data())); | |
386 | return kFALSE; | |
387 | } | |
388 | ||
389 | TString subject; | |
390 | TString body; | |
391 | ||
392 | Int_t percentage_used = 100 - percentage; | |
393 | subject = Form("%s CRITICAL Disk Space usage exceeds %d%c!", | |
394 | tmpStr.Data(),percentage_used,'%'); | |
395 | AliDebug(2, Form("subject: %s", subject.Data())); | |
396 | ||
397 | body = "Dear SHUTTLE experts, \n\n"; | |
398 | body += "The usage of the disk space on the shuttle machine has overcome \n"; | |
399 | body += Form("the threshold of %d%%. \n \n",percentage_used); | |
400 | body += "Please check! \n \n"; | |
401 | body += "Please do not answer this message directly, it is automatically generated.\n\n"; | |
402 | body += "Greetings,\n\n \t\t\tthe SHUTTLE\n"; | |
403 | ||
404 | AliDebug(2, Form("Body : %s", body.Data())); | |
405 | ||
406 | mailBody << body.Data(); | |
407 | mailBody.close(); | |
408 | ||
409 | // send mail! | |
410 | TString mailCommand = Form("mail -s \"%s\" %s < %s", | |
411 | subject.Data(), | |
412 | to.Data(), | |
413 | bodyFileName.Data()); | |
414 | AliDebug(2, Form("mail command: %s", mailCommand.Data())); | |
415 | ||
416 | Bool_t result = gSystem->Exec(mailCommand.Data()); | |
417 | ||
418 | return result == 0; | |
419 | } |