]> git.uio.no Git - u/mrichter/AliRoot.git/blame_incremental - SHUTTLE/AliShuttleTrigger.cxx
Attemting bug fix, again
[u/mrichter/AliRoot.git] / SHUTTLE / AliShuttleTrigger.cxx
... / ...
CommitLineData
1/**************************************************************************
2 * Copyright(c) 1998-1999, ALICE Experiment at CERN, All rights reserved. *
3 * *
4 * Author: The ALICE Off-line Project. *
5 * Contributors are mentioned in the code where appropriate. *
6 * *
7 * Permission to use, copy, modify and distribute this software and its *
8 * documentation strictly for non-commercial purposes is hereby granted *
9 * without fee, provided that the above copyright notice appears in all *
10 * copies and that both the copyright notice and this permission notice *
11 * appear in the supporting documentation. The authors make no claims *
12 * about the suitability of this software for any purpose. It is *
13 * provided "as is" without express or implied warranty. *
14 **************************************************************************/
15
16/*
17 $Log$
18 Revision 1.15 2007/12/10 18:29:23 acolla
19 Some log added to the listen mode
20
21 Revision 1.14 2007/12/07 19:14:36 acolla
22 in AliShuttleTrigger:
23
24 Added automatic collection of new runs on a regular time basis (settable from the configuration)
25
26 in AliShuttleConfig: new members
27
28 - triggerWait: time to wait for DIM trigger (s) before starting automatic collection of new runs
29 - mode: run mode (test, prod) -> used to build log folder (logs or logs_PROD)
30
31 in AliShuttle:
32
33 - logs now stored in logs/#RUN/DET_#RUN.log
34
35 Revision 1.13 2006/11/16 16:16:48 jgrosseo
36 introducing strict run ordering flag
37 removed giving preprocessor name to preprocessor, they have to know their name themselves ;-)
38
39 Revision 1.12 2006/10/20 15:22:59 jgrosseo
40 o) Adding time out to the execution of the preprocessors: The Shuttle forks and the parent process monitors the child
41 o) Merging Collect, CollectAll, CollectNew function
42 o) Removing implementation of empty copy constructors (declaration still there!)
43
44 Revision 1.11 2006/10/02 16:38:39 jgrosseo
45 update (alberto):
46 fixed memory leaks
47 storing of objects that failed to be stored to the grid before
48 interfacing of shuttle status table in daq system
49
50 Revision 1.10 2006/08/15 10:50:00 jgrosseo
51 effc++ corrections (alberto)
52
53 Revision 1.9 2006/08/08 14:19:29 jgrosseo
54 Update to shuttle classes (Alberto)
55
56 - Possibility to set the full object's path in the Preprocessor's and
57 Shuttle's Store functions
58 - Possibility to extend the object's run validity in the same classes
59 ("startValidity" and "validityInfinite" parameters)
60 - Implementation of the StoreReferenceData function to store reference
61 data in a dedicated CDB storage.
62
63 Revision 1.8 2006/07/21 07:37:20 jgrosseo
64 last run is stored after each run
65
66 Revision 1.7 2006/07/20 09:54:40 jgrosseo
67 introducing status management: The processing per subdetector is divided into several steps,
68 after each step the status is stored on disk. If the system crashes in any of the steps the Shuttle
69 can keep track of the number of failures and skips further processing after a certain threshold is
70 exceeded. These thresholds can be configured in LDAP.
71
72 Revision 1.6 2006/07/19 10:09:55 jgrosseo
73 new configuration, accesst to DAQ FES (Alberto)
74
75 Revision 1.5 2006/07/10 13:01:41 jgrosseo
76 enhanced storing of last sucessfully processed run (alberto)
77
78 Revision 1.4 2006/07/04 14:59:57 jgrosseo
79 revision of AliDCSValue: Removed wrapper classes, reduced storage size per value by factor 2
80
81 Revision 1.3 2006/06/12 09:11:16 jgrosseo
82 coding conventions (Alberto)
83
84 Revision 1.2 2006/06/06 14:26:40 jgrosseo
85 o) removed files that were moved to STEER
86 o) shuttle updated to follow the new interface (Alberto)
87
88 Revision 1.1 2006/03/07 07:52:34 hristov
89 New version (B.Yordanov)
90
91 Revision 1.5 2005/11/21 09:03:48 byordano
92 one more print added
93
94 Revision 1.4 2005/11/20 10:12:37 byordano
95 comments added to AliShuttleTrigger
96
97 */
98
99
100//
101// This class deals with the DAQ LogBook and the DAQ "end of run" notification.
102// It has two modes:
103// 1) synchronous - Collect()
104// 2) asynchronous - Run() - starts listening for the DAQ "end of run"
105// notification via the DIM service.
106//
107
108#include "AliShuttleTrigger.h"
109
110#include <TSystem.h>
111#include <TObjString.h>
112
113#include "AliLog.h"
114#include "AliShuttleConfig.h"
115#include "AliShuttle.h"
116#include "DATENotifier.h"
117
118#include <fstream>
119
120ClassImp(TerminateSignalHandler)
121ClassImp(AliShuttleTrigger)
122
123//______________________________________________________________________________________________
124Bool_t TerminateSignalHandler::Notify()
125{
126// Sentd terminate command to the Shuttle trigger
127
128 AliInfo("Terminate signal received ...");
129 fTrigger->Terminate();
130
131 return kTRUE;
132}
133
134//______________________________________________________________________________________________
135AliShuttleTrigger::AliShuttleTrigger(const AliShuttleConfig* config):
136 fConfig(config), fShuttle(NULL),
137 fNotified(kFALSE), fTerminate(kFALSE),
138 fMutex(), fCondition(&fMutex),
139 fQuitSignalHandler(0),
140 fInterruptSignalHandler(0),
141 fLastMailDiskSpace(0)
142{
143 //
144 // config - pointer to the AliShuttleConfig object which represents
145 // the configuration
146 // mainStorage - pointer to AliCDBStorage for the undelying CDBStorage
147 // localStorage (local) CDB storage to be used if mainStorage is unavailable
148 //
149
150 if (!fConfig->IsValid()) AliFatal("********** !!!!! Invalid configuration !!!!! **********");
151 UInt_t timeout = fConfig->GetDCSTimeOut();
152 Int_t retries = fConfig->GetDCSRetries();
153 fShuttle = new AliShuttle(config, timeout, retries);
154
155 fQuitSignalHandler = new TerminateSignalHandler(this, kSigQuit);
156 fInterruptSignalHandler = new TerminateSignalHandler(this, kSigInterrupt);
157
158 gSystem->AddSignalHandler(fQuitSignalHandler);
159 gSystem->AddSignalHandler(fInterruptSignalHandler);
160
161}
162
163//______________________________________________________________________________________________
164AliShuttleTrigger::~AliShuttleTrigger()
165{
166 // destructor
167
168 gSystem->RemoveSignalHandler(fQuitSignalHandler);
169 gSystem->RemoveSignalHandler(fInterruptSignalHandler);
170
171 delete fShuttle;
172
173 delete fQuitSignalHandler;
174 fQuitSignalHandler = 0;
175
176 delete fInterruptSignalHandler;
177 fInterruptSignalHandler = 0;
178}
179
180//______________________________________________________________________________________________
181Bool_t AliShuttleTrigger::Notify() {
182 //
183 // Trigger Collect() methods in asynchronized (listen) mode.
184 // Usually called automaticly by DATENotifier on "end of run"
185 // notification event.
186 //
187
188 fMutex.Lock();
189
190 fNotified = kTRUE;
191 fCondition.Signal();
192
193 fMutex.UnLock();
194
195 return kTRUE;
196}
197
198//______________________________________________________________________________________________
199void AliShuttleTrigger::Terminate() {
200 //
201 // Stop triggers listen mode and exist from Run()
202 // Usually called automaticly by TerminateSignalHandler.
203 //
204
205 fTerminate = kTRUE;
206 fCondition.Signal();
207}
208
209//______________________________________________________________________________________________
210void AliShuttleTrigger::CheckTerminate()
211{
212 //
213 // Checks if the Shuttle got an external terminate request by a created file
214 // This is an alternative to the signal which causes problems with the API libraries
215 //
216
217 if (strlen(fConfig->GetTerminateFilePath()) == 0)
218 return;
219
220 if (gSystem->AccessPathName(fConfig->GetTerminateFilePath()) == kFALSE)
221 {
222 AliInfo("Terminate file exists. Terminating Shuttle...");
223 fTerminate = kTRUE;
224 }
225}
226
227//______________________________________________________________________________________________
228void AliShuttleTrigger::Run() {
229 //
230 // AliShuttleTrigger main loop for asynchronized (listen) mode.
231 // It spawns DIM service listener and waits for DAQ "end of run"
232 // notification. Calls Collect() on notification.
233 //
234
235 fTerminate = kFALSE;
236
237 DATENotifier* notifier = new DATENotifier(this, "/LOGBOOK/SUBSCRIBE/ECS_EOR");
238
239 Int_t nTry=0;
240 Int_t nMaxTry = fConfig->GetMaxRetries()+1;
241 Int_t received=0;
242
243 AliInfo("Listening for ECS trigger");
244
245 while (1) {
246
247 fMutex.Lock();
248
249 while (!(fNotified || fTerminate)) {
250 received=fCondition.TimedWaitRelative(1000*fConfig->GetTriggerWait());
251 CheckTerminate();
252 if (received==1) break; // 1 = timeout
253 }
254
255 fNotified = kFALSE;
256
257 fMutex.UnLock();
258
259 if (fTerminate) {
260 AliInfo("Terminated.");
261 break;
262 }
263
264 if (received == 0)
265 {
266 AliInfo("Trigger from ECS received!");
267 } else if (received == 1) {
268 AliInfo(Form("Timeout (%d s) waiting for trigger. "
269 "Starting collection of new runs!",
270 fConfig->GetTriggerWait()));
271 } else {
272 AliInfo("Error receiving trigger from ECS!");
273 break;
274 }
275
276 nTry++;
277 AliInfo(Form("Received %d triggers so far", nTry));
278
279 if (fConfig->GetRunMode() == AliShuttleConfig::kTest)
280 {
281 if(nTry>=nMaxTry)
282 {
283 AliInfo(Form("Collect() ran more than %d times -> Exiting!",
284 nMaxTry));
285 break;
286 }
287 }
288
289 Collect();
290 CheckTerminate();
291 }
292
293 delete notifier;
294}
295
296//______________________________________________________________________________________________
297Bool_t AliShuttleTrigger::Collect(Int_t run)
298{
299 //
300 // this function creates a thread that runs the shuttle
301 // then it checks if the shuttle is still running by checking the monitoring functions of the shuttle
302 //
303
304 // first checking disk space
305 Long_t id = 0;
306 Long_t bsize = 0;
307 Long_t blocks = 0;
308 Long_t bfree = 0;
309
310 gSystem->GetFsInfo(fConfig->GetShuttleFileSystem(), &id, &bsize, &blocks, &bfree);
311
312 AliInfo(Form("n. of free blocks = %ld, total n. of blocks = %ld",bfree,blocks));
313 Int_t spaceFree = (Int_t)(((Float_t)bfree/(Float_t)blocks)*100);
314
315 if (spaceFree < fConfig->GetFreeDiskWarningThreshold()) {
316 AliWarning(Form("************** Free space left = %d%%, below the Warning Threshold (%d%%)",spaceFree,fConfig->GetFreeDiskWarningThreshold()));
317 if (TMath::Abs(time(0) - fLastMailDiskSpace) >= 86400){ // 86400 = n. of seconds in 1 d
318 SendMailDiskSpace(fConfig->GetFreeDiskWarningThreshold());
319 fLastMailDiskSpace = time(0); // resetting fLastMailDiskSpace to time(0) = now
320 }
321 if (spaceFree < fConfig->GetFreeDiskFatalThreshold()){
322 AliError(Form("*************** Free space left = %d%%, below the Fatal Threshold (%d%%), terminating....",spaceFree,fConfig->GetFreeDiskFatalThreshold()));
323 SendMailDiskSpace(fConfig->GetFreeDiskFatalThreshold());
324 fTerminate = kTRUE; // terminating....
325 }
326 }
327
328 if (fTerminate) {
329 return kFALSE;
330 }
331
332 return fShuttle->Collect(run);
333}
334//______________________________________________________________________________________________
335Bool_t AliShuttleTrigger::SendMailDiskSpace(Short_t percentage)
336{
337 //
338 // sends a mail to the shuttle experts in case of free disk space < theshold
339 //
340
341
342 AliInfo("******************* Sending the Mail!! *********************");
343 if (!fConfig->SendMail())
344 return kTRUE;
345
346 Int_t runMode = (Int_t)fConfig->GetRunMode();
347 TString tmpStr;
348 if (runMode == 0) tmpStr = " Nightly Test:";
349 else tmpStr = " Data Taking:";
350 void* dir = gSystem->OpenDirectory(fShuttle->GetShuttleLogDir());
351 if (dir == NULL)
352 {
353 if (gSystem->mkdir(fShuttle->GetShuttleLogDir(), kTRUE))
354 {
355 AliWarning(Form("SendMail - Can't open directory <%s>", fShuttle->GetShuttleLogDir()));
356 return kFALSE;
357 }
358
359 } else {
360 gSystem->FreeDirectory(dir);
361 }
362
363 // SHUTTLE responsibles in to
364 TString to="";
365 TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal));
366 TObjString *anAdmin=0;
367 while ((anAdmin = (TObjString*) iterAdmins.Next()))
368 {
369 to += Form("%s,", anAdmin->GetName());
370 }
371 if (to.Length() > 0)
372 to.Remove(to.Length()-1);
373 AliDebug(2, Form("to: %s",to.Data()));
374
375 // mail body
376 TString bodyFileName;
377 bodyFileName.Form("%s/mail.body", fShuttle->GetShuttleLogDir());
378 gSystem->ExpandPathName(bodyFileName);
379
380 ofstream mailBody;
381 mailBody.open(bodyFileName, ofstream::out);
382
383 if (!mailBody.is_open())
384 {
385 AliWarning(Form("Could not open mail body file %s", bodyFileName.Data()));
386 return kFALSE;
387 }
388
389 TString subject;
390 TString body;
391
392 Int_t percentage_used = 100 - percentage;
393 subject = Form("%s CRITICAL Disk Space usage exceeds %d%c!",
394 tmpStr.Data(),percentage_used,'%');
395 AliDebug(2, Form("subject: %s", subject.Data()));
396
397 body = "Dear SHUTTLE experts, \n\n";
398 body += "The usage of the disk space on the shuttle machine has overcome \n";
399 body += Form("the threshold of %d%%. \n \n",percentage_used);
400 body += "Please check! \n \n";
401 body += "Please do not answer this message directly, it is automatically generated.\n\n";
402 body += "Greetings,\n\n \t\t\tthe SHUTTLE\n";
403
404 AliDebug(2, Form("Body : %s", body.Data()));
405
406 mailBody << body.Data();
407 mailBody.close();
408
409 // send mail!
410 TString mailCommand = Form("mail -s \"%s\" %s < %s",
411 subject.Data(),
412 to.Data(),
413 bodyFileName.Data());
414 AliDebug(2, Form("mail command: %s", mailCommand.Data()));
415
416 Bool_t result = gSystem->Exec(mailCommand.Data());
417
418 return result == 0;
419}