]> git.uio.no Git - u/mrichter/AliRoot.git/blobdiff - SHUTTLE/AliShuttle.cxx
differentiate status that is set upon timeout (Chiara)
[u/mrichter/AliRoot.git] / SHUTTLE / AliShuttle.cxx
index bdc3a7405b8d25b8d8928f9b7b21735e0a0063e3..565606e873f2d6b10ee18ae2b6cc0585c5bd7278 100644 (file)
@@ -256,7 +256,7 @@ Bool_t AliShuttle::StoreOCDB()
        //
        
        UpdateShuttleStatus(AliShuttleStatus::kStoreStarted);
-                               
+                       
        if (fTestMode & kErrorGrid)
        {
                Log("SHUTTLE", "StoreOCDB - In TESTMODE - Simulating error while storing in the Grid");
@@ -375,6 +375,8 @@ Int_t AliShuttle::StoreOCDB(const TString& gridURI)
                AliCDBId aLocId = aLocEntry->GetId();
                aLocEntry->SetVersion(-1);
                aLocEntry->SetSubVersion(-1);
+                                               
+               Log(fCurrentDetector.Data(), Form("Attempting to store %s", aLocId.ToString().Data()));
 
                // If local object is valid up to infinity we store it only if it is
                // the first unprocessed run!
@@ -395,47 +397,51 @@ Int_t AliShuttle::StoreOCDB(const TString& gridURI)
                Bool_t store = kTRUE;
                TIter gridIter(gridIds);
                AliCDBId* aGridId = 0;
-               while((aGridId = dynamic_cast<AliCDBId*> (gridIter.Next()))){
-                       if(aGridId->GetPath() != aLocId.GetPath()) continue;
+               while ((aGridId = dynamic_cast<AliCDBId*> (gridIter.Next()))) {
+                       if (aGridId->GetPath() != aLocId.GetPath()) 
+                               continue;
                        // skip all objects valid up to infinity
-                       if(aGridId->GetLastRun() == AliCDBRunRange::Infinity()) continue;
+                       if (aGridId->GetLastRun() == AliCDBRunRange::Infinity()) 
+                               continue;
+                       
                        // if we get here, it means there's already some more recent object stored on Grid!
+                       Log(fCurrentDetector.Data(),
+                               Form("StoreOCDB - A more recent object already exists in %s storage: <%s>",
+                               type, aGridId->ToString().Data()));
+                       
                        store = kFALSE;
                        break;
                }
 
-               // If we get here, the file can be stored!
-               Bool_t storeOk = gridSto->Put(aLocEntry);
-               if(!store || storeOk){
-
-                       if (!store)
-                       {
-                               Log(fCurrentDetector.Data(),
-                                       Form("StoreOCDB - A more recent object already exists in %s storage: <%s>",
-                                               type, aGridId->ToString().Data()));
-                       } else {
+               Bool_t storeOk = kFALSE;
+               if (store)
+               {
+                       Log(fCurrentDetector.Data(), Form("Prechecks succeeded. Ready to store %s", aLocId.ToString().Data()));
+                       storeOk = gridSto->Put(aLocEntry);
+                       if (storeOk) {
                                Log("SHUTTLE",
-                                       Form("StoreOCDB - Object <%s> successfully put into %s storage",
-                                               aLocId.ToString().Data(), type));
+                               Form("StoreOCDB - Object <%s> successfully put into %s storage",
+                                       aLocId.ToString().Data(), type));
                                Log(fCurrentDetector.Data(),
                                        Form("StoreOCDB - Object <%s> successfully put into %s storage",
-                                               aLocId.ToString().Data(), type));
+                                       aLocId.ToString().Data(), type));
+                       } else  {
+                               Log("SHUTTLE",
+                                       Form("StoreOCDB - Grid %s storage of object <%s> failed",
+                                       type, aLocId.ToString().Data()));
+                               Log(fCurrentDetector.Data(),
+                                       Form("StoreOCDB - Grid %s storage of object <%s> failed",
+                                       type, aLocId.ToString().Data()));
+                               result = kFALSE;
                        }
-
-                       // removing local filename...
+               }
+               
+               if (!store || storeOk) {
+                       // removing local file...
                        TString filename;
                        localSto->IdToFilename(aLocId, filename);
                        Log("SHUTTLE", Form("StoreOCDB - Removing local file %s", filename.Data()));
                        RemoveFile(filename.Data());
-                       continue;
-               } else  {
-                       Log("SHUTTLE",
-                               Form("StoreOCDB - Grid %s storage of object <%s> failed",
-                                       type, aLocId.ToString().Data()));
-                       Log(fCurrentDetector.Data(),
-                               Form("StoreOCDB - Grid %s storage of object <%s> failed",
-                                       type, aLocId.ToString().Data()));
-                       result = kFALSE;
                }
        }
        localEntries->Clear();
@@ -982,7 +988,7 @@ Bool_t AliShuttle::WriteShuttleStatus(AliShuttleStatus* status)
                return kFALSE;
        }
        
-       SendMLInfo();
+       SendMLDetInfo();
 
        return kTRUE;
 }
@@ -1018,11 +1024,11 @@ void AliShuttle::UpdateShuttleStatus(AliShuttleStatus::Status newStatus, Bool_t
 
        AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry);
 
-       SendMLInfo();
+       SendMLDetInfo();
 }
 
 //______________________________________________________________________________________________
-void AliShuttle::SendMLInfo()
+void AliShuttle::SendMLDetInfo()
 {
        //
        // sends ML information about the current status of the current detector being processed
@@ -1031,7 +1037,7 @@ void AliShuttle::SendMLInfo()
        AliShuttleStatus* status = dynamic_cast<AliShuttleStatus*> (fStatusEntry->GetObject());
        
        if (!status){
-               Log("SHUTTLE", "SendMLInfo - UNEXPECTED: status could not be read from current CDB entry");
+               Log("SHUTTLE", "SendMLDetInfo - UNEXPECTED: status could not be read from current CDB entry");
                return;
        }
        
@@ -1180,6 +1186,68 @@ Bool_t AliShuttle::ContinueProcessing()
        return cont;
 }
 
+//______________________________________________________________________________________________
+void AliShuttle::SendMLRunInfo(const char* status)
+{
+       // 
+       // Sends two text parameters for the current run to ML through fMonaLisa: "SHUTTLE_status" (the given status) and "SHUTTLE_runtype"
+       // The run type comes from fLogbookEntry; the "log" run parameter is appended in parentheses when it is non-empty. NOTE(review): assumes fLogbookEntry is set and GetRunParameter("log") never returns null - TODO confirm
+       TMonaLisaText  mlStatus("SHUTTLE_status", status);
+       TString runType(fLogbookEntry->GetRunType());
+       if (strlen(fLogbookEntry->GetRunParameter("log")) > 0){
+
+               runType += "(";
+               runType += fLogbookEntry->GetRunParameter("log");
+               runType += ")";
+       }
+       TMonaLisaText  mlRunType("SHUTTLE_runtype", runType);
+
+       TList mlList;
+       mlList.Add(&mlStatus);
+       mlList.Add(&mlRunType);
+
+       TString mlID;
+       mlID.Form("%d", GetCurrentRun());
+       fMonaLisa->SendParameters(&mlList, mlID);       
+}
+
+//______________________________________________________________________________________________
+Int_t AliShuttle::GetMem(Int_t pid)
+{
+       // invokes "ps -o vsize --pid <pid> | tail -n 1" through gSystem->OpenPipe and returns the last output line parsed as an integer (the vsize of process <pid>; presumably in KiB as printed by ps - TODO confirm)
+       // returns -1 in case of error: the pipe cannot be opened, ps prints nothing, or the line does not parse to a non-zero integer; each case is logged to "SHUTTLE"
+       // NOTE(review): the log messages below carry the prefix "Process - " although they are emitted from GetMem
+       TString checkStr;
+       checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid);
+       FILE* pipe = gSystem->OpenPipe(checkStr, "r");
+       if (!pipe)
+       {
+               Log("SHUTTLE", Form("Process - Error: "
+                       "Could not open pipe to %s", checkStr.Data()));
+               return -1;
+       }
+               
+       char buffer[100];
+       if (!fgets(buffer, 100, pipe))
+       {
+               Log("SHUTTLE", "Process - Error: ps did not return anything");
+               gSystem->ClosePipe(pipe);
+               return -1;
+       }
+       gSystem->ClosePipe(pipe);
+       
+       //Log("SHUTTLE", Form("ps returned %s", buffer));
+       
+       Int_t mem = 0;
+       if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem)
+       {
+               Log("SHUTTLE", "Process - Error: Could not parse output of ps");
+               return -1;
+       }
+       
+       return mem;
+}
+
 //______________________________________________________________________________________________
 Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
 {
@@ -1197,26 +1265,10 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
        Log("SHUTTLE", Form("\t\t\t^*^*^*^*^*^*^*^*^*^*^*^* run %d: START ^*^*^*^*^*^*^*^*^*^*^*^*",
                                        GetCurrentRun()));
 
-       // Send the information to ML
        CountOpenRuns();
        
-       TMonaLisaText  mlStatus("SHUTTLE_status", "Processing");
-       TString runType(entry->GetRunType());
-       if (strlen(entry->GetRunParameter("log")) > 0){
-
-               runType += "(";
-               runType += entry->GetRunParameter("log");
-               runType += ")";
-       }
-       TMonaLisaText  mlRunType("SHUTTLE_runtype", runType);
-
-       TList mlList;
-       mlList.Add(&mlStatus);
-       mlList.Add(&mlRunType);
-
-       TString mlID;
-       mlID.Form("%d", GetCurrentRun());
-       fMonaLisa->SendParameters(&mlList, mlID);
+       // Send the information to ML
+       SendMLRunInfo("Processing");
 
        if (fLogbookEntry->IsDone())
        {
@@ -1306,6 +1358,9 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
                                                GetCurrentRun(), aDetector->GetName()));
 
                for(Int_t iSys=0;iSys<3;iSys++) fFXSCalled[iSys]=kFALSE;
+               
+               Int_t initialMem = GetMem(getpid());
+               Log("SHUTTLE", Form("Memory consumption before forking is %d", initialMem));
 
                Log(fCurrentDetector.Data(), "Process - Starting processing");
 
@@ -1330,16 +1385,42 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
 
                                if (expiredTime > fConfig->GetPPTimeOut())
                                {
-                                       TString tmp;
-                                       tmp.Form("Process - Process of %s time out. "
-                                                       "Run time: %d seconds. Killing...",
-                                                       fCurrentDetector.Data(), expiredTime);
-                                       Log("SHUTTLE", tmp);
-                                       Log(fCurrentDetector, tmp);
+                                        TString logMsg;
+                                       AliShuttleStatus *currentStatus = ReadShuttleStatus();
+                                       AliShuttleStatus::Status newStatus = AliShuttleStatus::kInvalid;
+                                       
+                                       if (currentStatus->GetStatus() <= AliShuttleStatus::kPPDone)
+                                       {
+                                               // in case pp not yet done set status to kPPTimeOut
+                                       
+                                               logMsg.Form("Process - Process of %s timed out. Run time: %d seconds. Killing...",
+                                                               fCurrentDetector.Data(), expiredTime);
+                                               newStatus = AliShuttleStatus::kPPTimeOut;
+                                       }
+                                       else if (currentStatus->GetStatus() == AliShuttleStatus::kStoreStarted)
+                                       {
+                                               // in case the pp goes in TimeOut while storing the objects in the OCDB
+                                               // set status to kStoreError
+                                               
+                                               logMsg.Form("Process - Process of %s timed out while storing the OCDB object. Run time: %d seconds. Killing... and setting status to StoreError.",
+                                                               fCurrentDetector.Data(), expiredTime);
+                                               newStatus = AliShuttleStatus::kStoreError;
+                                       }
+                                       else 
+                                       {
+                                               // in other cases don't change the status
+                                               
+                                               logMsg.Form("Process - Process of %s timed out in status = %s. Run time: %d seconds. Killing... without changing the status",
+                                                               fCurrentDetector.Data(), currentStatus->GetStatusName(), expiredTime);
+                                       }
+                               
+                                       Log("SHUTTLE", logMsg);
+                                       Log(fCurrentDetector, logMsg);
 
                                        kill(pid, 9);
 
-                                       UpdateShuttleStatus(AliShuttleStatus::kPPTimeOut);
+                                       if (newStatus != AliShuttleStatus::kInvalid)
+                                               UpdateShuttleStatus(newStatus);
                                        hasError = kTRUE;
 
                                        gSystem->Sleep(1000);
@@ -1348,33 +1429,14 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
                                {
                                        gSystem->Sleep(1000);
                                        
-                                       TString checkStr;
-                                       checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid);
-                                       FILE* pipe = gSystem->OpenPipe(checkStr, "r");
-                                       if (!pipe)
-                                       {
-                                               Log("SHUTTLE", Form("Process - Error: "
-                                                       "Could not open pipe to %s", checkStr.Data()));
+                                       Int_t mem = GetMem(pid);
+
+                                       if (mem < 0)
                                                continue;
-                                       }
                                                
-                                       char buffer[100];
-                                       if (!fgets(buffer, 100, pipe))
-                                       {
-                                               Log("SHUTTLE", "Process - Error: ps did not return anything");
-                                               gSystem->ClosePipe(pipe);
-                                               continue;
-                                       }
-                                       gSystem->ClosePipe(pipe);
-                                       
-                                       //Log("SHUTTLE", Form("ps returned %s", buffer));
-                                       
-                                       Int_t mem = 0;
-                                       if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem)
-                                       {
-                                               Log("SHUTTLE", "Process - Error: Could not parse output of ps");
-                                               continue;
-                                       }
+                                       mem -= initialMem;
+                                       if (mem < 0)
+                                               mem = 0;
                                        
                                        if (expiredTime % 60 == 0)
                                        {
@@ -1541,10 +1603,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
                                        fFirstUnprocessed[iDet] = kFALSE;
                                }
                        }
-                       TMonaLisaText  mlStatusPending("SHUTTLE_status", "Pending");
-                       mlList.Clear();
-                       mlList.Add(&mlStatusPending);
-                       fMonaLisa->SendParameters(&mlList, mlID);
+                       SendMLRunInfo("Pending");
                }
        }
 
@@ -1905,16 +1964,28 @@ AliShuttleLogbookEntry* AliShuttle::QueryRunParameters(Int_t run)
        TString totEventsStr = entry->GetRunParameter("totalEvents");  
        Int_t totEvents = totEventsStr.Atoi();
        
-       if (startTime != 0 && endTime != 0 && endTime > startTime && totEvents > 0 && ecsSuccess)
+       UInt_t now = time(0);
+       // TODO make this a configuration parameter
+       Int_t dcsDelay = 120;
+       
+       // runs are accepted if they have ecsSuccess set or more than 1 event
+       if (startTime != 0 && endTime != 0 && endTime > startTime && (totEvents > 1 || ecsSuccess) && (endTime < now - dcsDelay))
+       {
+               if (ecsSuccess == kFALSE)
+                       Log("SHUTTLE", Form("Processing run %d although in status ECS failure, Reason: %s", run, entry->GetRunParameter("eor_reason")));
                return entry;
-               
-       if (ecsSuccess == kFALSE)
+       }
+
+       Bool_t skip = kFALSE;
+                               
+       if (totEvents <= 1) 
        {
-               Log("SHUTTLE", Form("Skipped run %d due to ECS failure, Reason: %s", run, entry->GetRunParameter("eor_reason")));
+               Log("SHUTTLE", Form("QueryRunParameters - Run %d has 1 event or less - Skipping!", run));
+               skip = kTRUE;
        }
-       else if (totEvents < 1) 
+       else if (endTime != 0 && endTime >= now - dcsDelay)
        {
-               Log("SHUTTLE", Form("QueryRunParameters - Run %d has 0 events - Skipping!", run));
+               Log("SHUTTLE", Form("Skipping run %d for now, because DCS buffer time is not yet expired", run));
        }
        else
        {
@@ -1922,14 +1993,17 @@ AliShuttleLogbookEntry* AliShuttle::QueryRunParameters(Int_t run)
                        "startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!",
                        run, startTime, endTime));
        }
-
-       //Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run));
-       //fLogbookEntry = entry;        
-       //if (!UpdateShuttleLogbook("shuttle_done"))
-       //{
-       //      AliError(Form("Could not update logbook for run %d !", run));
-       //}
-       //fLogbookEntry = 0;
+       
+       if (skip)
+       {
+               Log("SHUTTLE", Form("Marking SHUTTLE skipped for run %d", run));
+               fLogbookEntry = entry;
+               if (!UpdateShuttleLogbook("shuttle_skipped"))
+               {
+                       AliError(Form("Could not update logbook for run %d !", run));
+               }
+               fLogbookEntry = 0;
+       }
                        
        delete entry;
        return 0;
@@ -2586,29 +2660,24 @@ Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status
 
        TString detName(detector);
        TString setClause;
-       if (detName == "shuttle_done" || detName == "shuttle_ignored")
+       if (detName == "shuttle_done" || detName == "shuttle_skipped")
        {
                setClause = "set shuttle_done=1";
-
+               
                if (detName == "shuttle_done")
                {
-                       if (TouchFile()==kTRUE){
-                               //Send the information to ML
-                               TMonaLisaText  mlStatus("SHUTTLE_status", "Done");
-
-                               TList mlList;
-                               mlList.Add(&mlStatus);
-                               
-                               TString mlID;
-                               mlID.Form("%d", GetCurrentRun());
-                               fMonaLisa->SendParameters(&mlList, mlID);
-                       }
-                       else{
+                       if (TouchFile() != kTRUE)
+                       {
+                               SendMLRunInfo("Pending");
                                return kFALSE;
                        }
-                                       
+                       
+                       SendMLRunInfo("Done");
                }
-       } else {
+               else 
+                       SendMLRunInfo("Skipped");
+       } 
+       else {
                TString statusStr(status);
                if(statusStr.Contains("done", TString::kIgnoreCase) ||
                   statusStr.Contains("failed", TString::kIgnoreCase)){
@@ -3087,7 +3156,7 @@ Bool_t AliShuttle::SendMail(EMailTarget target, Int_t system)
        TString body;
 
        if (target == kDCSEMail){
-               subject = Form("%s Retrieval of data points for %s FAILED in run %d !",
+               subject = Form("%s CRITICAL Retrieval of data points for %s FAILED in run %d !",
                                tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun());
                AliDebug(2, Form("subject: %s", subject.Data()));
                
@@ -3096,7 +3165,7 @@ Bool_t AliShuttle::SendMail(EMailTarget target, Int_t system)
                             "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun());
        }
        else if (target == kFXSEMail){
-               subject = Form("%s FXS communication for %s FAILED in run %d !",
+               subject = Form("%s CRITICAL FXS communication for %s FAILED in run %d !",
                                tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun());
                AliDebug(2, Form("subject: %s", subject.Data()));
                TString sys;
@@ -3123,9 +3192,9 @@ Bool_t AliShuttle::SendMail(EMailTarget target, Int_t system)
                                fCurrentDetector.Data());
        if (fConfig->GetRunMode() == AliShuttleConfig::kTest)
        {
-               body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n");
+               body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?time=24 \n\n");
        } else {
-               body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=168 \n\n");
+               body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=24 \n\n");
        }
        
        
@@ -3205,7 +3274,7 @@ Bool_t AliShuttle::GetHLTStatus()
        }
 
        // TODO implement when HLTMode is inserted in run logbook
-       TString hltMode = fLogbookEntry->GetRunParameter("HLTMode");
+       TString hltMode = fLogbookEntry->GetRunParameter("HLTmode");
        TSubString firstChar = hltMode(0,1);
        AliDebug(2,Form("First char = %s ",firstChar.Data())); 
        if (firstChar == "A") {
@@ -3313,6 +3382,8 @@ Bool_t AliShuttle::TouchFile()
        TMap *mapLs = dynamic_cast<TMap*>(resultLs->At(0));
        if (!mapLs){
                Log("SHUTTLE",Form("No map for %s command, returning without touching",commandLs.Data()));
+               delete resultLs;
+               resultLs = 0x0;
                return kFALSE;
        }
        TObjString *valueLsPath = dynamic_cast<TObjString*>(mapLs->GetValue("path"));
@@ -3324,6 +3395,8 @@ Bool_t AliShuttle::TouchFile()
                Bool_t boolMkdir = gGrid->Mkdir(dir.Data());
                if (!boolMkdir) {
                        Log("SHUTTLE",Form("Impossible to create dir %s in alien catalogue for run %i!",dir.Data(),GetCurrentRun()));
+                       delete resultLs;
+                       resultLs = 0x0;
                        return kFALSE;
                }
                Log("SHUTTLE",Form("Directory %s successfully created in alien catalogue for run %i",dir.Data(),GetCurrentRun()));
@@ -3332,6 +3405,9 @@ Bool_t AliShuttle::TouchFile()
                Log("SHUTTLE",Form("Directory %s correctly found for run %i",dir.Data(),GetCurrentRun()));
        }
 
+       delete resultLs;
+       resultLs = 0x0;
+
        TString command;
        command.Form("touch %s/%i", dir.Data(), GetCurrentRun());
        Log("SHUTTLE", Form("Creating entry in file catalog: %s", command.Data()));
@@ -3343,17 +3419,26 @@ Bool_t AliShuttle::TouchFile()
        TMap *mapTouch = dynamic_cast<TMap*>(resultTouch->At(0));
        if (!mapTouch){
                Log("SHUTTLE",Form("No map for touching command, returning without touching for run %i",GetCurrentRun()));
+               delete resultTouch;
+               resultTouch = 0x0; 
                return kFALSE;
        }
        TObjString *valueTouch = dynamic_cast<TObjString*>(mapTouch->GetValue("__result__"));
        if (!valueTouch){
                Log("SHUTTLE",Form("No value for \"__result__\" key set in the map for touching command, returning without touching for run %i",GetCurrentRun()));
+               delete resultTouch;
+               resultTouch = 0x0; 
                return kFALSE;
        }
        if (valueTouch->GetString()!="1"){
                Log("SHUTTLE",Form("Failing the touching command, returning without touching for run %i",GetCurrentRun()));
+               delete resultTouch;
+               resultTouch = 0x0; 
                return kFALSE;
        }
+       delete resultTouch;
+       resultTouch = 0x0; 
+       Log("SHUTTLE", "Sucessfully touched the file");
        return kTRUE;
 }