//
UpdateShuttleStatus(AliShuttleStatus::kStoreStarted);
-
+
if (fTestMode & kErrorGrid)
{
Log("SHUTTLE", "StoreOCDB - In TESTMODE - Simulating error while storing in the Grid");
AliCDBId aLocId = aLocEntry->GetId();
aLocEntry->SetVersion(-1);
aLocEntry->SetSubVersion(-1);
+
+ Log(fCurrentDetector.Data(), Form("Attempting to store %s", aLocId.ToString().Data()));
// If local object is valid up to infinity we store it only if it is
// the first unprocessed run!
Bool_t store = kTRUE;
TIter gridIter(gridIds);
AliCDBId* aGridId = 0;
- while((aGridId = dynamic_cast<AliCDBId*> (gridIter.Next()))){
- if(aGridId->GetPath() != aLocId.GetPath()) continue;
+ while ((aGridId = dynamic_cast<AliCDBId*> (gridIter.Next()))) {
+ if (aGridId->GetPath() != aLocId.GetPath())
+ continue;
// skip all objects valid up to infinity
- if(aGridId->GetLastRun() == AliCDBRunRange::Infinity()) continue;
+ if (aGridId->GetLastRun() == AliCDBRunRange::Infinity())
+ continue;
+
// if we get here, it means there's already some more recent object stored on Grid!
+ Log(fCurrentDetector.Data(),
+ Form("StoreOCDB - A more recent object already exists in %s storage: <%s>",
+ type, aGridId->ToString().Data()));
+
store = kFALSE;
break;
}
- // If we get here, the file can be stored!
- Bool_t storeOk = gridSto->Put(aLocEntry);
- if(!store || storeOk){
-
- if (!store)
- {
- Log(fCurrentDetector.Data(),
- Form("StoreOCDB - A more recent object already exists in %s storage: <%s>",
- type, aGridId->ToString().Data()));
- } else {
+ Bool_t storeOk = kFALSE;
+ if (store)
+ {
+ Log(fCurrentDetector.Data(), Form("Prechecks succeeded. Ready to store %s", aLocId.ToString().Data()));
+ storeOk = gridSto->Put(aLocEntry);
+ if (storeOk) {
Log("SHUTTLE",
- Form("StoreOCDB - Object <%s> successfully put into %s storage",
- aLocId.ToString().Data(), type));
+ Form("StoreOCDB - Object <%s> successfully put into %s storage",
+ aLocId.ToString().Data(), type));
Log(fCurrentDetector.Data(),
Form("StoreOCDB - Object <%s> successfully put into %s storage",
- aLocId.ToString().Data(), type));
+ aLocId.ToString().Data(), type));
+ } else {
+ Log("SHUTTLE",
+ Form("StoreOCDB - Grid %s storage of object <%s> failed",
+ type, aLocId.ToString().Data()));
+ Log(fCurrentDetector.Data(),
+ Form("StoreOCDB - Grid %s storage of object <%s> failed",
+ type, aLocId.ToString().Data()));
+ result = kFALSE;
}
-
- // removing local filename...
+ }
+
+ if (!store || storeOk) {
+ // removing local file...
TString filename;
localSto->IdToFilename(aLocId, filename);
Log("SHUTTLE", Form("StoreOCDB - Removing local file %s", filename.Data()));
RemoveFile(filename.Data());
- continue;
- } else {
- Log("SHUTTLE",
- Form("StoreOCDB - Grid %s storage of object <%s> failed",
- type, aLocId.ToString().Data()));
- Log(fCurrentDetector.Data(),
- Form("StoreOCDB - Grid %s storage of object <%s> failed",
- type, aLocId.ToString().Data()));
- result = kFALSE;
}
}
localEntries->Clear();
return kFALSE;
}
- SendMLInfo();
+ SendMLDetInfo();
return kTRUE;
}
AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry);
- SendMLInfo();
+ SendMLDetInfo();
}
//______________________________________________________________________________________________
-void AliShuttle::SendMLInfo()
+void AliShuttle::SendMLDetInfo()
{
//
// sends ML information about the current status of the current detector being processed
AliShuttleStatus* status = dynamic_cast<AliShuttleStatus*> (fStatusEntry->GetObject());
if (!status){
- Log("SHUTTLE", "SendMLInfo - UNEXPECTED: status could not be read from current CDB entry");
+ Log("SHUTTLE", "SendMLDetInfo - UNEXPECTED: status could not be read from current CDB entry");
return;
}
return cont;
}
+//______________________________________________________________________________________________
+void AliShuttle::SendMLRunInfo(const char* status)
+{
+ // Sends two text parameters about the current run to ML through fMonaLisa:
+ // "SHUTTLE_status" (the <status> string passed in) and "SHUTTLE_runtype" (built below).
+ // NOTE(review): fLogbookEntry and fMonaLisa are dereferenced without a null check - presumably callers guarantee both; confirm.
+ TMonaLisaText mlStatus("SHUTTLE_status", status);
+ TString runType(fLogbookEntry->GetRunType());
+ if (strlen(fLogbookEntry->GetRunParameter("log")) > 0){
+ // a non-empty "log" run parameter is appended to the run type in parentheses
+ runType += "(";
+ runType += fLogbookEntry->GetRunParameter("log");
+ runType += ")";
+ }
+ TMonaLisaText mlRunType("SHUTTLE_runtype", runType);
+ // mlList collects pointers to the two local objects created above
+ TList mlList;
+ mlList.Add(&mlStatus);
+ mlList.Add(&mlRunType);
+ // the parameters are sent under an identifier that is the current run number formatted as decimal text
+ TString mlID;
+ mlID.Form("%d", GetCurrentRun());
+ fMonaLisa->SendParameters(&mlList, mlID);
+}
+
+//______________________________________________________________________________________________
+Int_t AliShuttle::GetMem(Int_t pid)
+{
+ // Runs "ps -o vsize --pid <pid> | tail -n 1" through a pipe and returns the integer parsed from the line read back
+ // (the vsize column of ps; presumably in KiB as documented by ps - confirm). Returns -1 in case of error.
+ // NOTE(review): the log messages below are prefixed "Process - " although they are emitted from GetMem.
+ TString checkStr;
+ checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid);
+ FILE* pipe = gSystem->OpenPipe(checkStr, "r");
+ if (!pipe)
+ {
+ Log("SHUTTLE", Form("Process - Error: "
+ "Could not open pipe to %s", checkStr.Data()));
+ return -1;
+ }
+ // only one line of the pipe output (at most 99 characters) is read into buffer
+ char buffer[100];
+ if (!fgets(buffer, 100, pipe))
+ {
+ Log("SHUTTLE", "Process - Error: ps did not return anything");
+ gSystem->ClosePipe(pipe);
+ return -1;
+ }
+ gSystem->ClosePipe(pipe);
+ // the pipe has been closed on both the read-failure path and the success path at this point
+ //Log("SHUTTLE", Form("ps returned %s", buffer));
+ // a parsed value of 0 is treated as an error, the same as a failed conversion
+ Int_t mem = 0;
+ if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem)
+ {
+ Log("SHUTTLE", "Process - Error: Could not parse output of ps");
+ return -1;
+ }
+
+ return mem;
+}
+
//______________________________________________________________________________________________
Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry)
{
Log("SHUTTLE", Form("\t\t\t^*^*^*^*^*^*^*^*^*^*^*^* run %d: START ^*^*^*^*^*^*^*^*^*^*^*^*",
GetCurrentRun()));
- // Send the information to ML
CountOpenRuns();
- TMonaLisaText mlStatus("SHUTTLE_status", "Processing");
- TString runType(entry->GetRunType());
- if (strlen(entry->GetRunParameter("log")) > 0){
-
- runType += "(";
- runType += entry->GetRunParameter("log");
- runType += ")";
- }
- TMonaLisaText mlRunType("SHUTTLE_runtype", runType);
-
- TList mlList;
- mlList.Add(&mlStatus);
- mlList.Add(&mlRunType);
-
- TString mlID;
- mlID.Form("%d", GetCurrentRun());
- fMonaLisa->SendParameters(&mlList, mlID);
+ // Send the information to ML
+ SendMLRunInfo("Processing");
if (fLogbookEntry->IsDone())
{
GetCurrentRun(), aDetector->GetName()));
for(Int_t iSys=0;iSys<3;iSys++) fFXSCalled[iSys]=kFALSE;
+
+ Int_t initialMem = GetMem(getpid());
+ Log("SHUTTLE", Form("Memory consumption before forking is %d", initialMem));
Log(fCurrentDetector.Data(), "Process - Starting processing");
if (expiredTime > fConfig->GetPPTimeOut())
{
- TString tmp;
- tmp.Form("Process - Process of %s time out. "
- "Run time: %d seconds. Killing...",
- fCurrentDetector.Data(), expiredTime);
- Log("SHUTTLE", tmp);
- Log(fCurrentDetector, tmp);
+ TString logMsg;
+ AliShuttleStatus *currentStatus = ReadShuttleStatus();
+ AliShuttleStatus::Status newStatus = AliShuttleStatus::kInvalid;
+
+ if (currentStatus->GetStatus() <= AliShuttleStatus::kPPDone)
+ {
+ // in case pp not yet done set status to kPPTimeOut
+
+ logMsg.Form("Process - Process of %s timed out. Run time: %d seconds. Killing...",
+ fCurrentDetector.Data(), expiredTime);
+ newStatus = AliShuttleStatus::kPPTimeOut;
+ }
+ else if (currentStatus->GetStatus() == AliShuttleStatus::kStoreStarted)
+ {
+ // in case the pp goes in TimeOut while storing the objects in the OCDB
+ // set status to kStoreError
+
+ logMsg.Form("Process - Process of %s timed out while storing the OCDB object. Run time: %d seconds. Killing... and setting status to StoreError.",
+ fCurrentDetector.Data(), expiredTime);
+ newStatus = AliShuttleStatus::kStoreError;
+ }
+ else
+ {
+ // in other cases don't change the status
+
+ logMsg.Form("Process - Process of %s timed out in status = %s. Run time: %d seconds. Killing... without changing the status",
+ fCurrentDetector.Data(), currentStatus->GetStatusName(), expiredTime);
+ }
+
+ Log("SHUTTLE", logMsg);
+ Log(fCurrentDetector, logMsg);
kill(pid, 9);
- UpdateShuttleStatus(AliShuttleStatus::kPPTimeOut);
+ if (newStatus != AliShuttleStatus::kInvalid)
+ UpdateShuttleStatus(newStatus);
hasError = kTRUE;
gSystem->Sleep(1000);
{
gSystem->Sleep(1000);
- TString checkStr;
- checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid);
- FILE* pipe = gSystem->OpenPipe(checkStr, "r");
- if (!pipe)
- {
- Log("SHUTTLE", Form("Process - Error: "
- "Could not open pipe to %s", checkStr.Data()));
+ Int_t mem = GetMem(pid);
+
+ if (mem < 0)
continue;
- }
- char buffer[100];
- if (!fgets(buffer, 100, pipe))
- {
- Log("SHUTTLE", "Process - Error: ps did not return anything");
- gSystem->ClosePipe(pipe);
- continue;
- }
- gSystem->ClosePipe(pipe);
-
- //Log("SHUTTLE", Form("ps returned %s", buffer));
-
- Int_t mem = 0;
- if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem)
- {
- Log("SHUTTLE", "Process - Error: Could not parse output of ps");
- continue;
- }
+ mem -= initialMem;
+ if (mem < 0)
+ mem = 0;
if (expiredTime % 60 == 0)
{
fFirstUnprocessed[iDet] = kFALSE;
}
}
- TMonaLisaText mlStatusPending("SHUTTLE_status", "Pending");
- mlList.Clear();
- mlList.Add(&mlStatusPending);
- fMonaLisa->SendParameters(&mlList, mlID);
+ SendMLRunInfo("Pending");
}
}
TString totEventsStr = entry->GetRunParameter("totalEvents");
Int_t totEvents = totEventsStr.Atoi();
- if (startTime != 0 && endTime != 0 && endTime > startTime && totEvents > 0 && ecsSuccess)
+ UInt_t now = time(0);
+ // TODO make this a configuration parameter
+ Int_t dcsDelay = 120;
+
+ // runs are accepted if they have ecsSuccess set or more than 1 event
+ if (startTime != 0 && endTime != 0 && endTime > startTime && (totEvents > 1 || ecsSuccess) && (endTime < now - dcsDelay))
+ {
+ if (ecsSuccess == kFALSE)
+ Log("SHUTTLE", Form("Processing run %d although in status ECS failure, Reason: %s", run, entry->GetRunParameter("eor_reason")));
return entry;
-
- if (ecsSuccess == kFALSE)
+ }
+
+ Bool_t skip = kFALSE;
+
+ if (totEvents <= 1)
{
- Log("SHUTTLE", Form("Skipped run %d due to ECS failure, Reason: %s", run, entry->GetRunParameter("eor_reason")));
+ Log("SHUTTLE", Form("QueryRunParameters - Run %d has 1 event or less - Skipping!", run));
+ skip = kTRUE;
}
- else if (totEvents < 1)
+ else if (endTime != 0 && endTime >= now - dcsDelay)
{
- Log("SHUTTLE", Form("QueryRunParameters - Run %d has 0 events - Skipping!", run));
+ Log("SHUTTLE", Form("Skipping run %d for now, because DCS buffer time is not yet expired", run));
}
else
{
"startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!",
run, startTime, endTime));
}
-
- //Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run));
- //fLogbookEntry = entry;
- //if (!UpdateShuttleLogbook("shuttle_done"))
- //{
- // AliError(Form("Could not update logbook for run %d !", run));
- //}
- //fLogbookEntry = 0;
+
+ if (skip)
+ {
+ Log("SHUTTLE", Form("Marking SHUTTLE skipped for run %d", run));
+ fLogbookEntry = entry;
+ if (!UpdateShuttleLogbook("shuttle_skipped"))
+ {
+ AliError(Form("Could not update logbook for run %d !", run));
+ }
+ fLogbookEntry = 0;
+ }
delete entry;
return 0;
TString detName(detector);
TString setClause;
- if (detName == "shuttle_done" || detName == "shuttle_ignored")
+ if (detName == "shuttle_done" || detName == "shuttle_skipped")
{
setClause = "set shuttle_done=1";
-
+
if (detName == "shuttle_done")
{
- if (TouchFile()==kTRUE){
- //Send the information to ML
- TMonaLisaText mlStatus("SHUTTLE_status", "Done");
-
- TList mlList;
- mlList.Add(&mlStatus);
-
- TString mlID;
- mlID.Form("%d", GetCurrentRun());
- fMonaLisa->SendParameters(&mlList, mlID);
- }
- else{
+ if (TouchFile() != kTRUE)
+ {
+ SendMLRunInfo("Pending");
return kFALSE;
}
-
+
+ SendMLRunInfo("Done");
}
- } else {
+ else
+ SendMLRunInfo("Skipped");
+ }
+ else {
TString statusStr(status);
if(statusStr.Contains("done", TString::kIgnoreCase) ||
statusStr.Contains("failed", TString::kIgnoreCase)){
TString body;
if (target == kDCSEMail){
- subject = Form("%s Retrieval of data points for %s FAILED in run %d !",
+ subject = Form("%s CRITICAL Retrieval of data points for %s FAILED in run %d !",
tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun());
AliDebug(2, Form("subject: %s", subject.Data()));
"in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun());
}
else if (target == kFXSEMail){
- subject = Form("%s FXS communication for %s FAILED in run %d !",
+ subject = Form("%s CRITICAL FXS communication for %s FAILED in run %d !",
tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun());
AliDebug(2, Form("subject: %s", subject.Data()));
TString sys;
fCurrentDetector.Data());
if (fConfig->GetRunMode() == AliShuttleConfig::kTest)
{
- body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n");
+ body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?time=24 \n\n");
} else {
- body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=168 \n\n");
+ body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=24 \n\n");
}
}
// TODO implement when HLTMode is inserted in run logbook
- TString hltMode = fLogbookEntry->GetRunParameter("HLTMode");
+ TString hltMode = fLogbookEntry->GetRunParameter("HLTmode");
TSubString firstChar = hltMode(0,1);
AliDebug(2,Form("First char = %s ",firstChar.Data()));
if (firstChar == "A") {
TMap *mapLs = dynamic_cast<TMap*>(resultLs->At(0));
if (!mapLs){
Log("SHUTTLE",Form("No map for %s command, returning without touching",commandLs.Data()));
+ delete resultLs;
+ resultLs = 0x0;
return kFALSE;
}
TObjString *valueLsPath = dynamic_cast<TObjString*>(mapLs->GetValue("path"));
Bool_t boolMkdir = gGrid->Mkdir(dir.Data());
if (!boolMkdir) {
Log("SHUTTLE",Form("Impossible to create dir %s in alien catalogue for run %i!",dir.Data(),GetCurrentRun()));
+ delete resultLs;
+ resultLs = 0x0;
return kFALSE;
}
Log("SHUTTLE",Form("Directory %s successfully created in alien catalogue for run %i",dir.Data(),GetCurrentRun()));
Log("SHUTTLE",Form("Directory %s correctly found for run %i",dir.Data(),GetCurrentRun()));
}
+ delete resultLs;
+ resultLs = 0x0;
+
TString command;
command.Form("touch %s/%i", dir.Data(), GetCurrentRun());
Log("SHUTTLE", Form("Creating entry in file catalog: %s", command.Data()));
TMap *mapTouch = dynamic_cast<TMap*>(resultTouch->At(0));
if (!mapTouch){
Log("SHUTTLE",Form("No map for touching command, returning without touching for run %i",GetCurrentRun()));
+ delete resultTouch;
+ resultTouch = 0x0;
return kFALSE;
}
TObjString *valueTouch = dynamic_cast<TObjString*>(mapTouch->GetValue("__result__"));
if (!valueTouch){
Log("SHUTTLE",Form("No value for \"__result__\" key set in the map for touching command, returning without touching for run %i",GetCurrentRun()));
+ delete resultTouch;
+ resultTouch = 0x0;
return kFALSE;
}
if (valueTouch->GetString()!="1"){
Log("SHUTTLE",Form("Failing the touching command, returning without touching for run %i",GetCurrentRun()));
+ delete resultTouch;
+ resultTouch = 0x0;
return kFALSE;
}
+ delete resultTouch;
+ resultTouch = 0x0;
+ Log("SHUTTLE", "Sucessfully touched the file");
return kTRUE;
}