X-Git-Url: http://git.uio.no/git/?a=blobdiff_plain;f=SHUTTLE%2FAliShuttle.cxx;h=6116bf4cd2a5234ed1c81b706def495536db8bb2;hb=ff0191c7c7df1e8ed524a99126766d0a06853e07;hp=d5f2538f5f479f7654f500ee470fc61e1b3836de;hpb=8da81210a72b8a2d4026d0ad29c1af388a2a94e7;p=u%2Fmrichter%2FAliRoot.git diff --git a/SHUTTLE/AliShuttle.cxx b/SHUTTLE/AliShuttle.cxx index d5f2538f5f4..6116bf4cd2a 100644 --- a/SHUTTLE/AliShuttle.cxx +++ b/SHUTTLE/AliShuttle.cxx @@ -57,6 +57,7 @@ #include #include #include +#include #include @@ -78,7 +79,7 @@ fPreprocessorMap(), fLogbookEntry(0), fCurrentDetector(), fFirstProcessing(0), -fFXSError(kFALSE), +fFXSError(-1), fStatusEntry(0), fMonitoringMutex(0), fLastActionTime(0), @@ -199,6 +200,7 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, // // returns 0 if fail, 1 otherwise + if (fTestMode & kErrorStorage) { Log(fCurrentDetector, "StoreLocally - In TESTMODE - Simulating error while storing locally"); @@ -233,8 +235,11 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, if (!(AliCDBManager::Instance()->GetStorage(localUri))) { Log("SHUTTLE", Form("StoreLocally - Cannot activate local %s storage", cdbType)); } else { + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); result = AliCDBManager::Instance()->GetStorage(localUri) ->Put(object, id, metaData); + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); } if(!result) { @@ -242,6 +247,7 @@ Bool_t AliShuttle::StoreLocally(const TString& localUri, Log(fCurrentDetector, Form("StoreLocally - Can't store object <%s>!", id.ToString().Data())); } + return result; } @@ -255,7 +261,7 @@ Bool_t AliShuttle::StoreOCDB() // UpdateShuttleStatus(AliShuttleStatus::kStoreStarted); - + if (fTestMode & kErrorGrid) { Log("SHUTTLE", "StoreOCDB - In TESTMODE - Simulating error while storing in the Grid"); @@ -275,7 +281,7 @@ Bool_t AliShuttle::StoreOCDB() Bool_t resultMetadata = kTRUE; if(fCurrentDetector == "GRP") { - Log("StoreOCDB - SHUTTLE","Storing Run Metadata file ..."); + Log("SHUTTLE","StoreOCDB - Storing Run Metadata file ..."); resultMetadata = CopyFilesToGrid("metadata"); } @@ -374,6 +380,8 @@ Int_t AliShuttle::StoreOCDB(const TString& gridURI) AliCDBId aLocId = aLocEntry->GetId(); aLocEntry->SetVersion(-1); aLocEntry->SetSubVersion(-1); + + Log(fCurrentDetector.Data(), Form("Attempting to store %s", aLocId.ToString().Data())); // If local object is valid up to infinity we store it only if it is // the first unprocessed run! @@ -394,47 +402,51 @@ Int_t AliShuttle::StoreOCDB(const TString& gridURI) Bool_t store = kTRUE; TIter gridIter(gridIds); AliCDBId* aGridId = 0; - while((aGridId = dynamic_cast (gridIter.Next()))){ - if(aGridId->GetPath() != aLocId.GetPath()) continue; + while ((aGridId = dynamic_cast (gridIter.Next()))) { + if (aGridId->GetPath() != aLocId.GetPath()) + continue; // skip all objects valid up to infinity - if(aGridId->GetLastRun() == AliCDBRunRange::Infinity()) continue; + if (aGridId->GetLastRun() == AliCDBRunRange::Infinity()) + continue; + // if we get here, it means there's already some more recent object stored on Grid! + Log(fCurrentDetector.Data(), + Form("StoreOCDB - A more recent object already exists in %s storage: <%s>", + type, aGridId->ToString().Data())); + store = kFALSE; break; } - // If we get here, the file can be stored! - Bool_t storeOk = gridSto->Put(aLocEntry); - if(!store || storeOk){ - - if (!store) - { - Log(fCurrentDetector.Data(), - Form("StoreOCDB - A more recent object already exists in %s storage: <%s>", - type, aGridId->ToString().Data())); - } else { + Bool_t storeOk = kFALSE; + if (store) + { + Log(fCurrentDetector.Data(), Form("Prechecks succeeded. Ready to store %s", aLocId.ToString().Data())); + storeOk = gridSto->Put(aLocEntry); + if (storeOk) { Log("SHUTTLE", - Form("StoreOCDB - Object <%s> successfully put into %s storage", - aLocId.ToString().Data(), type)); + Form("StoreOCDB - Object <%s> successfully put into %s storage", + aLocId.ToString().Data(), type)); Log(fCurrentDetector.Data(), Form("StoreOCDB - Object <%s> successfully put into %s storage", - aLocId.ToString().Data(), type)); + aLocId.ToString().Data(), type)); + } else { + Log("SHUTTLE", + Form("StoreOCDB - Grid %s storage of object <%s> failed", + type, aLocId.ToString().Data())); + Log(fCurrentDetector.Data(), + Form("StoreOCDB - Grid %s storage of object <%s> failed", + type, aLocId.ToString().Data())); + result = kFALSE; } - - // removing local filename... + } + + if (!store || storeOk) { + // removing local file... TString filename; localSto->IdToFilename(aLocId, filename); Log("SHUTTLE", Form("StoreOCDB - Removing local file %s", filename.Data())); RemoveFile(filename.Data()); - continue; - } else { - Log("SHUTTLE", - Form("StoreOCDB - Grid %s storage of object <%s> failed", - type, aLocId.ToString().Data())); - Log(fCurrentDetector.Data(), - Form("StoreOCDB - Grid %s storage of object <%s> failed", - type, aLocId.ToString().Data())); - result = kFALSE; } } localEntries->Clear(); @@ -615,8 +627,8 @@ Bool_t AliShuttle::StoreRunMetadataFile(const char* localFile, const char* gridF lhcPeriod.Data())); } - TString target = Form("%s/GRP/RunMetadata/alice/data/%d/%s/%09d/raw/%s", - localBaseFolder.Data(), GetCurrentYear(), + TString target = Form("%s/GRP/RunMetadata%s%d/%s/%09d/raw/%s", + localBaseFolder.Data(), fConfig->GetAlienPath(), GetCurrentYear(), lhcPeriod.Data(), GetCurrentRun(), gridFileName); return CopyFileLocally(localFile, target); @@ -735,10 +747,10 @@ Bool_t AliShuttle::CopyFilesToGrid(const char* type) lhcPeriod.Append(Form("_%s", partition.Data())); } - dir = Form("%s/GRP/RunMetadata/alice/data/%d/%s/%09d/raw", - localBaseFolder.Data(), GetCurrentYear(), + dir = Form("%s/GRP/RunMetadata%s%d/%s/%09d/raw", + localBaseFolder.Data(), fConfig->GetAlienPath(), GetCurrentYear(), lhcPeriod.Data(), GetCurrentRun()); - alienDir = dir(dir.Index("/alice/data/"), dir.Length()); + alienDir = dir(dir.Index(fConfig->GetAlienPath()), dir.Length()); begin = ""; } @@ -858,7 +870,7 @@ const char* AliShuttle::GetRefFilePrefix(const char* base, const char* detector) // TString offDetStr(GetOfflineDetName(detector)); - TString dir; + static TString dir; if (offDetStr == "ITS" || offDetStr == "MUON" || offDetStr == "PHOS") { dir.Form("%s/%s/%s", base, offDetStr.Data(), detector); @@ -867,8 +879,6 @@ const char* AliShuttle::GetRefFilePrefix(const char* base, const char* detector) } return dir.Data(); - - } //______________________________________________________________________________________________ @@ -939,8 +949,9 @@ AliShuttleStatus* AliShuttle::ReadShuttleStatus() fStatusEntry = 0; } + Int_t path1 = GetCurrentRun()/10000; fStatusEntry = AliCDBManager::Instance()->GetStorage(GetLocalCDB()) - ->Get(Form("/SHUTTLE/STATUS/%s", fCurrentDetector.Data()), GetCurrentRun()); + ->Get(Form("/SHUTTLE/%s/%d", fCurrentDetector.Data(), path1), GetCurrentRun()); if (!fStatusEntry) return 0; fStatusEntry->SetOwner(1); @@ -967,21 +978,28 @@ Bool_t AliShuttle::WriteShuttleStatus(AliShuttleStatus* status) } Int_t run = GetCurrentRun(); + Int_t path1 = run/10000; + TString path1_string = Form("%d",path1); - AliCDBId id(AliCDBPath("SHUTTLE", "STATUS", fCurrentDetector), run, run); + AliCDBId id(AliCDBPath("SHUTTLE", fCurrentDetector, path1_string), run, run); fStatusEntry = new AliCDBEntry(status, id, new AliCDBMetaData); fStatusEntry->SetOwner(1); + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); + UInt_t result = AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry); - if (!result) { + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); + + if (!result) { Log("SHUTTLE", Form("WriteShuttleStatus - Failed for %s, run %d", fCurrentDetector.Data(), run)); return kFALSE; } - SendMLInfo(); + SendMLDetInfo(); return kTRUE; } @@ -1015,13 +1033,18 @@ void AliShuttle::UpdateShuttleStatus(AliShuttleStatus::Status newStatus, Bool_t status->SetStatus(newStatus); if (increaseCount) status->IncreaseCount(); + Int_t logLevel = AliLog::GetGlobalLogLevel(); + AliLog::SetGlobalLogLevel(AliLog::kError); + AliCDBManager::Instance()->GetStorage(fgkLocalCDB)->Put(fStatusEntry); - SendMLInfo(); + AliLog::SetGlobalLogLevel((AliLog::EType_t)logLevel); + + SendMLDetInfo(); } //______________________________________________________________________________________________ -void AliShuttle::SendMLInfo() +void AliShuttle::SendMLDetInfo() { // // sends ML information about the current status of the current detector being processed @@ -1030,7 +1053,7 @@ void AliShuttle::SendMLInfo() AliShuttleStatus* status = dynamic_cast (fStatusEntry->GetObject()); if (!status){ - Log("SHUTTLE", "SendMLInfo - UNEXPECTED: status could not be read from current CDB entry"); + Log("SHUTTLE", "SendMLDetInfo - UNEXPECTED: status could not be read from current CDB entry"); return; } @@ -1108,15 +1131,22 @@ Bool_t AliShuttle::ContinueProcessing() return WriteShuttleStatus(status); } - // The following two cases shouldn't happen if Shuttle Logbook was correctly updated. + // The following case shouldn't happen if Shuttle Logbook was correctly updated. // If it happens it may mean Logbook updating failed... let's do it now! if (status->GetStatus() == AliShuttleStatus::kDone || - status->GetStatus() == AliShuttleStatus::kFailed){ + status->GetStatus() == AliShuttleStatus::kFailed || + status->GetStatus() == AliShuttleStatus::kSkipped) { Log("SHUTTLE", Form("ContinueProcessing - %s is already %s. Updating Shuttle Logbook", fCurrentDetector.Data(), status->GetStatusName(status->GetStatus()))); - UpdateShuttleLogbook(fCurrentDetector.Data(), - status->GetStatusName(status->GetStatus())); + + if (status->GetStatus() == AliShuttleStatus::kSkipped) + { + UpdateShuttleLogbook(fCurrentDetector.Data(), "DONE"); + } + else + UpdateShuttleLogbook(fCurrentDetector.Data(), status->GetStatusName(status->GetStatus())); + return kFALSE; } @@ -1151,7 +1181,30 @@ Bool_t AliShuttle::ContinueProcessing() // Send mail to detector expert! Log("SHUTTLE", Form("ContinueProcessing - Sending mail to %s expert...", fCurrentDetector.Data())); - if (!SendMail()) + // det experts in to + TString to=""; + TIter *iterExperts = 0; + iterExperts = new TIter(fConfig->GetResponsibles(fCurrentDetector)); + TObjString *anExpert=0; + while ((anExpert = (TObjString*) iterExperts->Next())) + { + to += Form("%s, \n", anExpert->GetName()); + } + delete iterExperts; + + if (to.Length() > 0) + to.Remove(to.Length()-3); + AliDebug(2, Form("to: %s",to.Data())); + + if (to.IsNull()) { + Log("SHUTTLE", Form("List of %s responsibles not set!", fCurrentDetector.Data())); + return kFALSE; + } + + Log(fCurrentDetector.Data(), Form("ContinueProcessing - Sending mail to %s expert(s):", + fCurrentDetector.Data())); + Log(fCurrentDetector.Data(), Form("\n%s", to.Data())); + if (!SendMail(kPPEMail)) Log("SHUTTLE", Form("ContinueProcessing - Could not send mail to %s expert", fCurrentDetector.Data())); @@ -1172,6 +1225,71 @@ Bool_t AliShuttle::ContinueProcessing() return cont; } +//______________________________________________________________________________________________ +void AliShuttle::SendMLRunInfo(const char* status) +{ + // + // Send information about this run to ML + + TMonaLisaText mlStatus("SHUTTLE_status", status); + TString runType(fLogbookEntry->GetRunType()); + if (strlen(fLogbookEntry->GetRunParameter("log")) > 0){ + + runType += "("; + runType += fLogbookEntry->GetRunParameter("log"); + runType += ")"; + } + if (fLogbookEntry->GetDATestMode()){ + runType += " (DATest)"; + } + TMonaLisaText mlRunType("SHUTTLE_runtype", runType); + + TList mlList; + mlList.Add(&mlStatus); + mlList.Add(&mlRunType); + + TString mlID; + mlID.Form("%d", GetCurrentRun()); + fMonaLisa->SendParameters(&mlList, mlID); +} + +//______________________________________________________________________________________________ +Int_t AliShuttle::GetMem(Int_t pid) +{ + // invokes ps to get the memory consumption of the process + // returns -1 in case of error + + TString checkStr; + checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid); + FILE* pipe = gSystem->OpenPipe(checkStr, "r"); + if (!pipe) + { + Log("SHUTTLE", Form("Process - Error: " + "Could not open pipe to %s", checkStr.Data())); + return -1; + } + + char buffer[100]; + if (!fgets(buffer, 100, pipe)) + { + Log("SHUTTLE", "Process - Error: ps did not return anything"); + gSystem->ClosePipe(pipe); + return -1; + } + gSystem->ClosePipe(pipe); + + //Log("SHUTTLE", Form("ps returned %s", buffer)); + + Int_t mem = 0; + if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem) + { + Log("SHUTTLE", "Process - Error: Could not parse output of ps"); + return -1; + } + + return mem; +} + //______________________________________________________________________________________________ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { @@ -1189,17 +1307,10 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) Log("SHUTTLE", Form("\t\t\t^*^*^*^*^*^*^*^*^*^*^*^* run %d: START ^*^*^*^*^*^*^*^*^*^*^*^*", GetCurrentRun())); + CountOpenRuns(); + // Send the information to ML - TMonaLisaText mlStatus("SHUTTLE_status", "Processing"); - TMonaLisaText mlRunType("SHUTTLE_runtype", Form("%s (%s)", entry->GetRunType(), entry->GetRunParameter("log"))); - - TList mlList; - mlList.Add(&mlStatus); - mlList.Add(&mlRunType); - - TString mlID; - mlID.Form("%d", GetCurrentRun()); - fMonaLisa->SendParameters(&mlList, mlID); + SendMLRunInfo("Processing"); if (fLogbookEntry->IsDone()) { @@ -1247,22 +1358,27 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) // Initialization Bool_t hasError = kFALSE; - // Set the CDB and Reference folders according to the year and LHC period - TString lhcPeriod(GetLHCPeriod()); - if (lhcPeriod.Length() == 0) - { - Log("SHUTTLE","Process - LHCPeriod not found in logbook!"); - return 0; - } - - if (fgkMainCDB.Length() == 0) - fgkMainCDB = Form("alien://folder=/alice/data/%d/%s/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", - GetCurrentYear(), lhcPeriod.Data()); - - if (fgkMainRefStorage.Length() == 0) - fgkMainRefStorage = Form("alien://folder=/alice/data/%d/%s/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", - GetCurrentYear(), lhcPeriod.Data()); - + // Set the CDB and Reference folders according to the year + + // build cdb paths (repeat each time, run might be a DATest run) + if (!fLogbookEntry->GetDATestMode()){ + fgkMainCDB.Form("alien://folder=%s%d/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + + fgkMainRefStorage.Form("alien://folder=%s%d/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + } + else { + fgkMainCDB.Form("alien://folder=%s%d/DATest/OCDB?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + + fgkMainRefStorage.Form("alien://folder=%s%d/DATest/Reference?user=alidaq?cacheFold=/tmp/OCDBCache", + fConfig->GetAlienPath(), GetCurrentYear()); + } + + AliDebug(2,Form("Main CDB storage = %s",fgkMainCDB.Data())); + AliDebug(2,Form("Main Reference storage = %s",fgkMainRefStorage.Data())); + // Loop on detectors in the configuration TIter iter(fConfig->GetDetectors()); TObjString* aDetector = 0; @@ -1290,6 +1406,9 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) GetCurrentRun(), aDetector->GetName())); for(Int_t iSys=0;iSys<3;iSys++) fFXSCalled[iSys]=kFALSE; + + Int_t initialMem = GetMem(getpid()); + Log("SHUTTLE", Form("Memory consumption before forking is %d", initialMem)); Log(fCurrentDetector.Data(), "Process - Starting processing"); @@ -1314,16 +1433,42 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) if (expiredTime > fConfig->GetPPTimeOut()) { - TString tmp; - tmp.Form("Process - Process of %s time out. " - "Run time: %d seconds. Killing...", - fCurrentDetector.Data(), expiredTime); - Log("SHUTTLE", tmp); - Log(fCurrentDetector, tmp); + TString logMsg; + AliShuttleStatus *currentStatus = ReadShuttleStatus(); + AliShuttleStatus::Status newStatus = AliShuttleStatus::kInvalid; + + if (currentStatus->GetStatus() <= AliShuttleStatus::kPPDone) + { + // in case pp not yet done set status to kPPTimeOut + + logMsg.Form("Process - Process of %s timed out. Run time: %d seconds. Killing...", + fCurrentDetector.Data(), expiredTime); + newStatus = AliShuttleStatus::kPPTimeOut; + } + else if (currentStatus->GetStatus() == AliShuttleStatus::kStoreStarted) + { + // in case the pp goes in TimeOut while storing the objects in the OCDB + // set status to kStoreError + + logMsg.Form("Process - Process of %s timed out while storing the OCDB object. Run time: %d seconds. Killing... and setting status to StoreError.", + fCurrentDetector.Data(), expiredTime); + newStatus = AliShuttleStatus::kStoreError; + } + else + { + // in other cases don't change the status + + logMsg.Form("Process - Process of %s timed out in status = %s. Run time: %d seconds. Killing... without changing the status", + fCurrentDetector.Data(), currentStatus->GetStatusName(), expiredTime); + } + + Log("SHUTTLE", logMsg); + Log(fCurrentDetector, logMsg); kill(pid, 9); - UpdateShuttleStatus(AliShuttleStatus::kPPTimeOut); + if (newStatus != AliShuttleStatus::kInvalid) + UpdateShuttleStatus(newStatus); hasError = kTRUE; gSystem->Sleep(1000); @@ -1332,33 +1477,14 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) { gSystem->Sleep(1000); - TString checkStr; - checkStr.Form("ps -o vsize --pid %d | tail -n 1", pid); - FILE* pipe = gSystem->OpenPipe(checkStr, "r"); - if (!pipe) - { - Log("SHUTTLE", Form("Process - Error: " - "Could not open pipe to %s", checkStr.Data())); + Int_t mem = GetMem(pid); + + if (mem < 0) continue; - } - char buffer[100]; - if (!fgets(buffer, 100, pipe)) - { - Log("SHUTTLE", "Process - Error: ps did not return anything"); - gSystem->ClosePipe(pipe); - continue; - } - gSystem->ClosePipe(pipe); - - //Log("SHUTTLE", Form("ps returned %s", buffer)); - - Int_t mem = 0; - if ((sscanf(buffer, "%d\n", &mem) != 1) || !mem) - { - Log("SHUTTLE", "Process - Error: Could not parse output of ps"); - continue; - } + mem -= initialMem; + if (mem < 0) + mem = 0; if (expiredTime % 60 == 0) { @@ -1402,8 +1528,8 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) } else if (pid == 0) { - // client - Log("SHUTTLE", Form("Process - In client process of %d - %s", GetCurrentRun(), + // child + Log("SHUTTLE", Form("Process - In child process of %d - %s", GetCurrentRun(), aDetector->GetName())); Log("SHUTTLE", Form("Process - Redirecting output to %s log",fCurrentDetector.Data())); @@ -1421,8 +1547,9 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) } TString wd = gSystem->WorkingDirectory(); - TString tmpDir = Form("%s/%s_%d_process", GetShuttleTempDir(), - fCurrentDetector.Data(), GetCurrentRun()); + Int_t dir_lev1 = GetCurrentRun()/10000; + TString tmpDir = Form("%s/%d/%d/%s_process", GetShuttleTempDir(), + dir_lev1, GetCurrentRun(), fCurrentDetector.Data()); Int_t result = gSystem->GetPathInfo(tmpDir.Data(), 0, (Long64_t*) 0, 0, 0); if (!result) // temp dir already exists! @@ -1444,11 +1571,11 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) gSystem->Exit(1); } - Bool_t success = ProcessCurrentDetector(); - + Int_t success = ProcessCurrentDetector(); + gSystem->ChangeDirectory(wd.Data()); - if (success) // Preprocessor finished successfully! + if (success == 1) // Preprocessor finished successfully! { // remove temporary folder or DCS map if (!fConfig->KeepTempFolder()) @@ -1468,7 +1595,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) if (StoreOCDB() == kFALSE) success = kFALSE; } - else + else if (success == 0) { Log("SHUTTLE", Form("\t\t\t****** run %d - %s: PP ERROR ******", @@ -1525,6 +1652,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) fFirstUnprocessed[iDet] = kFALSE; } } + SendMLRunInfo("Pending"); } } @@ -1534,7 +1662,7 @@ Bool_t AliShuttle::Process(AliShuttleLogbookEntry* entry) } //______________________________________________________________________________________________ -Bool_t AliShuttle::ProcessCurrentDetector() +Int_t AliShuttle::ProcessCurrentDetector() { // // Makes data retrieval just for a specific detector (fCurrentDetector). @@ -1546,16 +1674,30 @@ Bool_t AliShuttle::ProcessCurrentDetector() TString wd = gSystem->WorkingDirectory(); if (!CleanReferenceStorage(fCurrentDetector.Data())) - return kFALSE; + return 0; gSystem->ChangeDirectory(wd.Data()); - TMap* dcsMap = new TMap(); - // call preprocessor AliPreprocessor* aPreprocessor = dynamic_cast (fPreprocessorMap.GetValue(fCurrentDetector)); + // check if the preprocessor wants to process this run type + if (aPreprocessor->ProcessRunType() == kFALSE) + { + UpdateShuttleStatus(AliShuttleStatus::kSkipped); + UpdateShuttleLogbook(fCurrentDetector, "DONE"); + if (!UpdateTableSkippedCase(fCurrentDetector.Data())) + { + AliError(Form("Could not update FXS tables for run %d !", GetCurrentRun())); + } + Log(fCurrentDetector, Form("ProcessCurrentDetector - %s preprocessor is not interested in this run type", fCurrentDetector.Data())); + + return 2; + } + + TMap* dcsMap = new TMap(); + aPreprocessor->Initialize(GetCurrentRun(), GetCurrentStartTime(), GetCurrentEndTime()); Bool_t processDCS = aPreprocessor->ProcessDCS(); @@ -1575,7 +1717,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() UpdateShuttleStatus(AliShuttleStatus::kDCSStarted); UpdateShuttleStatus(AliShuttleStatus::kDCSError); delete dcsMap; - return kFALSE; + return 0; } else { UpdateShuttleStatus(AliShuttleStatus::kDCSStarted); @@ -1596,9 +1738,10 @@ Bool_t AliShuttle::ProcessCurrentDetector() TMap* aliasMap = 0; TMap* dpMap = 0; - + if (fConfig->GetDCSAliases(fCurrentDetector, iServ)->GetEntries() > 0) { + Log(fCurrentDetector, Form("Querying %d DCS aliases", fConfig->GetDCSAliases(fCurrentDetector, iServ)->GetEntries())); aliasMap = GetValueSet(host, port, fConfig->GetDCSAliases(fCurrentDetector, iServ), kAlias, multiSplit); @@ -1610,17 +1753,18 @@ Bool_t AliShuttle::ProcessCurrentDetector() " Sending mail to DCS experts!", host.Data())); UpdateShuttleStatus(AliShuttleStatus::kDCSError); - if (!SendMailToDCS()) + if (!SendMail(kDCSEMail)) Log("SHUTTLE", Form("ProcessCurrentDetector - " "Could not send mail to DCS experts!")); delete dcsMap; - return kFALSE; + return 0; } } if (fConfig->GetDCSDataPoints(fCurrentDetector, iServ)->GetEntries() > 0) { + Log(fCurrentDetector, Form("Querying %d DCS data points", fConfig->GetDCSDataPoints(fCurrentDetector, iServ)->GetEntries())); dpMap = GetValueSet(host, port, fConfig->GetDCSDataPoints(fCurrentDetector, iServ), kDP, multiSplit); @@ -1632,13 +1776,13 @@ Bool_t AliShuttle::ProcessCurrentDetector() " Sending mail to DCS experts!", host.Data())); UpdateShuttleStatus(AliShuttleStatus::kDCSError); - if (!SendMailToDCS()) + if (!SendMail(kDCSEMail)) Log("SHUTTLE", Form("ProcessCurrentDetector - " "Could not send mail to DCS experts!")); if (aliasMap) delete aliasMap; delete dcsMap; - return kFALSE; + return 0; } } @@ -1675,15 +1819,16 @@ Bool_t AliShuttle::ProcessCurrentDetector() // DCS Archive DB processing successful. Call Preprocessor! UpdateShuttleStatus(AliShuttleStatus::kPPStarted); - fFXSError = kFALSE; // this variable is kTRUE after ::Process if an FXS error occured + fFXSError = -1; // this variable is kTRUE after ::Process if an FXS error occured UInt_t returnValue = aPreprocessor->Process(dcsMap); - if (fFXSError) { + if (fFXSError!=-1) { UpdateShuttleStatus(AliShuttleStatus::kFXSError); + SendMail(kFXSEMail, fFXSError); dcsMap->DeleteAll(); delete dcsMap; - return kFALSE; + return 0; } if (returnValue > 0) // Preprocessor error! @@ -1693,7 +1838,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() UpdateShuttleStatus(AliShuttleStatus::kPPError); dcsMap->DeleteAll(); delete dcsMap; - return kFALSE; + return 0; } // preprocessor ok! @@ -1704,7 +1849,7 @@ Bool_t AliShuttle::ProcessCurrentDetector() dcsMap->DeleteAll(); delete dcsMap; - return kTRUE; + return 1; } //______________________________________________________________________________________________ @@ -1712,6 +1857,8 @@ void AliShuttle::CountOpenRuns() { // Query DAQ's Shuttle logbook and sends the number of open runs to ML + SendAlive(); + // check connection, in case connect if (!Connect(3)) return; @@ -1790,7 +1937,7 @@ Bool_t AliShuttle::QueryShuttleLogbook(const char* whereClause, } // TODO Check field count! - const UInt_t nCols = 23; + const UInt_t nCols = 26; if (aResult->GetFieldCount() != (Int_t) nCols) { Log("SHUTTLE", "Invalid SQL result field number!"); delete aResult; @@ -1806,6 +1953,11 @@ Bool_t AliShuttle::QueryShuttleLogbook(const char* whereClause, if (!entry) continue; + // DA test mode flag + TString daTestModeString(aRow->GetField(2), aRow->GetFieldLength(2)); // field 2 = DA test mode flag + Bool_t daTestMode = (Bool_t)daTestModeString.Atoi(); + entry->SetDATestMode(daTestMode); + // loop on detectors for(UInt_t ii = 0; ii < nCols; ii++) entry->SetDetectorStatus(aResult->GetFieldName(ii), aRow->GetField(ii)); @@ -1864,119 +2016,63 @@ AliShuttleLogbookEntry* AliShuttle::QueryRunParameters(Int_t run) for (Int_t ii = 0; ii < aResult->GetFieldCount(); ii++) entry->SetRunParameter(aResult->GetFieldName(ii), aRow->GetField(ii)); + delete aRow; + delete aResult; + UInt_t startTime = entry->GetStartTime(); UInt_t endTime = entry->GetEndTime(); - -// if (!startTime || !endTime || startTime > endTime) -// { -// Log("SHUTTLE", -// Form("QueryRunParameters - Invalid parameters for Run %d: startTime = %d, endTime = %d. Skipping!", -// run, startTime, endTime)); -// -// Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); -// fLogbookEntry = entry; -// if (!UpdateShuttleLogbook("shuttle_done")) -// { -// AliError(Form("Could not update logbook for run %d !", run)); -// } -// fLogbookEntry = 0; -// -// delete entry; -// delete aRow; -// delete aResult; -// return 0; -// } - - if (!startTime) + Bool_t ecsSuccess = entry->GetECSSuccess(); + + TString totEventsStr = entry->GetRunParameter("totalEvents"); + Int_t totEvents = totEventsStr.Atoi(); + + UInt_t now = time(0); + // TODO make this a configuration parameter + Int_t dcsDelay = fConfig->GetDCSDelay()+fConfig->GetDCSQueryOffset(); + + // runs are accepted if they have ecsSuccess set or more than 1 event + if (startTime != 0 && endTime != 0 && endTime > startTime && (totEvents > 1 || ecsSuccess) && (endTime < now - dcsDelay)) { - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping!", - run, startTime, endTime)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) - { - AliError(Form("Could not update logbook for run %d !", run)); - } - fLogbookEntry = 0; + if (ecsSuccess == kFALSE) + Log("SHUTTLE", Form("Processing run %d although in status ECS failure, Reason: %s", run, entry->GetRunParameter("eor_reason"))); + return entry; + } + + Bool_t skip = kFALSE; - delete entry; - delete aRow; - delete aResult; - return 0; + if (endTime != 0 && endTime >= now - dcsDelay) + { + Log("SHUTTLE", Form("Skipping run %d for now, because DCS buffer time is not yet expired", run)); } - - if (startTime && !endTime) + else if (totEvents <= 1) { - // TODO Here we don't mark SHUTTLE done, because this may mean - //the run is still ongoing!! - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!", - run, startTime, endTime)); - - //Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - //fLogbookEntry = entry; - //if (!UpdateShuttleLogbook("shuttle_done")) - //{ - // AliError(Form("Could not update logbook for run %d !", run)); - //} - //fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; + Log("SHUTTLE", Form("QueryRunParameters - Run %d has 1 event or less - Skipping!", run)); + skip = kTRUE; } - - if (startTime && endTime && (startTime > endTime)) + else { - Log("SHUTTLE", - Form("QueryRunParameters - Invalid parameters for Run %d: " - "startTime = %d, endTime = %d. Skipping!", - run, startTime, endTime)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) - { - AliError(Form("Could not update logbook for run %d !", run)); - } - fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; + Log("SHUTTLE", Form("QueryRunParameters - Invalid parameters for Run %d: " + "startTime = %d, endTime = %d. Skipping (Shuttle won't be marked as DONE)!", + run, startTime, endTime)); } - - TString totEventsStr = entry->GetRunParameter("totalEvents"); - Int_t totEvents = totEventsStr.Atoi(); - if (totEvents < 1) + + if (skip) { - Log("SHUTTLE", - Form("QueryRunParameters - Run %d has 0 events - Skipping!", run)); - - Log("SHUTTLE", Form("Marking SHUTTLE done for run %d", run)); - fLogbookEntry = entry; - if (!UpdateShuttleLogbook("shuttle_ignored")) + Log("SHUTTLE", Form("Marking SHUTTLE skipped for run %d", run)); + fLogbookEntry = entry; + if (!UpdateShuttleLogbook("shuttle_skipped")) { AliError(Form("Could not update logbook for run %d !", run)); } + if (!UpdateTableSkippedCase("ALL")) + { + AliError(Form("Could not update FXS tables for run %d !", run)); + } fLogbookEntry = 0; - - delete entry; - delete aRow; - delete aResult; - return 0; } - - delete aRow; - delete aResult; - - return entry; + + delete entry; + return 0; } //______________________________________________________________________________________________ @@ -1994,13 +2090,17 @@ TMap* AliShuttle::GetValueSet(const char* host, Int_t port, const TSeqCollection TMap* result = 0; if (type == kAlias) { - result = client.GetAliasValues(entries, GetCurrentStartTime(), - GetCurrentEndTime()); + //result = client.GetAliasValues(entries, GetCurrentStartTime()-offset, + // GetCurrentEndTime()+offset); + result = client.GetAliasValues(entries, GetStartTimeDCSQuery(), + GetEndTimeDCSQuery()); } else if (type == kDP) { - result = client.GetDPValues(entries, GetCurrentStartTime(), - GetCurrentEndTime()); + //result = client.GetDPValues(entries, GetCurrentStartTime()-offset, + // GetCurrentEndTime()+offset); + result = client.GetDPValues(entries, GetStartTimeDCSQuery(), + GetEndTimeDCSQuery()); } if (result == 0) @@ -2040,7 +2140,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (!Connect(system)) { Log(detector, Form("GetFile - Couldn't connect to %s FXS database", GetSystemName(system))); - fFXSError = kTRUE; + fFXSError = system; return 0; } @@ -2075,7 +2175,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (!aResult) { Log(detector, Form("GetFile - Can't execute SQL query to %s database for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); - fFXSError = kTRUE; + fFXSError = system; return 0; } @@ -2092,7 +2192,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, Log(detector, Form("GetFile - More than one entry in %s FXS db for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); - fFXSError = kTRUE; + fFXSError = system; delete aResult; return 0; } @@ -2101,7 +2201,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, Log(detector, Form("GetFileName - Wrong field count in %s FXS db for: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); - fFXSError = kTRUE; + fFXSError = system; delete aResult; return 0; } @@ -2111,7 +2211,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (!aRow){ Log(detector, Form("GetFile - Empty set result in %s FXS db from query: id = %s, source = %s", GetSystemName(system), id, sourceName.Data())); - fFXSError = kTRUE; + fFXSError = system; delete aResult; return 0; } @@ -2127,10 +2227,12 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, filePath.Data(), fileSize.Data(), fileChecksum.Data())); // retrieved file is renamed to make it unique - TString localFileName = Form("%s/%s_%d_process/%s_%s_%d_%s_%s.shuttle", - GetShuttleTempDir(), detector, GetCurrentRun(), + Int_t dir_lev1 = GetCurrentRun()/10000; + TString localFileName = Form("%s/%d/%d/%s_process/%s_%s_%d_%s_%s.shuttle", + GetShuttleTempDir(), dir_lev1, GetCurrentRun(), detector, GetSystemName(system), detector, GetCurrentRun(), id, sourceName.Data()); + Log("SHUTTLE",Form("file from FXS = %s",localFileName.Data())); // file retrieval from FXS @@ -2172,7 +2274,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, { // compare md5sum of local file with the one stored in the FXS DB if(fileChecksum.Contains(' ')) fileChecksum.Resize(fileChecksum.First(' ')); - Int_t md5Comp = gSystem->Exec(Form("md5sum %s |grep %s 2>&1 > /dev/null", + Int_t md5Comp = gSystem->Exec(Form("md5sum %s |grep %s > /dev/null 2> /dev/null", localFileName.Data(), fileChecksum.Data())); if (md5Comp != 0) @@ -2191,7 +2293,7 @@ const char* AliShuttle::GetFile(Int_t system, const char* detector, if (!result) { - fFXSError = kTRUE; + fFXSError = system; return 0; } @@ -2234,26 +2336,11 @@ Bool_t AliShuttle::RetrieveFile(UInt_t system, const char* fxsFileName, const ch } } - TString baseFXSFolder; - if (system == kDAQ) - { - baseFXSFolder = "FES/"; - } - else if (system == kDCS) - { - baseFXSFolder = ""; - } - else if (system == kHLT) - { - baseFXSFolder = "/opt/FXS/"; - } - - - TString command = Form("scp -oPort=%d -2 %s@%s:%s%s %s", + TString command = Form("scp -oPort=%d -2 %s@%s:%s/%s %s", fConfig->GetFXSPort(system), fConfig->GetFXSUser(system), fConfig->GetFXSHost(system), - baseFXSFolder.Data(), + fConfig->GetFXSBaseFolder(system), fxsFileName, localFileName); @@ -2299,11 +2386,11 @@ TList* AliShuttle::GetFileSources(Int_t system, const char* detector, const char if (!Connect(system)) { Log(detector, Form("GetFileSources - Couldn't connect to %s FXS database", GetSystemName(system))); - fFXSError = kTRUE; + fFXSError = system; return NULL; } - TString sourceName = 0; + TString sourceName = ""; if (system == kDAQ) { sourceName = "DAQsource"; @@ -2327,7 +2414,7 @@ TList* AliShuttle::GetFileSources(Int_t system, const char* detector, const char if (!aResult) { Log(detector, Form("GetFileSources - Can't execute SQL query to %s database for id: %s", GetSystemName(system), id)); - fFXSError = kTRUE; + fFXSError = system; return 0; } @@ -2382,7 +2469,7 @@ TList* AliShuttle::GetFileIDs(Int_t system, const char* detector, const char* so return NULL; } - TString sourceName = 0; + TString sourceName = ""; if (system == kDAQ) { sourceName = "DAQsource"; @@ -2446,7 +2533,20 @@ Bool_t AliShuttle::Connect(Int_t system) // // check connection: if already connected return - if(fServer[system] && fServer[system]->IsConnected()) return kTRUE; + + if(fServer[system] && fServer[system]->IsConnected()) { + // ping the server + if (fServer[system]->PingVerify()==kTRUE){ // connection is still alive + return kTRUE; + } + else{ + AliWarning(Form("Connection got lost to FXS database for %s. Closing and reconnecting.", + AliShuttleInterface::GetSystemName(system))); + fServer[system]->Close(); + delete fServer[system]; + fServer[system] = 0x0; + } + } TString dbHost, dbUser, dbPass, dbName; @@ -2580,22 +2680,25 @@ Bool_t AliShuttle::UpdateTable() return result; } -//______________________________________________________________________________________________ -Bool_t AliShuttle::UpdateTableFailCase() +//_______________________________________________________________________________ +Bool_t AliShuttle::UpdateTableSkippedCase(const char* detector) { + // // Update FXS table filling time_processed field in all rows corresponding to current run and detector - // this is called in case the preprocessor is declared failed for the current run, because - // the fields are updated only in case of success + // if detector = "ALL" update all detectors + // Bool_t result = kTRUE; + TString detName(detector); + for (UInt_t system=0; system<3; system++) { + // check connection, in case connect if (!Connect(system)) { - Log(fCurrentDetector, Form("UpdateTableFailCase - Couldn't connect to %s FXS database", - GetSystemName(system))); + Log(fCurrentDetector, Form("UpdateTableSkippedCase - Couldn't connect to %s FXS database", GetSystemName(system))); result = kFALSE; continue; } @@ -2603,13 +2706,16 @@ Bool_t AliShuttle::UpdateTableFailCase() TTimeStamp now; // now // Loop on FXS list entries + TIter iter(&fFXSlist[system]); + + TString whereClause; + if (detName == "ALL") whereClause = Form("where run=%d and time_processed IS NULL;",GetCurrentRun()); + else whereClause = Form("where run=%d and detector=\"%s\" and time_processed IS NULL;",GetCurrentRun(), detector); - TString whereClause = Form("where run=%d and detector=\"%s\";", - GetCurrentRun(), fCurrentDetector.Data()); - + //Log("SHUTTLE",Form(" whereClause = %s ",whereClause.Data())); TString sqlQuery = Form("update %s set time_processed=%d %s", fConfig->GetFXSdbTable(system), - now.GetSec(), whereClause.Data()); + now.GetSec(), whereClause.Data()); AliDebug(2, Form("SQL query: \n%s",sqlQuery.Data())); @@ -2618,24 +2724,73 @@ Bool_t AliShuttle::UpdateTableFailCase() aResult = dynamic_cast (fServer[system]->Query(sqlQuery)); if (!aResult) { - Log(fCurrentDetector, Form("UpdateTableFailCase - %s db: can't execute SQL query <%s>", + Log("SHUTTLE", Form("UpdateTableSkippedCase - %s db: can't execute SQL query <%s>", GetSystemName(system), sqlQuery.Data())); result = kFALSE; continue; } delete aResult; + } return result; } - //______________________________________________________________________________________________ -Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status) +Bool_t AliShuttle::UpdateTableFailCase() { - // - // Update Shuttle logbook filling detector or shuttle_done column - // ex. of usage: UpdateShuttleLogbook("PHOS", "DONE") or UpdateShuttleLogbook("shuttle_done") - // + // Update FXS table filling time_processed field in all rows corresponding to current run and detector + // this is called in case the preprocessor is declared failed for the current run, because + // the fields are updated only in case of success + + Bool_t result = kTRUE; + + for (UInt_t system=0; system<3; system++) + { + // check connection, in case connect + if (!Connect(system)) + { + Log(fCurrentDetector, Form("UpdateTableFailCase - Couldn't connect to %s FXS database", + GetSystemName(system))); + result = kFALSE; + continue; + } + + TTimeStamp now; // now + + // Loop on FXS list entries + + TString whereClause = Form("where run=%d and detector=\"%s\";", + GetCurrentRun(), fCurrentDetector.Data()); + + + TString sqlQuery = Form("update %s set time_processed=%d %s", fConfig->GetFXSdbTable(system), + now.GetSec(), whereClause.Data()); + + AliDebug(2, Form("SQL query: \n%s",sqlQuery.Data())); + + // Query execution + TSQLResult* aResult; + aResult = dynamic_cast (fServer[system]->Query(sqlQuery)); + if (!aResult) + { + Log(fCurrentDetector, Form("UpdateTableFailCase - %s db: can't execute SQL query <%s>", + GetSystemName(system), sqlQuery.Data())); + result = kFALSE; + continue; + } + delete aResult; + } + + return result; +} + +//______________________________________________________________________________________________ +Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status) +{ + // + // Update Shuttle logbook filling detector or shuttle_done column + // ex. of usage: UpdateShuttleLogbook("PHOS", "DONE") or UpdateShuttleLogbook("shuttle_done") + // // check connection, in case connect if(!Connect(3)){ @@ -2645,23 +2800,24 @@ Bool_t AliShuttle::UpdateShuttleLogbook(const char* detector, const char* status TString detName(detector); TString setClause; - if (detName == "shuttle_done" || detName == "shuttle_ignored") + if (detName == "shuttle_done" || detName == "shuttle_skipped") { setClause = "set shuttle_done=1"; - + if (detName == "shuttle_done") { - // Send the information to ML - TMonaLisaText mlStatus("SHUTTLE_status", "Done"); - - TList mlList; - mlList.Add(&mlStatus); - - TString mlID; - mlID.Form("%d", GetCurrentRun()); - fMonaLisa->SendParameters(&mlList, mlID); + if (TouchFile() != kTRUE) + { + SendMLRunInfo("Pending"); + return kFALSE; + } + + SendMLRunInfo("Done"); } - } else { + else + SendMLRunInfo("Skipped"); + } + else { TString statusStr(status); if(statusStr.Contains("done", TString::kIgnoreCase) || statusStr.Contains("failed", TString::kIgnoreCase)){ @@ -2722,7 +2878,6 @@ UInt_t AliShuttle::GetCurrentEndTime() const return fLogbookEntry ? fLogbookEntry->GetEndTime() : 0; } - //______________________________________________________________________________________________ UInt_t AliShuttle::GetCurrentYear() const { @@ -2757,11 +2912,12 @@ void AliShuttle::Log(const char* detector, const char* message) // // Fill log string with a message // - - TString logRunDir = GetShuttleLogDir(); - if (GetCurrentRun() >=0) - logRunDir += Form("/%d", GetCurrentRun()); + TString logRunDir = GetShuttleLogDir(); + if (GetCurrentRun() >=0) { + Int_t logDir_lev1 = GetCurrentRun()/10000; + logRunDir += Form("/%d/%d", logDir_lev1, GetCurrentRun()); + } void* dir = gSystem->OpenDirectory(logRunDir.Data()); if (dir == NULL) { if (gSystem->mkdir(logRunDir.Data(), kTRUE)) { @@ -2773,7 +2929,7 @@ void AliShuttle::Log(const char* detector, const char* message) gSystem->FreeDirectory(dir); } - TString toLog = Form("%s (%d): %s - ", TTimeStamp(time(0)).AsString("s"), getpid(), detector); + TString toLog = Form("%s UTC (%d): %s - ", TTimeStamp(time(0)).AsString("s"), getpid(), detector); if (GetCurrentRun() >= 0) toLog += Form("run %d - ", GetCurrentRun()); toLog += Form("%s", message); @@ -2812,8 +2968,9 @@ TString AliShuttle::GetLogFileName(const char* detector) const if (GetCurrentRun() >= 0) { - fileName.Form("%s/%d/%s_%d.log", GetShuttleLogDir(), GetCurrentRun(), - detector, GetCurrentRun()); + Int_t logDir_lev1 = GetCurrentRun()/10000; + fileName.Form("%s/%d/%d/%s.log", GetShuttleLogDir(), logDir_lev1, GetCurrentRun(), + detector); } else { fileName.Form("%s/%s.log", GetShuttleLogDir(), detector); } @@ -2855,7 +3012,6 @@ Bool_t AliShuttle::Collect(Int_t run) if (!fMonaLisa) fMonaLisa = new TMonaLisaWriter(fConfig->GetMonitorHost(), fConfig->GetMonitorTable()); - SendAlive(); CountOpenRuns(); TString whereClause("where shuttle_done=0"); @@ -2886,9 +3042,9 @@ Bool_t AliShuttle::Collect(Int_t run) { // query Shuttle logbook for earlier runs, check if some detectors are unprocessed, // flag them into fFirstUnprocessed array - TString whereClause(Form("where shuttle_done=0 and run < %d", run)); + TString whereClauseBis(Form("where shuttle_done=0 and run < %d", run)); TObjArray tmpLogbookEntries; - if (!QueryShuttleLogbook(whereClause, tmpLogbookEntries)) + if (!QueryShuttleLogbook(whereClauseBis, tmpLogbookEntries)) { Log("SHUTTLE", "Collect - Can't retrieve entries from Shuttle logbook"); return kFALSE; @@ -3039,7 +3195,7 @@ AliCDBEntry* AliShuttle::GetFromOCDB(const char* detector, const AliCDBPath& pat } //______________________________________________________________________________________________ -Bool_t AliShuttle::SendMail() +Bool_t AliShuttle::SendMail(EMailTarget target, Int_t system) { // // sends a mail to the subdetector expert in case of preprocessor error @@ -3051,35 +3207,77 @@ Bool_t AliShuttle::SendMail() if (!fConfig->SendMail()) return kTRUE; + if (target == kDCSEMail || target == kFXSEMail) { + if (!fFirstProcessing) + return kTRUE; + } + + Int_t runMode = (Int_t)fConfig->GetRunMode(); + TString tmpStr; + if (runMode == 0) tmpStr = " Nightly Test:"; + else tmpStr = " Data Taking:"; + void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); + if (dir == NULL) + { + if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) + { + Log("SHUTTLE", Form("SendMail - Can't open directory <%s>", GetShuttleLogDir())); + return kFALSE; + } + + } else { + gSystem->FreeDirectory(dir); + } + + // det experts in to TString to=""; - TIter iterExperts(fConfig->GetResponsibles(fCurrentDetector)); + TIter *iterExperts = 0; + if (target == kDCSEMail) { + iterExperts = new TIter(fConfig->GetAdmins(AliShuttleConfig::kAmanda)); + } + else if (target == kFXSEMail) { + iterExperts = new TIter(fConfig->GetAdmins(system)); + } + if (iterExperts) { + TObjString *anExpert=0; + while ((anExpert = (TObjString*) iterExperts->Next())) + { + to += Form("%s,", anExpert->GetName()); + } + delete iterExperts; + } + + // add subdetector experts + iterExperts = new TIter(fConfig->GetResponsibles(fCurrentDetector)); TObjString *anExpert=0; - while ((anExpert = (TObjString*) iterExperts.Next())) + while ((anExpert = (TObjString*) iterExperts->Next())) { to += Form("%s,", anExpert->GetName()); } + delete iterExperts; + if (to.Length() > 0) to.Remove(to.Length()-1); AliDebug(2, Form("to: %s",to.Data())); if (to.IsNull()) { - Log("SHUTTLE", "List of detector responsibles not set!"); + Log("SHUTTLE", Form("List of %d responsibles not set!", (Int_t) target)); return kFALSE; } - void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); - if (dir == NULL) + // SHUTTLE responsibles in cc + TString cc=""; + TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); + TObjString *anAdmin=0; + while ((anAdmin = (TObjString*) iterAdmins.Next())) { - if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) - { - Log("SHUTTLE", Form("SendMail - Can't open directory <%s>", GetShuttleLogDir())); - return kFALSE; - } - - } else { - gSystem->FreeDirectory(dir); + cc += Form("%s,", anAdmin->GetName()); } + if (cc.Length() > 0) + cc.Remove(cc.Length()-1); + AliDebug(2, Form("cc: %s",to.Data())); + // mail body TString bodyFileName; bodyFileName.Form("%s/mail.body", GetShuttleLogDir()); gSystem->ExpandPathName(bodyFileName); @@ -3093,32 +3291,51 @@ Bool_t AliShuttle::SendMail() return kFALSE; } - TString cc=""; - TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); - TObjString *anAdmin=0; - while ((anAdmin = (TObjString*) iterAdmins.Next())) - { - cc += Form("%s,", anAdmin->GetName()); - } - if (cc.Length() > 0) - cc.Remove(cc.Length()-1); - AliDebug(2, Form("cc: %s",to.Data())); - TString subject = Form("%s Shuttle preprocessor FAILED in run %d (run type = %s)!", - fCurrentDetector.Data(), GetCurrentRun(), GetRunType()); - AliDebug(2, Form("subject: %s", subject.Data())); + TString subject; + TString body; + + if (target == kDCSEMail){ + subject = Form("%s CRITICAL Retrieval of data points for %s FAILED in run %d !", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun()); + AliDebug(2, Form("subject: %s", subject.Data())); + + body = Form("Dear DCS experts, \n\n"); + body += Form("SHUTTLE couldn\'t retrieve the data points for detector %s " + "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); + } + else if (target == kFXSEMail){ + subject = Form("%s CRITICAL FXS communication for %s FAILED in run %d !", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun()); + AliDebug(2, Form("subject: %s", subject.Data())); + TString sys; + if (system == kDAQ) sys="DAQ"; + else if (system == kDCS) sys="DCS"; + else if (system == kHLT) sys="HLT"; + else return kFALSE; + body = Form("Dear %s FXS experts, \n\n",sys.Data()); + body += Form("SHUTTLE couldn\'t retrieve data from the FXS for detector %s " + "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); + body += Form("The contacted server was:\nDB: %s\nFXS:%s\n\n", fConfig->GetFXSdbHost(system), fConfig->GetFXSHost(system)); + } + else { + subject = Form("%s %s Shuttle preprocessor FAILED in run %d (run type = %s)!", + tmpStr.Data(), fCurrentDetector.Data(), GetCurrentRun(), GetRunType()); + AliDebug(2, Form("subject: %s", subject.Data())); + + body = Form("Dear %s expert(s), \n\n", fCurrentDetector.Data()); + body += Form("SHUTTLE just detected that your preprocessor " + "failed processing run %d (run type = %s)!!\n\n", + GetCurrentRun(), GetRunType()); + } - TString body = Form("Dear %s expert(s), \n\n", fCurrentDetector.Data()); - body += Form("SHUTTLE just detected that your preprocessor " - "failed processing run %d (run type = %s)!!\n\n", - GetCurrentRun(), GetRunType()); body += Form("Please check %s status on the SHUTTLE monitoring page: \n\n", fCurrentDetector.Data()); if (fConfig->GetRunMode() == AliShuttleConfig::kTest) { - body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n"); + body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?time=24 \n\n"); } else { - body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=168 \n\n"); + body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD&time=24 \n\n"); } @@ -3128,10 +3345,10 @@ Bool_t AliShuttle::SendMail() body += Form("Find the %s log for the current run on \n\n" - "\thttp://pcalishuttle01.cern.ch:8880/%s/%d/%s_%d.log \n\n", - fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - body += Form("The last 10 lines of %s log file are following:\n\n", fCurrentDetector.Data()); + "\thttp://pcalishuttle02.cern.ch/%s/%d/%d/%s.log \n\n", + fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun()/10000, + GetCurrentRun(), fCurrentDetector.Data()); + body += Form("The last 15 lines of %s log file are following:\n\n", fCurrentDetector.Data()); AliDebug(2, Form("Body begin: %s", body.Data())); @@ -3139,9 +3356,9 @@ Bool_t AliShuttle::SendMail() mailBody.close(); mailBody.open(bodyFileName, ofstream::out | ofstream::app); - TString logFileName = Form("%s/%d/%s_%d.log", GetShuttleLogDir(), - GetCurrentRun(), fCurrentDetector.Data(), GetCurrentRun()); - TString tailCommand = Form("tail -n 10 %s >> %s", logFileName.Data(), bodyFileName.Data()); + TString logFileName = Form("%s/%d/%d/%s.log", GetShuttleLogDir(), + GetCurrentRun()/10000, GetCurrentRun(), fCurrentDetector.Data()); + TString tailCommand = Form("tail -n 15 %s >> %s", logFileName.Data(), bodyFileName.Data()); if (gSystem->Exec(tailCommand.Data())) { mailBody << Form("%s log file not found ...\n\n", fCurrentDetector.Data()); @@ -3170,172 +3387,187 @@ Bool_t AliShuttle::SendMail() return result == 0; } - //______________________________________________________________________________________________ -Bool_t AliShuttle::SendMailToDCS() +const char* AliShuttle::GetRunType() { // - // sends a mail to the DCS Amanda experts in case of DCS data point retrieval error + // returns run type read from "run type" logbook // - - if (fTestMode != kNone) - return kTRUE; - if (!fConfig->SendMail()) - return kTRUE; - - if (!fFirstProcessing) - return kTRUE; - - void* dir = gSystem->OpenDirectory(GetShuttleLogDir()); - if (dir == NULL) - { - if (gSystem->mkdir(GetShuttleLogDir(), kTRUE)) - { - Log("SHUTTLE", Form("SendMailToDCS - Can't open directory <%s>", GetShuttleLogDir())); - return kFALSE; - } - - } else { - gSystem->FreeDirectory(dir); + if(!fLogbookEntry) { + AliError("No logbook entry!"); + return 0; } - TString bodyFileName; - bodyFileName.Form("%s/mail.body", GetShuttleLogDir()); - gSystem->ExpandPathName(bodyFileName); - - ofstream mailBody; - mailBody.open(bodyFileName, ofstream::out); + return fLogbookEntry->GetRunType(); +} - if (!mailBody.is_open()) - { - Log("SHUTTLE", Form("SendMailToDCS - Could not open mail body file %s", bodyFileName.Data())); - return kFALSE; - } +//______________________________________________________________________________________________ +Bool_t AliShuttle::GetHLTStatus() +{ + // Return HLT status (ON=1 OFF=0) + // Converts the HLT status from the mode string read in the run logbook (not just a bool) - TString to=""; - TIter iterExperts(fConfig->GetAdmins(AliShuttleConfig::kAmanda)); - TObjString *anExpert=0; - while ((anExpert = (TObjString*) iterExperts.Next())) - { - to += Form("%s,", anExpert->GetName()); + if(!fLogbookEntry) { + AliError("No logbook entry!"); + return 0; } - if (to.Length() > 0) - to.Remove(to.Length()-1); - AliDebug(2, Form("to: %s",to.Data())); - if (to.IsNull()) { - Log("SHUTTLE", "List of Amanda server administrators not set!"); + // TODO implement when HLTMode is inserted in run logbook + TString hltMode = fLogbookEntry->GetRunParameter("HLTmode"); + TSubString firstChar = hltMode(0,1); + AliDebug(2,Form("First char = %s ",firstChar.Data())); + if (firstChar == "A") { return kFALSE; } - - TString cc=""; - TIter iterAdmins(fConfig->GetAdmins(AliShuttleConfig::kGlobal)); - TObjString *anAdmin=0; - while ((anAdmin = (TObjString*) iterAdmins.Next())) - { - cc += Form("%s,", anAdmin->GetName()); + else if ((firstChar == "B") || (firstChar == "C") || (firstChar == "D") || (firstChar == "E")) { + return kTRUE; } - if (cc.Length() > 0) - cc.Remove(cc.Length()-1); - AliDebug(2, Form("cc: %s",to.Data())); + else { + Log("SHUTTLE","Unexpected HLT mode! Returning 0...."); + return kFALSE; + } +} - TString subject = Form("Retrieval of data points for %s FAILED in run %d !", - fCurrentDetector.Data(), GetCurrentRun()); - AliDebug(2, Form("subject: %s", subject.Data())); +//______________________________________________________________________________________________ +const char* AliShuttle::GetTriggerConfiguration() +{ + // Receives the trigger configuration from the DAQ logbook for the current run + + // check connection, if needed reconnect + if (!Connect(3)) + return 0; - TString body = Form("Dear DCS experts, \n\n"); - body += Form("SHUTTLE couldn\'t retrieve the data points for detector %s " - "in run %d!!\n\n", fCurrentDetector.Data(), GetCurrentRun()); - body += Form("Please check %s status on the SHUTTLE monitoring page: \n\n", - fCurrentDetector.Data()); - if (fConfig->GetRunMode() == AliShuttleConfig::kTest) + TString sqlQuery; + sqlQuery.Form("SELECT configFile FROM logbook_trigger_config WHERE run = %d", GetCurrentRun()); + TSQLResult* result = fServer[3]->Query(sqlQuery); + if (!result) { - body += Form("\thttp://pcalimonitor.cern.ch:8889/shuttle.jsp?time=168 \n\n"); - } else { - body += Form("\thttp://pcalimonitor.cern.ch/shuttle.jsp?instance=PROD?time=168 \n\n"); + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; } - - TString logFolder = "logs"; - if (fConfig->GetRunMode() == AliShuttleConfig::kProd) - logFolder += "_PROD"; + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "WARNING: Trigger configuration not found in logbook_trigger_config"); + delete result; + return 0; + } - body += Form("Find the %s log for the current run on \n\n" - "\thttp://pcalishuttle01.cern.ch:8880/%s/%d/%s_%d.log \n\n", - fCurrentDetector.Data(), logFolder.Data(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - body += Form("The last 10 lines of %s log file are following:\n\n", fCurrentDetector.Data()); - - AliDebug(2, Form("Body begin: %s", body.Data())); - - mailBody << body.Data(); - mailBody.close(); - mailBody.open(bodyFileName, ofstream::out | ofstream::app); - - TString logFileName = Form("%s/%d/%s_%d.log", GetShuttleLogDir(), GetCurrentRun(), - fCurrentDetector.Data(), GetCurrentRun()); - TString tailCommand = Form("tail -n 10 %s >> %s", logFileName.Data(), bodyFileName.Data()); - if (gSystem->Exec(tailCommand.Data())) + TSQLRow* row = result->Next(); + if (!row) { - mailBody << Form("%s log file not found ...\n\n", fCurrentDetector.Data()); + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_config data"); + delete result; + return 0; } - TString endBody = Form("------------------------------------------------------\n\n"); - endBody += Form("In case of problems please contact the SHUTTLE core team.\n\n"); - endBody += "Please do not answer this message directly, it is automatically generated.\n\n"; - endBody += "Greetings,\n\n \t\t\tthe SHUTTLE\n"; - - AliDebug(2, Form("Body end: %s", endBody.Data())); - - mailBody << endBody.Data(); - - mailBody.close(); - - // send mail! - TString mailCommand = Form("mail -s \"%s\" -c %s %s < %s", - subject.Data(), - cc.Data(), - to.Data(), - bodyFileName.Data()); - AliDebug(2, Form("mail command: %s", mailCommand.Data())); - - Bool_t result = gSystem->Exec(mailCommand.Data()); - - return result == 0; + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerConfig(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found trigger configuration: %s", triggerConfig.Data())); + + return triggerConfig; } //______________________________________________________________________________________________ -const char* AliShuttle::GetRunType() +const char* AliShuttle::GetCTPTimeParams() { - // - // returns run type read from "run type" logbook - // + // Receives the CTP time parameters from the DAQ logbook for the current run + + // check connection, if needed reconnect + if (!Connect(3)) + return 0; - if(!fLogbookEntry) { - AliError("No logbook entry!"); + TString sqlQuery; + sqlQuery.Form("SELECT alignmentFile FROM logbook_trigger_config WHERE run = %d", GetCurrentRun()); + TSQLResult* result = fServer[3]->Query(sqlQuery); + if (!result) + { + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; + } + + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "WARNING: CTP time params not found in logbook_trigger_config"); + delete result; + return 0; + } + + TSQLRow* row = result->Next(); + if (!row) + { + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_config data"); + delete result; return 0; } - return fLogbookEntry->GetRunType(); + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerTimeParams(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found trigger time parameters: %s", triggerTimeParams.Data())); + + return triggerTimeParams; } //______________________________________________________________________________________________ -Bool_t AliShuttle::GetHLTStatus() +const char* AliShuttle::GetTriggerDetectorMask() { - // Return HLT status (ON=1 OFF=0) - // Converts the HLT status from the status string read in the run logbook (not just a bool) + // Receives the trigger detector mask from DAQ logbook + + // check connection, if needed reconnect + if (!Connect(3)) + return 0; - if(!fLogbookEntry) { - AliError("No logbook entry!"); + TString sqlQuery; + sqlQuery.Form("SELECT BIN(BIT_OR(inputDetectorMask)) from logbook_trigger_clusters WHERE run = %d;", GetCurrentRun()); + TSQLResult* result = fServer[3]->Query(sqlQuery); + if (!result) + { + Log("SHUTTLE", Form("ERROR: Can't execute query <%s>!", sqlQuery.Data())); + return 0; + } + + if (result->GetRowCount() == 0) + { + Log("SHUTTLE", "ERROR: Trigger Detector Mask not found in logbook_trigger_clusters"); + delete result; + return 0; + } + + TSQLRow* row = result->Next(); + if (!row) + { + Log("SHUTTLE", "ERROR: Could not receive logbook_trigger_clusters data"); + delete result; return 0; } - // TODO implement when HLTStatus is inserted in run logbook - //TString hltStatus = fLogbookEntry->GetRunParameter("HLTStatus"); - //if(hltStatus == "OFF") {return kFALSE}; - - return kTRUE; + // static, so that pointer remains valid when it is returned to the calling class + static TString triggerDetectorMask(row->GetField(0)); + + delete row; + row = 0; + + delete result; + result = 0; + + Log("SHUTTLE", Form("Found Trigger Detector Mask: %s", triggerDetectorMask.Data())); + + return triggerDetectorMask; } //______________________________________________________________________________________________ @@ -3357,3 +3589,108 @@ void AliShuttle::SetShuttleLogDir(const char* logDir) fgkShuttleLogDir = gSystem->ExpandPathName(logDir); } +//______________________________________________________________________________________________ +Bool_t AliShuttle::TouchFile() +{ + // + // touching a file on the grid if run has been DONE + // + + if (!gGrid) + { + Log("SHUTTLE",Form("No TGrid connection estabilished!")); + Log("SHUTTLE",Form("Could not touch file for run %i",GetCurrentRun())); + return kFALSE; + } + + TString dir; + dir.Form("%s%d/SHUTTLE_DONE", fConfig->GetAlienPath(), GetCurrentYear()); + // checking whether directory for touch command exists + TString commandLs; + commandLs.Form("ls %s",dir.Data()); + TGridResult *resultLs = dynamic_cast(gGrid->Command(commandLs)); + if (!resultLs){ + Log("SHUTTLE",Form("No result for %s command, returning without touching",commandLs.Data())); + return kFALSE; + } + TMap *mapLs = dynamic_cast(resultLs->At(0)); + if (!mapLs){ + Log("SHUTTLE",Form("No map for %s command, returning without touching",commandLs.Data())); + delete resultLs; + resultLs = 0x0; + return kFALSE; + } + TObjString *valueLsPath = dynamic_cast(mapLs->GetValue("path")); + if (!valueLsPath || (valueLsPath->GetString()).CompareTo(dir)!=1){ + Log("SHUTTLE",Form("No directory %s found, creating it",dir.Data())); + + // creating the directory + + Bool_t boolMkdir = gGrid->Mkdir(dir.Data()); + if (!boolMkdir) { + Log("SHUTTLE",Form("Impossible to create dir %s in alien catalogue for run %i!",dir.Data(),GetCurrentRun())); + delete resultLs; + resultLs = 0x0; + return kFALSE; + } + Log("SHUTTLE",Form("Directory %s successfully created in alien catalogue for run %i",dir.Data(),GetCurrentRun())); + } + else { + Log("SHUTTLE",Form("Directory %s correctly found for run %i",dir.Data(),GetCurrentRun())); + } + + delete resultLs; + resultLs = 0x0; + + TString command; + command.Form("touch %s/%i", dir.Data(), GetCurrentRun()); + Log("SHUTTLE", Form("Creating entry in file catalog: %s", command.Data())); + TGridResult *resultTouch = dynamic_cast(gGrid->Command(command)); + if (!resultTouch){ + Log("SHUTTLE",Form("No result for touching command, returning without touching for run %i",GetCurrentRun())); + return kFALSE; + } + TMap *mapTouch = dynamic_cast(resultTouch->At(0)); + if (!mapTouch){ + Log("SHUTTLE",Form("No map for touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + TObjString *valueTouch = dynamic_cast(mapTouch->GetValue("__result__")); + if (!valueTouch){ + Log("SHUTTLE",Form("No value for \"__result__\" key set in the map for touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + if (valueTouch->GetString()!="1"){ + Log("SHUTTLE",Form("Failing the touching command, returning without touching for run %i",GetCurrentRun())); + delete resultTouch; + resultTouch = 0x0; + return kFALSE; + } + delete resultTouch; + resultTouch = 0x0; + Log("SHUTTLE", "Sucessfully touched the file"); + return kTRUE; +} +//______________________________________________________________________________________________ +UInt_t AliShuttle::GetStartTimeDCSQuery() +{ + // Return Start Time for the DCS query + // + // The call is delegated to AliShuttleInterface + + return GetCurrentStartTime()-fConfig->GetDCSQueryOffset(); +} +//______________________________________________________________________________________________ +UInt_t AliShuttle::GetEndTimeDCSQuery() +{ + // Return End Time for the DCS query + // + // The call is delegated to AliShuttleInterface + + return GetCurrentEndTime()+fConfig->GetDCSQueryOffset(); +} +